tayawelba committed
Commit 29de65c
1 Parent(s): 73ac996

Upload fine_tune_json3.ipynb

Files changed (1)
  1. fine_tune_json3.ipynb +503 -0
fine_tune_json3.ipynb ADDED
@@ -0,0 +1,503 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "c:\\Users\\aft.AFREETECH\\Downloads\\model\\.venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ "  from .autonotebook import tqdm as notebook_tqdm\n"
+ ]
+ }
+ ],
+ "source": [
+ "import json\n",
+ "import numpy as np\n",
+ "import torch\n",
+ "from torch.optim import AdamW  # the transformers AdamW is deprecated in favour of the PyTorch one\n",
+ "from transformers import GPT2LMHeadModel, GPT2Tokenizer, get_linear_schedule_with_warmup\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load the dataset\n",
+ "with open('data/data_test.json', 'r') as f:\n",
+ "    data = json.load(f)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Split the examples into input and output lists\n",
+ "inputs = [d['input'] for d in data]\n",
+ "outputs = [d['output'] for d in data]\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Tokenizer\n",
+ "tokenizer = GPT2Tokenizer.from_pretrained('gpt2')\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Define the padding token\n",
+ "tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token, so reuse the end-of-sequence token\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Encode inputs and outputs, padded/truncated to a fixed length\n",
+ "max_length = 128\n",
+ "input_ids = [tokenizer.encode(input_text, add_special_tokens=True, max_length=max_length, truncation=True, padding='max_length') for input_text in inputs]\n",
+ "output_ids = [tokenizer.encode(output_text, add_special_tokens=True, max_length=max_length, truncation=True, padding='max_length') for output_text in outputs]\n"
+ ]
+ },
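+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Note that the padded sequences above come with no attention mask, so the model cannot tell real tokens from padding. As a minimal sketch (not used in the run below), the batch tokenizer call returns the mask alongside the ids:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: tokenizer(...) returns input_ids plus an attention_mask\n",
+ "# (1 for real tokens, 0 for padding) in a single call.\n",
+ "encoded = tokenizer(inputs, max_length=max_length, truncation=True, padding='max_length', return_tensors='np')\n",
+ "masked_input_ids = encoded['input_ids'].astype(np.int64)\n",
+ "attention_mask = encoded['attention_mask'].astype(np.int64)\n"
+ ]
+ },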
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Convert to numpy arrays of 64-bit integers\n",
+ "input_ids = np.array(input_ids).astype(np.int64)\n",
+ "output_ids = np.array(output_ids).astype(np.int64)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load the pre-trained model\n",
+ "model = GPT2LMHeadModel.from_pretrained('gpt2')\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Configure training: device, optimizer, linear LR schedule sized for 10 epochs\n",
+ "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
+ "model.to(device)\n",
+ "optimizer = AdamW(model.parameters(), lr=1e-5)\n",
+ "scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(input_ids) * 10)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch 1/10, Loss: 9.18336634202437\n",
+ "Epoch 2/10, Loss: 7.024676236239347\n",
+ "Epoch 3/10, Loss: 6.38383115421642\n",
+ "Epoch 4/10, Loss: 6.083545771512118\n",
+ "Epoch 5/10, Loss: 5.902632973410866\n",
+ "Epoch 6/10, Loss: 5.789845033125444\n",
+ "Epoch 7/10, Loss: 5.625866976651278\n",
+ "Epoch 8/10, Loss: 5.553086866031993\n",
+ "Epoch 9/10, Loss: 5.5227460861206055\n",
+ "Epoch 10/10, Loss: 5.475944844159213\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Train the model, one example per step\n",
+ "epochs = 10\n",
+ "for epoch in range(epochs):\n",
+ "    model.train()\n",
+ "    total_loss = 0\n",
+ "    for i in range(len(input_ids)):\n",
+ "        optimizer.zero_grad()\n",
+ "        # Build (1, max_length) long tensors straight from the numpy rows\n",
+ "        input_ids_tensor = torch.tensor(input_ids[i], device=device, dtype=torch.long).unsqueeze(0)\n",
+ "        output_ids_tensor = torch.tensor(output_ids[i], device=device, dtype=torch.long).unsqueeze(0)\n",
+ "        outputs = model(input_ids_tensor, labels=output_ids_tensor)\n",
+ "        loss = outputs.loss\n",
+ "        loss.backward()\n",
+ "        optimizer.step()\n",
+ "        scheduler.step()\n",
+ "        total_loss += loss.item()\n",
+ "    print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(input_ids)}')\n"
+ ]
+ },
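+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "For a causal LM such as GPT-2, `labels` are aligned position-by-position with `input_ids` (the model shifts them internally), so passing the target sequence as `labels` for a different input does not directly train the input-to-output mapping. A common alternative, sketched below and not part of the recorded run, is to train on the concatenated prompt+target text, batched with a `DataLoader`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch, assuming `inputs`/`outputs`/`tokenizer`/`model`/`optimizer` from the cells above.\n",
+ "from torch.utils.data import DataLoader, TensorDataset\n",
+ "\n",
+ "# Concatenate prompt and target so the causal LM sees both in one sequence\n",
+ "texts = [inp + '\\n' + out for inp, out in zip(inputs, outputs)]\n",
+ "enc = tokenizer(texts, max_length=max_length, truncation=True, padding='max_length', return_tensors='pt')\n",
+ "loader = DataLoader(TensorDataset(enc['input_ids'], enc['attention_mask']), batch_size=8, shuffle=True)\n",
+ "\n",
+ "model.train()\n",
+ "for batch_ids, batch_mask in loader:\n",
+ "    batch_ids, batch_mask = batch_ids.to(device), batch_mask.to(device)\n",
+ "    # labels == input_ids: the model computes the next-token loss over the whole sequence\n",
+ "    out = model(batch_ids, attention_mask=batch_mask, labels=batch_ids)\n",
+ "    out.loss.backward()\n",
+ "    optimizer.step()\n",
+ "    optimizer.zero_grad()\n"
+ ]
+ },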
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Save the fine-tuned model\n",
+ "model.save_pretrained('test_generator_model')"
+ ]
+ },
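+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "`save_pretrained` on the model does not persist the tokenizer. A small sketch (not in the original run) that saves it next to the model, so both can later be reloaded from the same directory:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: keep model and tokenizer together in one directory\n",
+ "tokenizer.save_pretrained('test_generator_model')\n",
+ "# ...and reload both later:\n",
+ "# model = GPT2LMHeadModel.from_pretrained('test_generator_model')\n",
+ "# tokenizer = GPT2Tokenizer.from_pretrained('test_generator_model')\n"
+ ]
+ },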
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "GPT2LMHeadModel(\n",
+ "  (transformer): GPT2Model(\n",
+ "    (wte): Embedding(50257, 768)\n",
+ "    (wpe): Embedding(1024, 768)\n",
+ "    (drop): Dropout(p=0.1, inplace=False)\n",
+ "    (h): ModuleList(\n",
+ "      (0-11): 12 x GPT2Block(\n",
+ "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+ "        (attn): GPT2SdpaAttention(\n",
+ "          (c_attn): Conv1D()\n",
+ "          (c_proj): Conv1D()\n",
+ "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+ "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+ "        )\n",
+ "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+ "        (mlp): GPT2MLP(\n",
+ "          (c_fc): Conv1D()\n",
+ "          (c_proj): Conv1D()\n",
+ "          (act): NewGELUActivation()\n",
+ "          (dropout): Dropout(p=0.1, inplace=False)\n",
+ "        )\n",
+ "      )\n",
+ "    )\n",
+ "    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+ "  )\n",
+ "  (lm_head): Linear(in_features=768, out_features=50257, bias=False)\n",
+ ")"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Part 2: generating a test with the fine-tuned model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Transform the following Python function into a pytest unit test function:\n",
+ "def mel2hz(mel):\n",
+ "    return 700*(10**(mel/2595.0)-1)\n",
+ "def mel3hz(mel):\n",
+ "    \n",
+ "def mel4hz(mel):\n",
+ "    \n",
+ "def mel5hz(mel):\n",
+ "    \n",
+ "def mel6hz(mel):\n",
+ "    \n",
+ "def mel7hz(mel):\n",
+ "    \n",
+ "def mel8hz(mel):\n",
+ "    \n",
+ "def mel9hz(mel):\n",
+ "    \n"
+ ]
+ }
+ ],
+ "source": [
+ "import inspect\n",
+ "import torch\n",
+ "from transformers import GPT2LMHeadModel, GPT2Tokenizer\n",
+ "\n",
+ "# Load the fine-tuned model and the tokenizer\n",
+ "model = GPT2LMHeadModel.from_pretrained('test_generator_model')\n",
+ "tokenizer = GPT2Tokenizer.from_pretrained('gpt2')\n",
+ "\n",
+ "# The Python function to generate a test for\n",
+ "def mel2hz(mel):\n",
+ "    return 700*(10**(mel/2595.0)-1)\n",
+ "\n",
+ "# Encode the prompt\n",
+ "input_text = f\"Transform the following Python function into a pytest unit test function:\\n{inspect.getsource(mel2hz)}\"\n",
+ "input_ids = tokenizer.encode(input_text, return_tensors='pt')\n",
+ "\n",
+ "# Build an attention mask (all ones: a single unpadded sequence)\n",
+ "attention_mask = torch.ones(input_ids.shape, dtype=torch.long)\n",
+ "\n",
+ "# Generate with beam search; temperature/top_k/top_p only apply when\n",
+ "# do_sample=True, so they are omitted here\n",
+ "output_ids = model.generate(\n",
+ "    input_ids,\n",
+ "    attention_mask=attention_mask,\n",
+ "    max_length=256,\n",
+ "    num_return_sequences=1,\n",
+ "    num_beams=5,\n",
+ "    early_stopping=True,\n",
+ "    pad_token_id=tokenizer.eos_token_id\n",
+ ")\n",
+ "\n",
+ "# Decode and print the result\n",
+ "generated_test = tokenizer.decode(output_ids[0], skip_special_tokens=True)\n",
+ "print(generated_test)"
+ ]
+ },
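+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If sampling is wanted instead of beam search, `do_sample=True` makes the `temperature`/`top_k`/`top_p` settings take effect. A sketch (values illustrative, not part of the recorded run):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: sampling-based generation with the same fine-tuned model\n",
+ "sampled_ids = model.generate(\n",
+ "    input_ids,\n",
+ "    attention_mask=attention_mask,\n",
+ "    max_length=256,\n",
+ "    do_sample=True,  # enables the three flags below\n",
+ "    temperature=0.7,\n",
+ "    top_k=50,\n",
+ "    top_p=0.95,\n",
+ "    pad_token_id=tokenizer.eos_token_id\n",
+ ")\n",
+ "print(tokenizer.decode(sampled_ids[0], skip_special_tokens=True))\n"
+ ]
+ },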
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "import numpy as np\n",
+ "import torch\n",
+ "from torch.optim import AdamW\n",
+ "from transformers import T5ForConditionalGeneration, T5Tokenizer, get_linear_schedule_with_warmup\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load the dataset\n",
+ "with open('data/data_test.json', 'r') as f:\n",
+ "    data = json.load(f)\n",
+ "\n",
+ "# Split the examples into input and output lists\n",
+ "inputs = [d['input'] for d in data]\n",
+ "outputs = [d['output'] for d in data]\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Tokenizer\n",
+ "tokenizer = T5Tokenizer.from_pretrained('t5-small')\n",
+ "\n",
+ "# Encode inputs and outputs as padded tensors\n",
+ "max_length = 128\n",
+ "input_ids = tokenizer(inputs, max_length=max_length, padding=True, truncation=True, return_tensors='pt').input_ids\n",
+ "output_ids = tokenizer(outputs, max_length=max_length, padding=True, truncation=True, return_tensors='pt').input_ids\n",
+ "\n",
+ "# Load the pre-trained model\n",
+ "model = T5ForConditionalGeneration.from_pretrained('t5-small')"
+ ]
+ },
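+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "With `padding=True`, the pad tokens inside `output_ids` are counted by the loss. A common fix, sketched here and not part of the original notebook, is to replace them with `-100`, the index the cross-entropy loss ignores:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: mask padding positions out of the loss\n",
+ "labels = output_ids.clone()\n",
+ "labels[labels == tokenizer.pad_token_id] = -100  # -100 is ignored by the loss\n"
+ ]
+ },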
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Configure training: device, optimizer, linear LR schedule\n",
+ "# (note: the schedule is sized for 10 epochs, while the loop below runs 5)\n",
+ "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
+ "model.to(device)\n",
+ "optimizer = AdamW(model.parameters(), lr=1e-5)\n",
+ "scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(input_ids) * 10)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Train the model, one example per step\n",
+ "epochs = 5\n",
+ "for epoch in range(epochs):\n",
+ "    model.train()\n",
+ "    total_loss = 0\n",
+ "    for i in range(len(input_ids)):\n",
+ "        optimizer.zero_grad()\n",
+ "        # Add a batch dimension and move the pair to the device\n",
+ "        input_ids_tensor = input_ids[i].unsqueeze(0).to(device)\n",
+ "        output_ids_tensor = output_ids[i].unsqueeze(0).to(device)\n",
+ "        outputs = model(input_ids_tensor, labels=output_ids_tensor)\n",
+ "        loss = outputs.loss\n",
+ "        loss.backward()\n",
+ "        optimizer.step()\n",
+ "        scheduler.step()\n",
+ "        total_loss += loss.item()\n",
+ "    print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(input_ids)}')\n"
+ ]
+ },
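+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A quick sanity check after training, sketched here (not an executed cell): generate for the first training example and compare against its reference output."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: compare the model's output on one training example with its reference\n",
+ "model.eval()\n",
+ "with torch.no_grad():\n",
+ "    preview_ids = model.generate(input_ids[0].unsqueeze(0).to(device), max_length=max_length)\n",
+ "print('prediction:', tokenizer.decode(preview_ids[0], skip_special_tokens=True))\n",
+ "print('reference :', data[0]['output'])\n"
+ ]
+ },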
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Save the fine-tuned model\n",
+ "model.save_pretrained('test_generator_model_t5')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import inspect\n",
+ "import torch\n",
+ "from transformers import T5ForConditionalGeneration, T5Tokenizer\n",
+ "\n",
+ "# Load the fine-tuned model and the tokenizer\n",
+ "model = T5ForConditionalGeneration.from_pretrained('test_generator_model_t5')\n",
+ "tokenizer = T5Tokenizer.from_pretrained('t5-small')\n",
+ "\n",
+ "# The Python function to generate a test for\n",
+ "def mel2hz(mel):\n",
+ "    return 700 * (10 ** (mel / 2595.0) - 1)\n",
+ "\n",
+ "# Encode the prompt\n",
+ "input_text = f\"Please write a pytest unit test for the following Python function:\\n{inspect.getsource(mel2hz)}\"\n",
+ "input_ids = tokenizer.encode(input_text, return_tensors='pt')\n",
+ "\n",
+ "# Build an attention mask (all ones: a single unpadded sequence)\n",
+ "attention_mask = torch.ones(input_ids.shape, dtype=torch.long)\n",
+ "\n",
+ "# Generate with beam search; sampling-only flags are omitted\n",
+ "output_ids = model.generate(\n",
+ "    input_ids,\n",
+ "    attention_mask=attention_mask,\n",
+ "    max_length=256,\n",
+ "    num_return_sequences=1,\n",
+ "    num_beams=5,\n",
+ "    early_stopping=True\n",
+ ")\n",
+ "\n",
+ "# Decode and print the result\n",
+ "generated_test = tokenizer.decode(output_ids[0], skip_special_tokens=True)\n",
+ "print(generated_test)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.1"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }