{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\aft.AFREETECH\\Downloads\\model\\.venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ "import json\n", "import numpy as np\n", "import torch\n", "from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW, get_linear_schedule_with_warmup\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Charger le dataset\n", "with open('data/data_test.json', 'r') as f:\n", " data = json.load(f)\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Séparer les données d'entrée et de sortie\n", "inputs = [d['input'] for d in data]\n", "outputs = [d['output'] for d in data]\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# Tokenizer\n", "tokenizer = GPT2Tokenizer.from_pretrained('gpt2')\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Définir le token de padding\n", "tokenizer.pad_token = tokenizer.eos_token # Utiliser le token de fin comme token de padding\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# Encoder les entrées et sorties avec padding\n", "max_length = 128\n", "input_ids = [tokenizer.encode(input_text, add_special_tokens=True, max_length=max_length, truncation=True, padding='max_length') for input_text in inputs]\n", "output_ids = [tokenizer.encode(output_text, add_special_tokens=True, max_length=max_length, truncation=True, padding='max_length') for output_text in outputs]\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# Convertir en tenseurs numpy\n", "input_ids = np.array(input_ids).astype(np.int64) # Convertir en entiers longs\n", "output_ids = np.array(output_ids).astype(np.int64) # Convertir en entiers longs\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Charger le modèle pré-entraîné\n", "model = GPT2LMHeadModel.from_pretrained('gpt2')\n" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\aft.AFREETECH\\Downloads\\model\\.venv\\Lib\\site-packages\\transformers\\optimization.py:591: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. 
 { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [
  "# Configure training\n",
  "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
  "model.to(device)\n",
  "optimizer = AdamW(model.parameters(), lr=1e-5)\n",
  "scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(input_ids) * 10)  # 10 epochs, one step per example\n" ] },
 { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/10, Loss: 9.18336634202437\n", "Epoch 2/10, Loss: 7.024676236239347\n", "Epoch 3/10, Loss: 6.38383115421642\n", "Epoch 4/10, Loss: 6.083545771512118\n", "Epoch 5/10, Loss: 5.902632973410866\n", "Epoch 6/10, Loss: 5.789845033125444\n", "Epoch 7/10, Loss: 5.625866976651278\n", "Epoch 8/10, Loss: 5.553086866031993\n", "Epoch 9/10, Loss: 5.5227460861206055\n", "Epoch 10/10, Loss: 5.475944844159213\n" ] } ], "source": [
  "# Train the model\n",
  "# Caveat: for a causal LM such as GPT-2, `labels` must align position-for-position\n",
  "# with `input_ids`; passing a separate target sequence teaches the model to emit\n",
  "# output tokens at input positions. The standard setup concatenates prompt and\n",
  "# target into one sequence and masks the prompt positions with -100.\n",
  "epochs = 10\n",
  "for epoch in range(epochs):\n",
  "    model.train()\n",
  "    total_loss = 0\n",
  "    for i in range(len(input_ids)):\n",
  "        optimizer.zero_grad()\n",
  "        # Slicing keeps the batch dimension and avoids the slow list-of-arrays conversion\n",
  "        input_ids_tensor = torch.from_numpy(input_ids[i:i+1]).to(device)\n",
  "        output_ids_tensor = torch.from_numpy(output_ids[i:i+1]).to(device)\n",
  "        outputs = model(input_ids_tensor, labels=output_ids_tensor)\n",
  "        loss = outputs.loss\n",
  "        loss.backward()\n",
  "        optimizer.step()\n",
  "        scheduler.step()\n",
  "        total_loss += loss.item()\n",
  "    print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(input_ids)}')\n" ] },
 { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [
  "# Save the model\n",
  "model.save_pretrained('test_generator_model')" ] },
 { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "GPT2LMHeadModel(\n", "  (transformer): GPT2Model(\n", "    (wte): Embedding(50257, 768)\n", "    (wpe): Embedding(1024, 768)\n", "    (drop): Dropout(p=0.1, inplace=False)\n", "    (h): ModuleList(\n", "      (0-11): 12 x GPT2Block(\n", "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", "        (attn): GPT2SdpaAttention(\n", "          (c_attn): Conv1D()\n", "          (c_proj): Conv1D()\n", "          (attn_dropout): Dropout(p=0.1, inplace=False)\n", "          (resid_dropout): Dropout(p=0.1, inplace=False)\n", "        )\n", "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", "        (mlp): GPT2MLP(\n", "          (c_fc): Conv1D()\n", "          (c_proj): Conv1D()\n", "          (act): NewGELUActivation()\n", "          (dropout): Dropout(p=0.1, inplace=False)\n", "        )\n", "      )\n", "    )\n", "    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", "  )\n", "  (lm_head): Linear(in_features=768, out_features=50257, bias=False)\n", ")" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model" ] },
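 { "cell_type": "markdown", "metadata": {}, "source": [ "Added note: `save_pretrained` above stores only the model weights and config. Saving the tokenizer into the same directory (sketch below) keeps the pad-token choice with the model, so inference can reload both from one path." ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# Added sketch: persist the tokenizer (including pad_token = eos_token set above)\n",
  "# next to the model so both can be reloaded from 'test_generator_model'.\n",
  "tokenizer.save_pretrained('test_generator_model')\n" ] },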
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Part 2" ] },
 { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Transform the following Python function into a pytest unit test function:\n", "def mel2hz(mel):\n", "    return 700*(10**(mel/2595.0)-1)\n", "def mel3hz(mel):\n", " \n", "def mel4hz(mel):\n", " \n", "def mel5hz(mel):\n", " \n", "def mel6hz(mel):\n", " \n", "def mel7hz(mel):\n", " \n", "def mel8hz(mel):\n", " \n", "def mel9hz(mel):\n", " \n" ] } ], "source": [
  "import inspect\n",
  "import torch\n",
  "from transformers import GPT2LMHeadModel, GPT2Tokenizer\n",
  "\n",
  "# Load the fine-tuned model and the tokenizer\n",
  "model = GPT2LMHeadModel.from_pretrained('test_generator_model')\n",
  "tokenizer = GPT2Tokenizer.from_pretrained('gpt2')\n",
  "\n",
  "# The Python function to generate a test for\n",
  "def mel2hz(mel):\n",
  "    return 700*(10**(mel/2595.0)-1)\n",
  "\n",
  "# Encode the function as the prompt\n",
  "input_text = f\"Transform the following Python function into a pytest unit test function:\\n{inspect.getsource(mel2hz)}\"\n",
  "input_ids = tokenizer.encode(input_text, return_tensors='pt')\n",
  "\n",
  "# Create an attention mask\n",
  "attention_mask = torch.ones(input_ids.shape, dtype=torch.long)\n",
  "\n",
  "# Generate the unit test with beam search; sampling flags (temperature, top_k,\n",
  "# top_p) only apply when do_sample=True, so they are omitted here\n",
  "output_ids = model.generate(\n",
  "    input_ids,\n",
  "    attention_mask=attention_mask,\n",
  "    max_length=256,\n",
  "    num_return_sequences=1,\n",
  "    num_beams=5,\n",
  "    early_stopping=True,\n",
  "    pad_token_id=tokenizer.eos_token_id\n",
  ")\n",
  "\n",
  "# Decode and print the result\n",
  "generated_test = tokenizer.decode(output_ids[0], skip_special_tokens=True)\n",
  "print(generated_test)" ] },
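 { "cell_type": "markdown", "metadata": {}, "source": [ "Added sketch: one way to exercise `generated_test` is to write it to a file that pytest can collect. The filename is illustrative; as the output above shows, the fine-tuned GPT-2 does not yet emit a valid test, so this is only the plumbing." ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# Added sketch (illustrative filename): write the generated text to a pytest file.\n",
  "with open('test_generated_mel2hz.py', 'w') as f:\n",
  "    f.write(generated_test)\n",
  "# It could then be run with: pytest test_generated_mel2hz.py\n" ] },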
"outputs": [], "source": [ "# Charger le dataset\n", "with open('data/data_test.json', 'r') as f:\n", " data = json.load(f)\n", "\n", "# Séparer les données d'entrée et de sortie\n", "inputs = [d['input'] for d in data]\n", "outputs = [d['output'] for d in data]\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Tokenizer\n", "tokenizer = T5Tokenizer.from_pretrained('t5-small')\n", "\n", "# Encoder les entrées et sorties\n", "max_length = 128\n", "input_ids = tokenizer(inputs, max_length=max_length, padding=True, truncation=True, return_tensors='pt').input_ids\n", "output_ids = tokenizer(outputs, max_length=max_length, padding=True, truncation=True, return_tensors='pt').input_ids\n", "\n", "# Charger le modèle pré-entraîné\n", "model = T5ForConditionalGeneration.from_pretrained('t5-small')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Configurer l'entraînement\n", "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", "model.to(device)\n", "optimizer = AdamW(model.parameters(), lr=1e-5)\n", "scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(input_ids) * 10)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Entraîner le modèle\n", "epochs = 5\n", "for epoch in range(epochs):\n", " model.train()\n", " total_loss = 0\n", " for i in range(len(input_ids)):\n", " optimizer.zero_grad()\n", " input_ids_tensor = input_ids[i].unsqueeze(0).to(device) # Convertir en tenseur Long\n", " output_ids_tensor = output_ids[i].unsqueeze(0).to(device) # Convertir en tenseur Long\n", " outputs = model(input_ids_tensor, labels=output_ids_tensor)\n", " loss = outputs.loss\n", " loss.backward()\n", " optimizer.step()\n", " scheduler.step()\n", " total_loss += loss.item()\n", " print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(input_ids)}')\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "# Sauvegarder le modèle\n", "model.save_pretrained('test_generator_model_t5')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import inspect\n", "import torch\n", "from transformers import T5ForConditionalGeneration, T5Tokenizer\n", "\n", "# Charger le modèle et le tokenizer\n", "model = T5ForConditionalGeneration.from_pretrained('test_generator_model_t5')\n", "tokenizer = T5Tokenizer.from_pretrained('t5-small')\n", "\n", "# Définir la fonction Python à tester\n", "def mel2hz(mel):\n", " return 700 * (10 ** (mel / 2595.0) - 1)\n", "\n", "# Encoder la fonction en entrée\n", "input_text = f\"Please write a pytest unit test for the following Python function:\\n{inspect.getsource(mel2hz)}\"\n", "input_ids = tokenizer.encode(input_text, return_tensors='pt')\n", "\n", "# Créer un attention mask\n", "attention_mask = torch.ones(input_ids.shape, dtype=torch.long)\n", "\n", "# Générer le test unitaire avec des paramètres ajustés\n", "output_ids = model.generate(\n", " input_ids,\n", " attention_mask=attention_mask,\n", " max_length=256,\n", " num_return_sequences=1,\n", " num_beams=5,\n", " temperature=0.7, # Ajustement de la température\n", " top_k=50,\n", " top_p=0.95,\n", " early_stopping=True\n", ")\n", "\n", "# Décoder et afficher le résultat\n", "generated_test = tokenizer.decode(output_ids[0], skip_special_tokens=True)\n", "print(generated_test)" ] } ], "metadata": { "kernelspec": { "display_name": 
".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.1" } }, "nbformat": 4, "nbformat_minor": 2 }