In [1]:
import json
import numpy as np
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW, get_linear_schedule_with_warmup


 from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Charger le dataset
with open('data/data_test.json', 'r') as f:
 data = json.load(f)


In [3]:
# Séparer les données d'entrée et de sortie
inputs = [d['input'] for d in data]
outputs = [d['output'] for d in data]


In [4]:
# Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')


In [5]:
# Définir le token de padding
tokenizer.pad_token = tokenizer.eos_token # Utiliser le token de fin comme token de padding


In [6]:
# Encoder les entrées et sorties avec padding
max_length = 128
input_ids = [tokenizer.encode(input_text, add_special_tokens=True, max_length=max_length, truncation=True, padding='max_length') for input_text in inputs]
output_ids = [tokenizer.encode(output_text, add_special_tokens=True, max_length=max_length, truncation=True, padding='max_length') for output_text in outputs]


In [7]:
# Convertir en tenseurs numpy
input_ids = np.array(input_ids).astype(np.int64) # Convertir en entiers longs
output_ids = np.array(output_ids).astype(np.int64) # Convertir en entiers longs


In [8]:
# Charger le modèle pré-entraîné
model = GPT2LMHeadModel.from_pretrained('gpt2')


In [9]:
# Configurer l'entraînement
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = AdamW(model.parameters(), lr=1e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(input_ids) * 10)




In [10]:
# Configurer l'entraînement
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = AdamW(model.parameters(), lr=1e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(input_ids) * 10)


In [11]:
# Entraîner le modèle
epochs = 10
for epoch in range(epochs):
 model.train()
 total_loss = 0
 for i in range(len(input_ids)):
 optimizer.zero_grad()
 input_ids_tensor = torch.tensor([input_ids[i]], device=device, dtype=torch.long) # Convertir en tenseur Long
 output_ids_tensor = torch.tensor([output_ids[i]], device=device, dtype=torch.long) # Convertir en tenseur Long
 outputs = model(input_ids_tensor, labels=output_ids_tensor)
 loss = outputs[0]
 loss.backward()
 optimizer.step()
 scheduler.step()
 total_loss += loss.item()
 print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(input_ids)}')


 input_ids_tensor = torch.tensor([input_ids[i]], device=device, dtype=torch.long) # Convertir en tenseur Long


Epoch 1/10, Loss: 9.18336634202437
Epoch 2/10, Loss: 7.024676236239347
Epoch 3/10, Loss: 6.38383115421642
Epoch 4/10, Loss: 6.083545771512118
Epoch 5/10, Loss: 5.902632973410866
Epoch 6/10, Loss: 5.789845033125444
Epoch 7/10, Loss: 5.625866976651278
Epoch 8/10, Loss: 5.553086866031993
Epoch 9/10, Loss: 5.5227460861206055
Epoch 10/10, Loss: 5.475944844159213


In [12]:
# Sauvegarder le modèle
model.save_pretrained('test_generator_model')

In [13]:
model

GPT2LMHeadModel(
 (transformer): GPT2Model(
 (wte): Embedding(50257, 768)
 (wpe): Embedding(1024, 768)
 (drop): Dropout(p=0.1, inplace=False)
 (h): ModuleList(
 (0-11): 12 x GPT2Block(
 (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
 (attn): GPT2SdpaAttention(
 (c_attn): Conv1D()
 (c_proj): Conv1D()
 (attn_dropout): Dropout(p=0.1, inplace=False)
 (resid_dropout): Dropout(p=0.1, inplace=False)
 )
 (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
 (mlp): GPT2MLP(
 (c_fc): Conv1D()
 (c_proj): Conv1D()
 (act): NewGELUActivation()
 (dropout): Dropout(p=0.1, inplace=False)
 )
 )
 )
 (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
 )
 (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

## tome 2

In [15]:
import inspect
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Charger le modèle et le tokenizer
model = GPT2LMHeadModel.from_pretrained('test_generator_model')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Définir la fonction Python à tester
def mel2hz(mel):
 return 700*(10**(mel/2595.0)-1)

# Encoder la fonction en entrée
input_text = f"Transform the following Python function into a pytest unit test function:\n{inspect.getsource(mel2hz)}"
input_ids = tokenizer.encode(input_text, return_tensors='pt')

# Créer un attention mask
attention_mask = torch.ones(input_ids.shape, dtype=torch.long)

# Générer le test unitaire avec des paramètres ajustés
output_ids = model.generate(
 input_ids,
 attention_mask=attention_mask,
 max_length=256,
 num_return_sequences=1,
 num_beams=5, 
 temperature=0.7,
 top_k=50, 
 top_p=0.95, 
 early_stopping=True
)

# Décoder et afficher le résultat
generated_test = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(generated_test)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Transform the following Python function into a pytest unit test function:
def mel2hz(mel):
 return 700*(10**(mel/2595.0)-1)
def mel3hz(mel):
 
def mel4hz(mel):
 
def mel5hz(mel):
 
def mel6hz(mel):
 
def mel7hz(mel):
 
def mel8hz(mel):
 
def mel9hz(mel):
 


In [None]:
import json
import numpy as np
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW, get_linear_schedule_with_warmup


In [None]:
# Charger le dataset
with open('data/data_test.json', 'r') as f:
 data = json.load(f)

# Séparer les données d'entrée et de sortie
inputs = [d['input'] for d in data]
outputs = [d['output'] for d in data]


In [None]:
# Tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Encoder les entrées et sorties
max_length = 128
input_ids = tokenizer(inputs, max_length=max_length, padding=True, truncation=True, return_tensors='pt').input_ids
output_ids = tokenizer(outputs, max_length=max_length, padding=True, truncation=True, return_tensors='pt').input_ids

# Charger le modèle pré-entraîné
model = T5ForConditionalGeneration.from_pretrained('t5-small')

In [None]:
# Configurer l'entraînement
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = AdamW(model.parameters(), lr=1e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(input_ids) * 10)


In [None]:
# Entraîner le modèle
epochs = 5
for epoch in range(epochs):
 model.train()
 total_loss = 0
 for i in range(len(input_ids)):
 optimizer.zero_grad()
 input_ids_tensor = input_ids[i].unsqueeze(0).to(device) # Convertir en tenseur Long
 output_ids_tensor = output_ids[i].unsqueeze(0).to(device) # Convertir en tenseur Long
 outputs = model(input_ids_tensor, labels=output_ids_tensor)
 loss = outputs.loss
 loss.backward()
 optimizer.step()
 scheduler.step()
 total_loss += loss.item()
 print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(input_ids)}')


In [None]:

# Sauvegarder le modèle
model.save_pretrained('test_generator_model_t5')

In [None]:
import inspect
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Charger le modèle et le tokenizer
model = T5ForConditionalGeneration.from_pretrained('test_generator_model_t5')
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Définir la fonction Python à tester
def mel2hz(mel):
 return 700 * (10 ** (mel / 2595.0) - 1)

# Encoder la fonction en entrée
input_text = f"Please write a pytest unit test for the following Python function:\n{inspect.getsource(mel2hz)}"
input_ids = tokenizer.encode(input_text, return_tensors='pt')

# Créer un attention mask
attention_mask = torch.ones(input_ids.shape, dtype=torch.long)

# Générer le test unitaire avec des paramètres ajustés
output_ids = model.generate(
 input_ids,
 attention_mask=attention_mask,
 max_length=256,
 num_return_sequences=1,
 num_beams=5,
 temperature=0.7, # Ajustement de la température
 top_k=50,
 top_p=0.95,
 early_stopping=True
)

# Décoder et afficher le résultat
generated_test = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(generated_test)