# train-arabic / app.py
import json
from pathlib import Path
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer, pipeline

# Load the dataset (SQuAD-style JSON: data -> paragraphs -> qas)
with open('./Arabic-SQuAD.json', 'r', encoding='utf-8') as file:
    soqal_dataset = json.load(file)
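
# Optional sanity check (not in the original script): confirm the file follows
# the SQuAD layout this script assumes; the key names below match the access
# pattern used in convert_to_dataset, so adjust them if your copy differs.
first_qa = soqal_dataset['data'][0]['paragraphs'][0]['qas'][0]
print(f"Articles: {len(soqal_dataset['data'])}, sample question: {first_qa['question']}")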

# Convert the nested SQuAD-style JSON into a flat Hugging Face Dataset
def convert_to_dataset(dataset_dict):
    data = []
    for article in dataset_dict['data']:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                qa_id = qa['id']  # avoid shadowing the built-in id()
                answers = qa.get('answers', [])
                if answers:  # keep only answerable questions
                    text = answers[0]['text']
                    start = answers[0]['answer_start']
                    data.append({'context': context, 'question': question, 'id': qa_id,
                                 'answer_text': text, 'start_position': start})
    return Dataset.from_dict({key: [d[key] for d in data]
                              for key in ('context', 'question', 'answer_text', 'id', 'start_position')})

soqal_formatted_dataset = convert_to_dataset(soqal_dataset)
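
# Optional: printing a Dataset shows its column names and num_rows, which is a
# quick way to verify the flattening step worked.
print(soqal_formatted_dataset)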
# Tokenize Dataset
tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv02")

# Tokenize and convert each answer's character span to token-level start/end
# positions, which is what AutoModelForQuestionAnswering trains on. (Passing
# raw character offsets as positions would mislabel every example.)
def tokenize_function(examples):
    # The context is passed first, so sequence_index=0 below refers to it
    encodings = tokenizer(examples['context'], examples['question'],
                          truncation='only_first', padding='max_length', max_length=512)
    start_positions, end_positions = [], []
    for i, (start_char, answer) in enumerate(zip(examples['start_position'], examples['answer_text'])):
        end_char = start_char + len(answer)
        start_tok = encodings.char_to_token(i, start_char, sequence_index=0)
        end_tok = encodings.char_to_token(i, end_char - 1, sequence_index=0)
        # Fall back to the [CLS] token (index 0) if the answer was truncated away
        start_positions.append(start_tok if start_tok is not None else 0)
        end_positions.append(end_tok if end_tok is not None else 0)
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})
    return encodings
# Tokenize the whole dataset in batches
tokenized_soqal_datasets = soqal_formatted_dataset.map(tokenize_function, batched=True)
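
# Optional check (an illustrative sketch, not part of the original script):
# decode the labeled token span of the first example and compare it to the
# stored answer text. The two may differ slightly because of tokenizer
# normalization, but they should clearly match in content.
sample = tokenized_soqal_datasets[0]
span_ids = sample['input_ids'][sample['start_positions']:sample['end_positions'] + 1]
print(tokenizer.decode(span_ids), '<->', sample['answer_text'])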

# Split the dataset 50/50: even indices for training, odd indices for evaluation
small_train_dataset = tokenized_soqal_datasets.select(range(0, len(tokenized_soqal_datasets), 2))
small_eval_dataset = tokenized_soqal_datasets.select(range(1, len(tokenized_soqal_datasets), 2))

# Initialize Model and Trainer
model = AutoModelForQuestionAnswering.from_pretrained("aubmindlab/bert-base-arabertv02")

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,  # so save_model also writes the tokenizer files
)

# Train and Save Model
trainer.train()
trainer.save_model("./arabic_qa_model")
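
# To reload the fine-tuned model later (a minimal sketch; './arabic_qa_model'
# is the directory saved just above):
# model = AutoModelForQuestionAnswering.from_pretrained("./arabic_qa_model")
# tokenizer = AutoTokenizer.from_pretrained("./arabic_qa_model")
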
# Evaluate Model
results = trainer.evaluate()
print(results)
# Test Model after Training
nlp = pipeline("question-answering", model=model, tokenizer=tokenizer)
# Placeholder inputs (Arabic): the context reads "Please place the Arabic text
# containing the information here." and the question reads "What is the
# question you want answered?" — swap in real data before use.
context = "يرجى وضع النص العربي هنا الذي يحتوي على المعلومات."
question = "ما هو السؤال الذي تريد الإجابة عليه؟"
answer = nlp(question=question, context=context)
print(answer)
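# The question-answering pipeline returns a dict of the form
# {'score': ..., 'start': ..., 'end': ..., 'answer': ...}, where start/end are
# character offsets of the predicted answer within the context.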