"""Summarization backend: runs a selected Hugging Face seq2seq model on input text."""

import logging
import os
import re
from functools import lru_cache
from urllib.parse import unquote
import streamlit as st
from codetiming import Timer
from transformers import pipeline
from arabert.preprocess import ArabertPreprocessor
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
import tokenizers
import re
import heapq
from string import punctuation
import nltk
from nltk.corpus import stopwords
import download

# Fetch the NLTK resources at import time (each call may hit the network).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Treat the newline character as punctuation as well.
punctuation = punctuation + '\n'

logger = logging.getLogger(__name__)

os.environ["TOKENIZERS_PARALLELISM"] = "false"

logger.info("Loading models...")
# NOTE(review): nothing happens between start() and stop(), so this timer
# reports ~0 s; the models are actually loaded lazily on the first request.
reader_time = Timer("loading", text="Time: {:.2f}", logger=logging.info)
reader_time.start()
reader_time.stop()
logger.info("Finished loading the models...")
logger.info(f"Time spent loading: {reader_time.last}")

# Maps the model name accepted by get_results() to its Hugging Face Hub repo.
_SUMMARIZATION_MODELS = {
    "arabartsummarization": "abdalrahmanshahrour/arabartsummarization",
    "AraBART": "abdalrahmanshahrour/AraBART-summ",
    "auto-arabic-summarization": "abdalrahmanshahrour/auto-arabic-summarization",
}


# maxsize=1 keeps only the most recently used model resident: repeated
# requests with the same model skip the expensive reload, while memory use
# stays bounded to a single model.
@lru_cache(maxsize=1)
def _load_summarizer(model_name):
    """Build the text2text-generation pipeline for the Hub repo *model_name*."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return pipeline("text2text-generation", model=model, tokenizer=tokenizer)


@lru_cache(maxsize=200)
def get_results(text, model_selected, num_beams, length_penalty):
    """Summarize *text* with the selected model and return the generated string.

    Results are memoized on the full argument tuple, so every argument must
    be hashable.

    Args:
        text: Input text to summarize.
        model_selected: One of ``"arabartsummarization"``, ``"AraBART"`` or
            ``"auto-arabic-summarization"``.
        num_beams: Beam count forwarded to generation.
        length_penalty: Length penalty forwarded to generation.

    Returns:
        The generated text, or the fixed message ``"الرجاء اختيار نموذج"``
        when *model_selected* is not a supported model name.
    """
    logger.info("\n=================================================================")
    logger.info(f"Text: {text}")
    logger.info(f"model_selected: {model_selected}")
    logger.info(f"length_penalty: {length_penalty}")

    reader_time = Timer("summarize", text="Time: {:.2f}", logger=logging.info)
    reader_time.start()

    logger.info(f"input length: {len(text.split())}")

    model_name = _SUMMARIZATION_MODELS.get(model_selected)
    if model_name is None:
        result = "الرجاء اختيار نموذج"
    else:
        summarizer = _load_summarizer(model_name)
        result = summarizer(
            text,
            pad_token_id=summarizer.tokenizer.eos_token_id,
            num_beams=num_beams,
            repetition_penalty=3.0,
            max_length=200,
            length_penalty=length_penalty,
            no_repeat_ngram_size=3,
        )[0]['generated_text']
        logger.info(model_selected)

    reader_time.stop()
    logger.info(f"Time spent summarizing: {reader_time.last}")
    return result


if __name__ == "__main__":
    results_dict = ""