Spaces:

user2434
/

SummarizedAbstract

Sleeping

App Files Files

SummarizedAbstract / app.py

user2434

Update app.py

6a1e667 11 months ago

raw

history blame

3.44 kB

	# -*- coding: utf-8 -*-
	"""app.ipynb

	Automatically generated by Colaboratory.

	Original file is located at
	https://colab.research.google.com/drive/1n1zTe_HIqsQ1JvPcV2S3i8-kjq5V4xJo
	"""
	#https://huggingface.co/spaces/user2434/SummarizedAbstract
	# Import necessary libraries
	import functools
	import re
	from io import BytesIO

	import gradio as gr
	import PyPDF2
	from gtts import gTTS
	from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

	# Function to extract abstract from PDF
	def extract_abstract(pdf_path):
	    """Extract the abstract section of a PDF as plain text.

	    The first page whose text contains "Abstract" marks the start.  The
	    abstract is taken to run until the first later page whose text contains
	    a section marker ("Introduction", "Background", "1." or "I."), or to the
	    end of the document when no later page contains one.

	    Args:
	        pdf_path: Path of the PDF file to open.

	    Returns:
	        The extracted text, beginning at the word "Abstract", or ``None``
	        when no page contains "Abstract".
	    """
	    section_markers = ("Introduction", "Background", "1.", "I.")

	    # Everything happens inside the ``with`` block because the reader pulls
	    # page content from the open file handle.
	    with open(pdf_path, 'rb') as file:
	        reader = PyPDF2.PdfReader(file)

	        abstract_start = None
	        collected = []
	        for page_num, page in enumerate(reader.pages):
	            # ``or ""`` keeps the substring tests safe for pages that yield
	            # no text at all.
	            page_text = page.extract_text() or ""
	            heading_pos = page_text.find("Abstract")
	            if heading_pos != -1:
	                abstract_start = page_num
	                # Drop whatever precedes the heading on that page so the
	                # result starts with the abstract itself.
	                collected.append(page_text[heading_pos:])
	                break

	        if abstract_start is None:
	            return None

	        # Gather the following pages up to (not including) the first page
	        # that carries a section marker.  Reaching the end of the document
	        # without a marker still returns what was collected instead of
	        # discarding an abstract that was found.
	        for page in reader.pages[abstract_start + 1:]:
	            page_text = page.extract_text() or ""
	            if any(marker in page_text for marker in section_markers):
	                break
	            collected.append(page_text)

	        return ''.join(collected)

	# Function to summarize abstract using a pre-trained model
	_SUMMARIZER_NAME = "pszemraj/led-base-book-summary"


	@functools.lru_cache(maxsize=1)
	def _load_summarizer():
	    """Load the tokenizer and model once and reuse them on later calls."""
	    tokenizer = AutoTokenizer.from_pretrained(_SUMMARIZER_NAME)
	    model = AutoModelForSeq2SeqLM.from_pretrained(_SUMMARIZER_NAME)
	    return tokenizer, model


	def _first_sentence(text):
	    """Return *text* cut after its first sentence-ending period."""
	    # A period ends a sentence only when followed by whitespace or the end
	    # of the text, so decimals such as "3.5" are not split in the middle.
	    match = re.search(r'\.(?=\s|$)', text)
	    return text[:match.end()] if match else text


	def summarize_abstract(text):
	    """Summarize *text* with a pre-trained model and keep the first sentence.

	    Args:
	        text: The text to summarize.

	    Returns:
	        The first sentence of the generated summary (the whole summary when
	        it contains no sentence-ending period), or an empty string when
	        *text* is empty or only whitespace.
	    """
	    # Nothing to summarize: skip loading and running the model.
	    if not text or not text.strip():
	        return ""

	    tokenizer, model = _load_summarizer()
	    inputs = tokenizer(text, max_length=1000, return_tensors="pt", truncation=True)
	    summary_ids = model.generate(
	        inputs['input_ids'],
	        # Pass the mask explicitly instead of leaving it to be inferred.
	        attention_mask=inputs['attention_mask'],
	        max_length=40,
	        min_length=20,
	        no_repeat_ngram_size=3,
	        encoder_no_repeat_ngram_size=3,
	        repetition_penalty=2.0,
	        num_beams=3,
	        do_sample=True,
	        early_stopping=False,
	    )
	    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

	    return _first_sentence(summary)

	# Function to convert text to speech
	def convert_to_speech(text):
	    """Render *text* as English speech and return the audio as bytes.

	    gTTS writes the audio into an in-memory buffer, whose full contents
	    are returned.
	    """
	    audio_stream = BytesIO()
	    gTTS(text, lang='en').write_to_fp(audio_stream)
	    # getvalue() yields the entire buffer regardless of the write position.
	    return audio_stream.getvalue()

	# Function to process PDF and generate summary
	def process_pdf(pdf_path):
	    """Summarize the abstract of an uploaded PDF and read the summary aloud.

	    Args:
	        pdf_path: The uploaded PDF as delivered by the file input, or
	            ``None`` when nothing was uploaded.

	    Returns:
	        A ``(text, audio)`` pair.  On success ``text`` is the summary and
	        ``audio`` is the spoken summary; otherwise ``text`` explains what
	        went wrong and ``audio`` is ``None``.
	    """
	    # Every path returns a pair: this function feeds two output components,
	    # so falling through with a bare ``None`` would not supply both values.
	    if pdf_path is None:
	        return "Please upload a PDF file.", None

	    abstract_text = extract_abstract(pdf_path)
	    if not abstract_text:
	        return "No abstract could be found in this PDF.", None

	    # Only the first 1024 characters of the abstract are summarized.
	    summary = summarize_abstract(abstract_text[:1024])
	    if not summary:
	        return "The abstract could not be summarized.", None

	    return summary, convert_to_speech(summary)

	# Define Gradio interface
	# Widgets for the uploaded document and for the two generated results.
	inputs = gr.File(label="Upload a PDF with an abstract")
	summary_text = gr.Text(label="Written summary of the abstract")
	audio_summary = gr.Audio(label="Audio summary of abstract")

	# Static text and the sample document offered in the UI.
	APP_TITLE = "Summarized Abstract"
	APP_DESCRIPTION = "The app will summarize the abstract of a PDF and read it to the user."
	EXAMPLE_PDFS = ["Article 11 Hidden Technical Debt in Machine Learning Systems.pdf"]

	# Wire process_pdf to the file input and to the text and audio outputs.
	iface = gr.Interface(
	    fn=process_pdf,
	    inputs=inputs,
	    outputs=[summary_text, audio_summary],
	    title=APP_TITLE,
	    description=APP_DESCRIPTION,
	    examples=EXAMPLE_PDFS,
	)

	# Launch the Gradio interface.
	iface.launch()