user2434's picture
Update app.py
6a1e667
raw
history blame
3.44 kB
# -*- coding: utf-8 -*-
"""app.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1n1zTe_HIqsQ1JvPcV2S3i8-kjq5V4xJo
"""
#https://huggingface.co/spaces/user2434/SummarizedAbstract
# Import necessary libraries
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from gtts import gTTS
from io import BytesIO
import PyPDF2
# Function to extract abstract from PDF
def extract_abstract(pdf_path):
    """Extract the text of the page(s) holding a PDF's abstract.

    The first page whose text contains "Abstract" is taken as the start.
    Every following page is included until one contains a section marker
    ("Introduction", "Background", "1." or "I."); that page itself is
    excluded.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated text of the selected pages, or None when no page
        contains "Abstract". When no later page contains a section marker
        (for example in a single-page PDF), only the text of the page on
        which the abstract starts is returned.
    """
    section_markers = ("Introduction", "Background", "1.", "I.")
    abstract_pages = None  # stays None until the "Abstract" page is seen
    end_found = False

    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Single pass: each page is extracted once and its text reused.
        for page in reader.pages:
            # Treat a page that yields no text as empty instead of failing
            # on the substring checks below.
            page_text = page.extract_text() or ""
            if abstract_pages is None:
                if "Abstract" in page_text:
                    abstract_pages = [page_text]
                continue
            if any(marker in page_text for marker in section_markers):
                end_found = True
                break
            abstract_pages.append(page_text)

    if abstract_pages is None:
        return None
    if not end_found:
        # The end of the abstract is unknown, so keep just the page where it
        # begins rather than discarding the abstract altogether.
        abstract_pages = abstract_pages[:1]
    return ''.join(abstract_pages)
# Function to summarize abstract using a pre-trained model
_SUMMARIZER_CHECKPOINT = "pszemraj/led-base-book-summary"
_summarizer_cache = {}


def _load_summarizer():
    """Return the (tokenizer, model) pair, loading the checkpoint on first use only."""
    if "pair" not in _summarizer_cache:
        _summarizer_cache["pair"] = (
            AutoTokenizer.from_pretrained(_SUMMARIZER_CHECKPOINT),
            AutoModelForSeq2SeqLM.from_pretrained(_SUMMARIZER_CHECKPOINT),
        )
    return _summarizer_cache["pair"]


def summarize_abstract(text):
    """Summarize *text* into a single sentence with a pre-trained seq2seq model.

    The input is tokenized with truncation at 1000 tokens and passed to the
    model's ``generate`` method using beam search with sampling enabled, so
    the result may differ between calls.

    Args:
        text: The text to summarize.

    Returns:
        The decoded summary cut after its first period. If the decoded text
        contains no period it is returned unchanged.
    """
    # Loading the checkpoint is expensive; reuse it across calls.
    tokenizer, model = _load_summarizer()

    inputs = tokenizer(text, max_length=1000, return_tensors="pt", truncation=True)
    summary_ids = model.generate(
        inputs['input_ids'],
        max_length=40,
        min_length=20,
        no_repeat_ngram_size=3,
        encoder_no_repeat_ngram_size=3,
        repetition_penalty=2.0,
        num_beams=3,
        do_sample=True,
        early_stopping=False
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Keep only the first sentence (everything up to the first period).
    first_sentence, period, _ = summary.partition('.')
    if period:
        summary = first_sentence + '.'
    return summary
# Function to convert text to speech
def convert_to_speech(text):
    """Render *text* as English speech with gTTS.

    Args:
        text: The text to be spoken.

    Returns:
        The raw audio bytes that gTTS wrote to an in-memory buffer.
    """
    audio_stream = BytesIO()
    gTTS(text, lang='en').write_to_fp(audio_stream)
    # getvalue() yields the full buffer contents regardless of the cursor.
    return audio_stream.getvalue()
# Function to process PDF and generate summary
def process_pdf(pdf_path):
    """Summarize the abstract of a PDF and produce a spoken version of it.

    Args:
        pdf_path: Path of the uploaded PDF, or None when nothing was
            uploaded.

    Returns:
        Always a ``(summary_text, audio)`` pair so that both interface
        outputs receive a value. On success ``summary_text`` is the
        one-sentence summary and ``audio`` its spoken rendition. When the
        input is missing, no abstract is found, or no summary is produced,
        ``summary_text`` is an explanatory message and ``audio`` is None.
    """
    if pdf_path is None:
        return "Please upload a PDF file.", None

    abstract_text = extract_abstract(pdf_path)
    if not abstract_text:
        return "No abstract could be found in the uploaded PDF.", None

    # Only the first 1024 characters of the abstract are summarized.
    summary = summarize_abstract(abstract_text[:1024])
    if not summary:
        return "The abstract could not be summarized.", None

    return summary, convert_to_speech(summary)
# Define Gradio interface
# --- Gradio UI wiring ---
# Input component: a single uploaded PDF file.
inputs = gr.File(label="Upload a PDF with an abstract")

# Output components: the written summary and its spoken rendition.
summary_text = gr.Text(label="Written summary of the abstract")
audio_summary = gr.Audio(label="Audio summary of abstract")

# Build the interface around process_pdf and offer one example PDF.
iface = gr.Interface(
    title="Summarized Abstract",
    description="The app will summarize the abstract of a PDF and read it to the user.",
    fn=process_pdf,
    inputs=inputs,
    outputs=[summary_text, audio_summary],
    examples=["Article 11 Hidden Technical Debt in Machine Learning Systems.pdf"],
)

# Start serving the app.
iface.launch()