Spaces:
Sleeping
Sleeping
# -*- coding: utf-8 -*-
"""app.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1n1zTe_HIqsQ1JvPcV2S3i8-kjq5V4xJo
"""
#https://huggingface.co/spaces/user2434/SummarizedAbstract
# Import necessary libraries
import functools
from io import BytesIO

import gradio as gr
import PyPDF2
from gtts import gTTS
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Function to extract abstract from PDF
def extract_abstract(pdf_path):
    """Return the text of the PDF pages that hold the abstract.

    The abstract is taken to begin on the first page whose text contains
    "Abstract" and to continue up to, but not including, the first later
    page whose text contains one of the section markers ("Introduction",
    "Background", "1.", "I."). Whole pages are returned, not just the
    abstract paragraph.

    If no later page contains a section marker (for example a single-page
    PDF, or an abstract on the last page), every page from the abstract
    page to the end of the document is returned instead of discarding the
    abstract.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated text of the selected pages, or None when no page
        contains "Abstract".
    """
    section_markers = ("Introduction", "Background", "1.", "I.")

    # The reader pulls page data from the open file, so all extraction
    # has to happen while the file is still open.
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        collected = None  # Page texts from the abstract page onward.
        for page in reader.pages:
            # Extract each page once; guard against a page yielding no text.
            page_text = page.extract_text() or ""
            if collected is None:
                # Still looking for the page where the abstract starts.
                if "Abstract" in page_text:
                    collected = [page_text]
                continue
            # Past the abstract page: stop at the first page that looks
            # like the start of the next section (that page is excluded).
            if any(marker in page_text for marker in section_markers):
                break
            collected.append(page_text)

    if collected is None:
        return None
    return "".join(collected)
# Function to summarize abstract using a pre-trained model
_SUMMARIZER_CHECKPOINT = "pszemraj/led-base-book-summary"


@functools.lru_cache(maxsize=1)
def _load_summarizer():
    """Load the summarization tokenizer and model once and reuse them."""
    tokenizer = AutoTokenizer.from_pretrained(_SUMMARIZER_CHECKPOINT)
    model = AutoModelForSeq2SeqLM.from_pretrained(_SUMMARIZER_CHECKPOINT)
    return tokenizer, model


def summarize_abstract(text):
    """Summarize *text* into a single sentence with a pre-trained model.

    The input is tokenized and truncated to at most 1000 tokens, a summary
    of 20 to 40 tokens is generated, and only the first sentence of the
    decoded summary (everything up to and including the first period) is
    kept. Generation uses sampling, so repeated calls may differ.

    Args:
        text: The text to summarize.

    Returns:
        The summary string. If the decoded summary contains no period it
        is returned unchanged.
    """
    # Loading the checkpoint is expensive; reuse the cached pair instead
    # of re-instantiating tokenizer and model on every request.
    tokenizer, model = _load_summarizer()

    inputs = tokenizer(text, max_length=1000, return_tensors="pt", truncation=True)
    summary_ids = model.generate(
        inputs["input_ids"],
        # Pass the mask produced by the tokenizer explicitly rather than
        # leaving generate() to infer it from the input ids.
        attention_mask=inputs["attention_mask"],
        max_length=40,
        min_length=20,
        no_repeat_ngram_size=3,
        encoder_no_repeat_ngram_size=3,
        repetition_penalty=2.0,
        num_beams=3,
        do_sample=True,
        early_stopping=False,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Keep only the first sentence when the summary contains a period.
    first_sentence, period, _ = summary.partition(".")
    if period:
        summary = first_sentence + "."
    return summary
# Function to convert text to speech
def convert_to_speech(text):
    """Synthesize *text* as English speech and return the audio as bytes.

    Args:
        text: The text to be spoken.

    Returns:
        The bytes that gTTS wrote to an in-memory buffer.
    """
    audio_stream = BytesIO()
    gTTS(text, lang='en').write_to_fp(audio_stream)
    # getvalue() yields the whole buffer regardless of the stream position.
    return audio_stream.getvalue()
# Function to process PDF and generate summary
def process_pdf(pdf_path):
    """Summarize the abstract of a PDF and produce a spoken version of it.

    The function always returns a two-item tuple so that it matches the
    two output components of the Gradio interface, including on the
    failure paths (no file, no abstract, empty summary), where the first
    item is an explanatory message and the second is None.

    Args:
        pdf_path: The uploaded PDF as supplied by the file input, or None
            when nothing was uploaded.

    Returns:
        A ``(summary, audio)`` tuple, where ``audio`` is the value
        returned by ``convert_to_speech``; or ``(message, None)`` when no
        summary could be produced.
    """
    if pdf_path is None:
        return "Please upload a PDF file.", None

    abstract_text = extract_abstract(pdf_path)
    if not abstract_text:
        return "No abstract could be found in the uploaded PDF.", None

    # Only the first 1024 characters of the extracted text are summarized.
    summary = summarize_abstract(abstract_text[:1024])
    if not summary:
        return "The abstract could not be summarized.", None

    return summary, convert_to_speech(summary)
# Define Gradio interface
# UI components: one PDF upload as input, text plus audio as outputs.
inputs = gr.File(label="Upload a PDF with an abstract")
summary_text = gr.Text(label="Written summary of the abstract")
audio_summary = gr.Audio(label="Audio summary of abstract")

# Example document offered in the interface.
example_pdfs = ["Article 11 Hidden Technical Debt in Machine Learning Systems.pdf"]

# Build the interface around process_pdf.
iface = gr.Interface(
    fn=process_pdf,
    inputs=inputs,
    outputs=[summary_text, audio_summary],
    title="Summarized Abstract",
    description="The app will summarize the abstract of a PDF and read it to the user.",
    examples=example_pdfs,
)

# Start the app.
iface.launch()