| import gradio as gr |
| from PyPDF2 import PdfReader |
| from transformers import AutoModelForSeq2SeqLM, AutoTokenizer |
| from gtts import gTTS |
| from io import BytesIO |
| import re |
| import os |
|
|
# Hugging Face checkpoint identifier shared by the model and its tokenizer.
model_name = "ArtifactAI/led_large_16384_arxiv_summarization"

# Both objects are created once at import time and reused by every request
# handled later in this module.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
|
# A sentence boundary is a single whitespace character that
#   * directly follows '.', '?' or '!', and
#   * is not preceded by a dotted abbreviation such as "e.g." (first
#     look-behind) or a short title such as "Dr." (second look-behind).
_SENTENCE_BOUNDARY = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!])\s')


def extract_first_sentence(text):
    """Return the first sentence of *text*.

    The text is cut at the first sentence boundary (see
    ``_SENTENCE_BOUNDARY``). When no boundary exists, including for an
    empty string, *text* is returned unchanged.

    Args:
        text: The string to take the leading sentence from.

    Returns:
        The part of *text* before the first sentence boundary, with its
        terminating punctuation kept and the separating whitespace dropped.
    """
    # split() always yields at least one element, so indexing [0] is safe;
    # maxsplit=1 stops scanning once the first boundary has been found.
    return _SENTENCE_BOUNDARY.split(text, maxsplit=1)[0]
|
|
def _find_abstract(page_texts):
    """Return the abstract found on the first page that mentions "Abstract".

    Args:
        page_texts: Iterable with the extracted text of each page, in order.

    Returns:
        The text following the first case-insensitive "Abstract" heading, up
        to the next "Introduction", "Methodology" or "Conclusion" heading on
        the same page (or to the end of that page). An empty string when no
        page contains the word "Abstract".
    """
    for text in page_texts:
        if not text:
            # Guard against a page that yields no text (empty string or None).
            continue

        abstract_match = re.search(r'\bAbstract\b', text, re.IGNORECASE)
        if abstract_match is None:
            continue

        start_index = abstract_match.end()
        # The end marker is searched only in the remainder of this page and,
        # unlike the start marker, is matched case-sensitively.
        next_section_match = re.search(
            r'\b(?:Introduction|Methodology|Conclusion)\b', text[start_index:]
        )
        if next_section_match:
            end_index = start_index + next_section_match.start()
            return text[start_index:end_index]
        return text[start_index:]

    return ''


def extract_abstract_and_summarize(pdf_file):
    """Extract a PDF's abstract, summarize it in one sentence and voice it.

    Args:
        pdf_file: Path of the PDF to read, or an object exposing that path
            through a ``name`` attribute.

    Returns:
        A 3-tuple ``(summary_sentence, audio_bytes, abstract_text)``: the
        first sentence of the generated summary, the gTTS audio for that
        sentence as raw bytes, and the stripped abstract text.

    Raises:
        Exception: For any failure, including a PDF in which no abstract can
            be located. The message is that of the underlying error, which
            stays attached as ``__cause__``.
    """
    try:
        # NOTE(review): depending on the Gradio version, gr.File may hand over
        # a temp-file wrapper instead of a path string - confirm. Plain
        # strings have no ``name`` attribute and are used as-is.
        pdf_path = getattr(pdf_file, 'name', pdf_file)

        with open(pdf_path, 'rb') as file:
            pdf_reader = PdfReader(file)
            # Consumed inside the ``with`` so pages are read while the file
            # is still open.
            abstract_text = _find_abstract(
                page.extract_text() for page in pdf_reader.pages
            )

        if not abstract_text.strip():
            # Fail with a clear message instead of summarizing empty input.
            raise ValueError("No abstract could be found in the uploaded PDF.")

        inputs = tokenizer(abstract_text, return_tensors="pt")
        outputs = model.generate(**inputs)
        # skip_special_tokens drops tokenizer control markers so they are
        # neither displayed nor read aloud.
        summary = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

        summary_sentence = extract_first_sentence(summary)

        # Render the sentence to speech in memory.
        speech = gTTS(text=summary_sentence, lang="en")
        speech_bytes = BytesIO()
        speech.write_to_fp(speech_bytes)

        return summary_sentence, speech_bytes.getvalue(), abstract_text.strip()

    except Exception as e:
        # Callers still receive a plain Exception carrying the original
        # message; chaining keeps the original traceback for debugging.
        raise Exception(str(e)) from e
|
|
# Gradio UI: one PDF upload in; summary text, spoken summary and the
# extracted abstract out.
interface = gr.Interface(
    fn=extract_abstract_and_summarize,
    inputs=[gr.File(label="Upload PDF")],
    # extract_abstract_and_summarize returns three values (summary sentence,
    # audio bytes, abstract text), so three output components are declared,
    # in the same order.
    outputs=[
        gr.Textbox(label="Summary"),
        gr.Audio(),
        gr.Textbox(label="Abstract"),
    ],
    title="PDF Summarization & Audio Tool",
    description="""PDF Summarization App. This app extracts the abstract from a PDF, summarizes it in one sentence, and generates an audio of it. Only upload PDFs with abstracts.
Please read the README.MD for information about the app and sample PDFs.""",
    # The sample PDF is looked up in the directory containing this file.
    examples=[os.path.join(os.path.dirname(__file__), "Article 11 Hidden Technical Debt in Machine Learning Systems.pdf")],
    cache_examples=True,
)


# Start the app when the module is executed; share=True requests a public link.
interface.launch(share=True)