| Hugging Face's logo |
| Hugging Face |
| Models |
| Datasets |
| Spaces |
| Community |
| Docs |
| Enterprise |
| Pricing |
| |
| |
| |
| Hugging Face is way more fun with friends and colleagues! 🤗 Join an organization |
| Spaces: |
| |
| aneesarom |
| / |
| PDF-Text-Extractor |
| |
| |
| like |
| 0 |
| |
| Logs |
| App |
| Files |
| Community |
| Settings |
| PDF-Text-Extractor |
| / |
| app.py |
| |
| aneesarom's picture |
| aneesarom |
| Update app.py |
| 4d17112 |
| verified |
| 5 days ago |
| raw |
|
|
| Copy download link |
| history |
| blame |
| edit |
| delete |
|
|
| 2.54 kB |
| import json |
| import gradio as gr |
| import pdfplumber |
| import requests |
| from io import BytesIO |
|
|
def read_pdf_from_url(url: str) -> dict:
    """Download a PDF from a direct URL and extract its text page by page.

    Args:
        url: A URL that points directly to a PDF file. Leading and
            trailing whitespace is ignored.

    Returns:
        A JSON-serializable dictionary. On success it contains:
            - url (str): The PDF URL that was fetched.
            - page_count (int): Number of pages in the PDF.
            - content (str): Extracted text, each page that yielded text
              prefixed with a ``[Page N]`` marker, or a placeholder
              message when no page yields any text.
        On failure it contains only:
            - error (str): Description of what went wrong.
    """
    invalid_url = {"error": "Invalid URL. Must start with http:// or https://"}

    # Validate before touching the network so bad input never triggers a
    # request. A bare "http" prefix test would also accept things like
    # "httpx://..." or "httpfoo", so check the full scheme separators.
    if not isinstance(url, str):
        return invalid_url
    url = url.strip()
    if not url.startswith(("http://", "https://")):
        return invalid_url

    try:
        # NOTE(security): this fetches whatever URL the caller supplies and
        # reads the whole body into memory. If the app is deployed where
        # internal addresses must not be reachable, or where very large
        # downloads are a concern, add host/size restrictions here.
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        # Every PDF starts with the "%PDF-" magic bytes; reject HTML error
        # pages and other payloads before handing them to the parser.
        if not response.content.startswith(b"%PDF-"):
            return {"error": "URL does not point to a valid PDF file"}

        page_chunks = []
        with pdfplumber.open(BytesIO(response.content)) as pdf:
            # Capture the count while the document is still open rather
            # than reading pdf.pages after the context manager has exited.
            page_count = len(pdf.pages)
            for page_num, page in enumerate(pdf.pages, start=1):
                page_text = page.extract_text()
                if page_text:
                    page_chunks.append(f"[Page {page_num}]\n{page_text}\n\n")

        content = "".join(page_chunks).strip()
        return {
            "url": url,
            "page_count": page_count,
            "content": content or "No text found in PDF.",
        }
    except Exception as e:
        # UI boundary: report any download/parse failure as an error payload
        # instead of letting the exception propagate to the caller.
        return {"error": str(e)}
|
|
| |
# Sample PDF links offered as examples in the UI. Each URL is wrapped in its
# own single-element list, giving a list of rows with one value per row.
example_urls = [
    [pdf_link]
    for pdf_link in (
        "https://education.github.com/git-cheat-sheet-education.pdf",
        "https://github.com/tpn/pdfs/raw/master/A%20Journey%20in%20Creating%20an%20Operating%20System%20Kernel%20-%20The%20539Kernel%20Book%20(Nov%202022).pdf",
    )
]
|
|
| |
# Gradio UI: one text box whose value is passed to read_pdf_from_url, with the
# returned dictionary rendered in a JSON component.
demo = gr.Interface(
    # Page header text.
    title="PDF Text Extractor From Url",
    description=(
        "Provide a URL that directly points to a PDF file (from any server). "
        "The server fetches the PDF and extracts the text content, returning it in JSON format."
    ),
    # Wiring: handler plus its input and output components.
    fn=read_pdf_from_url,
    inputs=gr.Textbox(
        placeholder="Enter a direct PDF URL (e.g., GitHub raw link)",
        label="PDF URL",
    ),
    outputs=gr.JSON(label="Extracted Text"),
    # Example rows are listed but not pre-computed, and flagging is turned off.
    examples=example_urls,
    cache_examples=False,
    flagging_mode="never",
)


if __name__ == "__main__":
    # Only start the app when run as a script. mcp_server=True is forwarded
    # to launch(); per Gradio's docs this also serves the app as an MCP server.
    demo.launch(mcp_server=True)
| |
|
|