Boka73's picture
Deploy Gradio app
dd6303a verified
"""
PDF Reader
----------
Low-level utility: opens a PDF and returns its text page by page.
Uses pdfplumber for clean text extraction; falls back to PyMuPDF if needed.
"""
import re
from pathlib import Path
from typing import List, Dict
def read_pdf_pages(pdf_path: str) -> List[str]:
    """Return the text of every page in a PDF, one string per page.

    pdfplumber is tried first; PyMuPDF (``fitz``) is used only when
    pdfplumber cannot be imported.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        A list with one entry per page, in document order. On the
        pdfplumber path a page with no extractable text yields ``""``.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
        ImportError: If neither pdfplumber nor PyMuPDF can be imported.
    """
    if not Path(pdf_path).exists():
        raise FileNotFoundError(f"PDF not found: {pdf_path}")

    # Guard only the import itself: an ImportError raised *during*
    # extraction is a real failure and must not be mistaken for
    # "library not installed".
    try:
        import pdfplumber
    except ImportError:
        pdfplumber = None

    if pdfplumber is not None:
        with pdfplumber.open(pdf_path) as pdf:
            # A falsy extract_text() result (e.g. None) becomes "" so the
            # list always holds strings.
            return [page.extract_text() or "" for page in pdf.pages]

    # Fallback: PyMuPDF (fitz)
    try:
        import fitz
    except ImportError as exc:
        raise ImportError(
            "Neither pdfplumber nor PyMuPDF is installed.\n"
            "Run: pip install pdfplumber OR pip install pymupdf"
        ) from exc

    doc = fitz.open(pdf_path)
    try:
        return [page.get_text() for page in doc]
    finally:
        # Close the document even if text extraction fails part-way.
        doc.close()
def read_pdf_text(pdf_path: str) -> str:
    """Return the whole PDF as one string, with pages separated by newlines.

    Delegates to ``read_pdf_pages`` and joins its per-page strings with
    ``"\\n"``; any exception raised there propagates unchanged.
    """
    page_texts = read_pdf_pages(pdf_path)
    return "\n".join(page_texts)
def extract_tables_from_pdf(pdf_path: str) -> List[List[List[List[str]]]]:
    """Extract tables page by page using pdfplumber.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        A list with one entry per page. Each entry is a list of the
        tables found on that page (empty when there are none); each table
        is a list of rows, and each row is a list of cell values.
        NOTE(review): pdfplumber may report empty cells as ``None`` rather
        than ``""`` -- confirm before relying on every cell being a str.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
        ImportError: If pdfplumber cannot be imported.
    """
    if not Path(pdf_path).exists():
        raise FileNotFoundError(f"PDF not found: {pdf_path}")

    # Guard only the import: an ImportError raised while extracting must
    # not be re-reported as "pdfplumber is required".
    try:
        import pdfplumber
    except ImportError as exc:
        raise ImportError(
            "pdfplumber is required for table extraction. Run: pip install pdfplumber"
        ) from exc

    with pdfplumber.open(pdf_path) as pdf:
        # A falsy extract_tables() result becomes [] so every page
        # contributes a list.
        return [page.extract_tables() or [] for page in pdf.pages]
def clean_text(text: str) -> str:
    """Collapse whitespace runs to single spaces and trim both ends.

    Every run of one or more whitespace characters (spaces, tabs,
    newlines, ...) is replaced by one space, then leading and trailing
    whitespace is stripped. No other characters are altered.
    """
    whitespace_run = re.compile(r"\s+")
    collapsed = whitespace_run.sub(" ", text)
    return collapsed.strip()