# Commit 6a82282 — Backend evolution: Phases 1-10 specialists + agentic FSM
# + Mellea + LiteLLM router. (Scraped page header — author avatar line —
# converted to a comment so the file parses.)
"""Pull text from one of the corpus PDFs and run GLiNER over the
top-K paragraphs whose density of typed entities looks highest.
Phase 2 scope: prove the specialist plumbing works end-to-end on real
corpus content. We don't yet rank paragraphs by query relevance — that
job belongs to the existing Granite Embedding 278M retriever
(specialist 13 in the FSM). In production this specialist would
consume retriever output, not raw PDFs.
"""
from __future__ import annotations
import argparse
import json
import re
import sys
from pathlib import Path
CORPUS = Path(__file__).resolve().parents[2] / "corpus"
from extract import extract, load_model # noqa: E402
def read_pdf_paragraphs(path: Path, min_chars: int = 200) -> list[str]:
    """Extract text from the PDF at *path* and return its paragraphs.

    Paragraphs are split on blank lines, whitespace-normalised to single
    spaces, and filtered down to those at least *min_chars* characters
    long.
    """
    from pypdf import PdfReader

    pages = PdfReader(str(path)).pages
    full_text = "\n".join(page.extract_text() or "" for page in pages)
    # Naive paragraph split; the PDFs are scanned/extracted text, so
    # paragraphs are separated by blank lines or end-of-line patterns
    # that look like sentence-final punctuation followed by a newline.
    normalised = (
        re.sub(r"\s+", " ", chunk).strip()
        for chunk in re.split(r"\n\s*\n", full_text)
    )
    return [para for para in normalised if len(para) >= min_chars]
def main() -> int:
    """CLI entry point: run GLiNER over the longest paragraphs of a PDF.

    Returns a process exit status: 0 on success, 1 when no paragraph in
    the PDF clears the minimum-length floor.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--pdf", required=True,
                    help="Path under corpus/, e.g. mta_resilience_2025.pdf")
    ap.add_argument("--top", type=int, default=3,
                    help="Run GLiNER on the top-N paragraphs by length")
    ap.add_argument("--threshold", type=float, default=0.45)
    # Generalized: the length floor used to be implicit (the helper's
    # default of 200) while the error message repeated the literal "200",
    # so the two could silently drift apart. Now both come from one flag.
    ap.add_argument("--min-chars", type=int, default=200,
                    help="Ignore paragraphs shorter than this many characters")
    args = ap.parse_args()

    # Relative --pdf values are resolved against the corpus directory.
    pdf_path = Path(args.pdf)
    if not pdf_path.is_absolute():
        pdf_path = CORPUS / args.pdf

    paras = read_pdf_paragraphs(pdf_path, min_chars=args.min_chars)
    if not paras:
        print(f"No paragraphs ≥{args.min_chars} chars found in {pdf_path}",
              file=sys.stderr)
        return 1

    # "Top-N by length" is a placeholder ranker — see module docstring.
    top = sorted(paras, key=len, reverse=True)[:args.top]

    print("Loading GLiNER (~150 MB)…", file=sys.stderr)
    model = load_model()

    out = []
    for p in top:
        ents = extract(model, p, threshold=args.threshold)
        out.append({
            # Truncate the paragraph preview to keep the JSON readable.
            "paragraph": p[:400] + ("…" if len(p) > 400 else ""),
            "n_entities": len(ents),
            "entities": [
                {"label": e.label, "text": e.text,
                 "score": round(e.score, 3)}
                for e in ents
            ],
        })
    print(json.dumps(out, indent=2))
    return 0
if __name__ == "__main__":
    # Propagate main()'s int return value as the process exit status.
    sys.exit(main())