# Commit 6a82282 — Backend evolution: Phases 1-10 specialists + agentic FSM
# + Mellea + LiteLLM router. (Scraped page header — author avatar line —
# converted to a comment so the file parses.)
"""Pull text from one of the corpus PDFs and run GLiNER over the
top-K paragraphs whose density of typed entities looks highest.
Phase 2 scope: prove the specialist plumbing works end-to-end on real
corpus content. We don't yet rank paragraphs by query relevance — that
job belongs to the existing Granite Embedding 278M retriever
(specialist 13 in the FSM). In production this specialist would
consume retriever output, not raw PDFs.
"""
from __future__ import annotations
import argparse
import json
import re
import sys
from pathlib import Path
CORPUS = Path(__file__).resolve().parents[2] / "corpus"
from extract import extract, load_model # noqa: E402
def read_pdf_paragraphs(path: Path, min_chars: int = 200) -> list[str]:
    """Extract text from the PDF at *path* and return its paragraphs.

    Paragraphs are split on blank lines, whitespace-normalised to single
    spaces, and filtered down to those at least *min_chars* characters
    long.
    """
    from pypdf import PdfReader

    pages = PdfReader(str(path)).pages
    full_text = "\n".join(page.extract_text() or "" for page in pages)
    # Naive paragraph split; the PDFs are scanned/extracted text, so
    # paragraphs are separated by blank lines or end-of-line patterns
    # that look like sentence-final punctuation followed by a newline.
    normalised = (
        re.sub(r"\s+", " ", chunk).strip()
        for chunk in re.split(r"\n\s*\n", full_text)
    )
    return [para for para in normalised if len(para) >= min_chars]
def main() -> int:
    """CLI entry point: run GLiNER over the longest paragraphs of a PDF.

    Returns a process exit status: 0 on success, 1 when no paragraph in
    the PDF clears the minimum-length floor.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--pdf", required=True,
                    help="Path under corpus/, e.g. mta_resilience_2025.pdf")
    ap.add_argument("--top", type=int, default=3,
                    help="Run GLiNER on the top-N paragraphs by length")
    ap.add_argument("--threshold", type=float, default=0.45)
    # Generalized: the length floor used to be implicit (the helper's
    # default of 200) while the error message repeated the literal "200",
    # so the two could silently drift apart. Now both come from one flag.
    ap.add_argument("--min-chars", type=int, default=200,
                    help="Ignore paragraphs shorter than this many characters")
    args = ap.parse_args()

    # Relative --pdf values are resolved against the corpus directory.
    pdf_path = Path(args.pdf)
    if not pdf_path.is_absolute():
        pdf_path = CORPUS / args.pdf

    paras = read_pdf_paragraphs(pdf_path, min_chars=args.min_chars)
    if not paras:
        print(f"No paragraphs ≥{args.min_chars} chars found in {pdf_path}",
              file=sys.stderr)
        return 1

    # "Top-N by length" is a placeholder ranker — see module docstring.
    top = sorted(paras, key=len, reverse=True)[:args.top]

    print("Loading GLiNER (~150 MB)…", file=sys.stderr)
    model = load_model()

    out = []
    for p in top:
        ents = extract(model, p, threshold=args.threshold)
        out.append({
            # Truncate the paragraph preview to keep the JSON readable.
            "paragraph": p[:400] + ("…" if len(p) > 400 else ""),
            "n_entities": len(ents),
            "entities": [
                {"label": e.label, "text": e.text,
                 "score": round(e.score, 3)}
                for e in ents
            ],
        })
    print(json.dumps(out, indent=2))
    return 0
if __name__ == "__main__":
    # Propagate main()'s int return value as the process exit status.
    sys.exit(main())