# Author: seriffic
# Backend evolution: Phases 1-10 specialists + agentic FSM + Mellea + LiteLLM router
# (commit 6a82282)
"""GLiNER (urchade/gliner_medium-v2.1) structured extraction.
Runs the typed-NER model over a paragraph of policy text and emits a
list of typed extractions:
- nyc_location (e.g. "Coney Island", "Hunts Point")
- dollar_amount (e.g. "$5.6 million")
- date_range (e.g. "fiscal year 2025-2027")
- agency (e.g. "NYC DEP", "NYCHA")
- infrastructure_project (e.g. "Bluebelt expansion", "Newtown Creek
wastewater upgrade")
License: Apache-2.0 (NOT to be confused with `gliner_base`, which is
CC-BY-NC-4.0).
"""
from __future__ import annotations

import argparse
import json
import os
import sys
from dataclasses import asdict, dataclass
from functools import cache
from pathlib import Path
# Module-local cache directory (lives next to this file) for model weights.
CACHE = Path(__file__).parent / ".cache"
CACHE.mkdir(exist_ok=True)
# Point HuggingFace at our cache *before* gliner/transformers get imported
# (the import happens lazily in load_model).  setdefault lets an
# externally configured HF_HOME take precedence.
os.environ.setdefault("HF_HOME", str(CACHE / "hf"))

# Entity types requested from the model — see the module docstring for
# examples of each label.
ENTITY_LABELS = [
"nyc_location",
"dollar_amount",
"date_range",
"agency",
"infrastructure_project",
]

# Default minimum confidence passed to predict_entities; lower-scoring
# spans are dropped by the model call itself.
DEFAULT_THRESHOLD = 0.45
@dataclass
class Extraction:
    """One typed span extracted by GLiNER from the input paragraph."""

    label: str    # one of the requested entity labels (see ENTITY_LABELS)
    text: str     # surface text of the extracted span
    score: float  # model confidence — presumably in [0, 1]; TODO confirm GLiNER's range
    start: int    # character offset of the span start within the paragraph
    end: int      # character offset of the span end — assumed exclusive; verify against GLiNER
@cache
def load_model():
    """Load (and memoize) the GLiNER medium v2.1 model.

    The ``gliner`` import is deferred so merely importing this module does
    not pull in the heavy gliner/torch stack.  ``@cache`` ensures the
    checkpoint is downloaded and deserialized at most once per process;
    subsequent calls return the same model instance.

    Returns:
        A ready-to-use ``GLiNER`` instance.
    """
    from gliner import GLiNER

    # Reuse the module-local cache dir so repeated runs skip the download.
    return GLiNER.from_pretrained(
        "urchade/gliner_medium-v2.1",
        cache_dir=str(CACHE / "hf"),
    )
def extract(
    model,
    paragraph: str,
    threshold: float = DEFAULT_THRESHOLD,
    labels: list[str] | None = None,
) -> list[Extraction]:
    """Run the typed-NER model over *paragraph* and return typed spans.

    Args:
        model: A loaded GLiNER instance (see ``load_model``).
        paragraph: Policy text to scan.
        threshold: Minimum confidence; lower-scoring spans are filtered
            by the model call itself.
        labels: Entity labels to extract.  ``None`` (or an empty list,
            matching the original fallback) means ``ENTITY_LABELS``.

    Returns:
        One ``Extraction`` per predicted span, in model output order.
    """
    # `or` (not `is None`) so an empty list also falls back to the defaults.
    raw = model.predict_entities(paragraph, labels or ENTITY_LABELS, threshold=threshold)
    return [
        Extraction(
            label=r["label"],
            text=r["text"],
            score=float(r["score"]),
            start=int(r["start"]),
            end=int(r["end"]),
        )
        for r in raw
    ]
def main() -> int:
    """CLI entry point: extract entities from one paragraph of text.

    Prints the extractions as a JSON array on stdout.

    Returns:
        Process exit code (0 on success).
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--text", required=True, help="Paragraph to extract from")
    ap.add_argument(
        "--threshold",
        type=float,
        default=DEFAULT_THRESHOLD,
        help="Minimum confidence score (default: %(default)s)",
    )
    ap.add_argument(
        "--labels",
        nargs="*",
        default=None,
        help="Override the entity labels (default: built-in NYC policy set)",
    )
    args = ap.parse_args()

    model = load_model()
    out = extract(model, args.text, threshold=args.threshold, labels=args.labels)
    # ensure_ascii=False keeps non-ASCII place names readable in the output.
    print(json.dumps([asdict(x) for x in out], indent=2, ensure_ascii=False))
    return 0
if __name__ == "__main__":
    # Propagate main()'s return code so shell callers can detect failure.
    sys.exit(main())