File size: 2,110 Bytes
6a82282 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 | """Build a `role: "document gliner_<source>"` chat message from a
GLiNER extraction list.
The doc body is a labeled list of extractions:
source: <pdf_id>
paragraph_excerpt: "<first 240 chars of the source paragraph>"
extractions:
- [agency] NYC DEP (score=0.97)
- [dollar_amount] $22.5 million (score=0.94)
- [date_range] FY 2025-2027 (score=0.91)
- [nyc_location] Hollis (score=0.88)
- [infrastructure_project] Bluebelt expansion (score=0.93)
This is structured enough that Granite 4.1 grounds against the typed
fields ("DEP allocated $22.5 million for the Bluebelt expansion in
Hollis"), and the doc_id tag naming by source PDF means [gliner_dep]
or [gliner_comptroller] resolves cleanly through the existing Mellea
citations_resolve check.
"""
from __future__ import annotations
# Prompt fragment telling the model how to cite the GLiNER extractions:
# at least one [agency], [dollar_amount], or [infrastructure_project]
# value, referenced through its parent [gliner_<source>] tag, and no
# values that are absent from the extractions list. The backslash after
# the opening quotes keeps the string from starting with a newline.
# NOTE(review): where this fragment is spliced into the full system
# prompt is not visible in this file — confirm with the caller.
SYSTEM_PROMPT_FRAGMENT = """\
You will be given GLiNER-extracted typed entities tagged
[gliner_<source>]. Cite at least one specific [agency], [dollar_amount],
or [infrastructure_project] from the extractions, using its parent
[gliner_<source>] tag. Do not invent values that aren't in the
extractions list.
"""
def make_doc(source_id: str, paragraph: str, extractions) -> dict:
    """Construct the {role, content} message for the reconciler.

    Args:
        source_id: short slug like "comptroller", "dep", "mta", "nycha",
            "coned" — must match [a-z][a-z0-9_]* so the doc_id appears in
            the Mellea citations check. It is not validated here.
        paragraph: source paragraph text. It is stripped, its newlines are
            replaced by spaces, and at most the first 240 characters are
            quoted; an ellipsis marks a truncated excerpt.
        extractions: iterable of objects exposing ``label``, ``text`` and
            a numeric ``score`` attribute.

    Returns:
        A dict whose ``role`` is ``"document gliner_<source_id>"`` and
        whose ``content`` is the newline-joined source line, quoted
        excerpt, and one row per extraction.
    """
    flattened = paragraph.strip().replace("\n", " ")
    excerpt = flattened[:240]
    # Decide on the ellipsis from the cleaned text, not the raw paragraph:
    # whitespace removed by strip() must not count as truncated content.
    if len(flattened) > 240:
        excerpt += "…"
    rows = [
        f"source: {source_id}",
        f"paragraph_excerpt: \"{excerpt}\"",
        "extractions:",
    ]
    rows.extend(
        f" - [{e.label}] {e.text} (score={e.score:.2f})" for e in extractions
    )
    return {"role": f"document gliner_{source_id}", "content": "\n".join(rows)}
def render_for_trace(source_id: str, extractions) -> dict:
    """Summarize an extraction list as a trace entry with per-label counts.

    Args:
        source_id: source slug; the entry label is ``gliner_<source_id>``.
        extractions: iterable of objects exposing a ``label`` attribute.
            One-shot iterables (e.g. generators) are supported because
            the total is tallied in the same pass as the per-label counts.

    Returns:
        A dict with ``label``, ``ok`` (always ``True``) and ``fields``.
        ``fields`` holds ``n_entities`` (total number of extractions)
        followed by one ``<label>: <count>`` entry per distinct label, in
        first-seen order.
    """
    counts = {}
    total = 0
    for e in extractions:
        counts[e.label] = counts.get(e.label, 0) + 1
        total += 1
    return {
        "label": f"gliner_{source_id}",
        "ok": True,
        "fields": {
            "n_entities": total,
            # NOTE: counts are spread last, so a label literally named
            # "n_entities" would overwrite the total above.
            **counts,
        },
    }
|