| """get_extraction tool — fetch a single document's extracted structured data.""" |
|
|
| from __future__ import annotations |
|
|
| import json |
|
|
| from langchain_core.tools import tool |
|
|
| from tools.context import ChatToolContext |
|
|
|
|
def build_get_extraction_tool(ctx: ChatToolContext):
    """Build the ``get_extraction`` tool bound to the given chat context.

    The returned tool looks a document up via ``ctx.get_document`` and
    answers with one of:

    * a plain-text "not found" notice that also lists ``ctx.list_filenames()``;
    * a plain-text notice that the document has no extraction yet;
    * a JSON string holding the filename, document type, extracted data,
      quotes and confidence.

    Args:
        ctx: chat tool context providing ``get_document`` and
            ``list_filenames``.

    Returns:
        The ``get_extraction`` function as wrapped by the ``tool`` decorator.
    """

    # NOTE(review): the inner docstring is kept verbatim — presumably the
    # `tool` decorator exposes it as the tool description; confirm before
    # rewording it.
    @tool
    def get_extraction(filename: str) -> str:
        """Fetch the structured extraction for a document by filename.

        For an invoice: line items, amounts, dates.
        For a contract: clauses, terms, validity dates.

        Args:
            filename: the document filename (e.g. 'invoice_001.pdf')
        """
        document = ctx.get_document(filename)

        # Guard 1: unknown filename — report what is available instead.
        if document is None:
            known_files = ctx.list_filenames()
            listing = known_files if known_files else 'no documents uploaded'
            return (
                f"Document not found: '{filename}'. "
                f"Available files: {listing}"
            )

        # Guard 2: the document exists but carries no extraction result.
        extraction = document.extracted
        if extraction is None:
            return f"'{filename}' has not been extracted yet (extracted=null)."

        # Fall back to "other" when the document has no (truthy) classification.
        classification = document.classification
        if classification:
            doc_type = classification.doc_type
        else:
            doc_type = "other"

        payload = {
            "file": filename,
            "doc_type": doc_type,
            "data": extraction.raw,
            "_quotes": extraction.quotes,
            "_confidence": extraction.confidence,
        }
        # default=str stringifies any value json cannot serialize natively.
        return json.dumps(payload, ensure_ascii=False, indent=2, default=str)

    return get_extraction
|
|