Spaces:

Vincsipe
/

paperhawk

Running

File size: 1,459 Bytes

7ff7119

"""get_extraction tool — fetch a single document's extracted structured data."""

from __future__ import annotations

import json

from langchain_core.tools import tool

from tools.context import ChatToolContext


def build_get_extraction_tool(ctx: ChatToolContext):
    """Build the ``get_extraction`` chat tool bound to *ctx*.

    The returned tool looks a document up by filename through ``ctx`` and
    answers with either a plain-text notice (unknown filename, or nothing
    extracted for it) or a JSON string carrying the extracted data.

    Args:
        ctx: document source; only ``get_document`` and ``list_filenames``
            are called on it here.

    Returns:
        The ``@tool``-decorated ``get_extraction`` callable.
    """

    # NOTE: the inner docstring is kept verbatim on purpose; the ``@tool``
    # decorator receives it together with the function, so treat it as
    # runtime text rather than as a free-form comment.
    @tool
    def get_extraction(filename: str) -> str:
        """Fetch the structured extraction for a document by filename.

        For an invoice: line items, amounts, dates.
        For a contract: clauses, terms, validity dates.

        Args:
            filename: the document filename (e.g. 'invoice_001.pdf')
        """
        document = ctx.get_document(filename)

        # Guard 1: unknown filename; report what could have been requested.
        if document is None:
            known_files = ctx.list_filenames()
            listing = known_files or 'no documents uploaded'
            return f"Document not found: '{filename}'. Available files: {listing}"

        # Guard 2: the document exists but carries no extraction result.
        extraction = document.extracted
        if extraction is None:
            return f"'{filename}' has not been extracted yet (extracted=null)."

        # A missing (falsy) classification is reported as the "other" type.
        classification = document.classification
        if classification:
            doc_type = classification.doc_type
        else:
            doc_type = "other"

        # Full extraction payload: raw data plus its quotes and confidence.
        payload = {
            "file": filename,
            "doc_type": doc_type,
            "data": extraction.raw,
            "_quotes": extraction.quotes,
            "_confidence": extraction.confidence,
        }
        # default=str: values json cannot encode natively are emitted as
        # their str() form instead of raising.
        return json.dumps(payload, ensure_ascii=False, indent=2, default=str)

    return get_extraction