Spaces:

Vincsipe
/

paperhawk

Running

paperhawk / tools /get_extraction.py

Nándorfi Vince

Initial paperhawk push to HF Space (LFS for binaries)

7ff7119 3 days ago

1.46 kB

	"""get_extraction tool — fetch a single document's extracted structured data."""

	from __future__ import annotations

	import json

	from langchain_core.tools import tool

	from tools.context import ChatToolContext


	def build_get_extraction_tool(ctx: ChatToolContext):
	    """Create the ``get_extraction`` tool bound to a chat context.

	    Args:
	        ctx: context used to look up a document by filename
	            (``get_document``) and to list the known filenames
	            (``list_filenames``).

	    Returns:
	        The ``@tool``-decorated ``get_extraction`` callable.
	    """

	    # NOTE: the inner function's name and docstring are left word-for-word;
	    # the ``@tool`` decorator exposes them as the tool's name/description.
	    @tool
	    def get_extraction(filename: str) -> str:
	        """Fetch the structured extraction for a document by filename.

	        For an invoice: line items, amounts, dates.
	        For a contract: clauses, terms, validity dates.

	        Args:
	            filename: the document filename (e.g. 'invoice_001.pdf')
	        """
	        doc = ctx.get_document(filename)

	        # Unknown filename: report it together with what is available.
	        if doc is None:
	            listing = ctx.list_filenames() or 'no documents uploaded'
	            return f"Document not found: '{filename}'. " f"Available files: {listing}"

	        # Known document without extraction results.
	        extracted = doc.extracted
	        if extracted is None:
	            return f"'{filename}' has not been extracted yet (extracted=null)."

	        # Fall back to "other" when the document carries no classification.
	        if doc.classification:
	            doc_type = doc.classification.doc_type
	        else:
	            doc_type = "other"

	        # Serialize the raw extraction plus its quotes and confidence values.
	        payload = {
	            "file": filename,
	            "doc_type": doc_type,
	            "data": extracted.raw,
	            "_quotes": extracted.quotes,
	            "_confidence": extracted.confidence,
	        }
	        return json.dumps(payload, ensure_ascii=False, indent=2, default=str)

	    return get_extraction