| """Integration tests for classify, extract, and RAG-index in dummy mode.""" |
|
|
| from __future__ import annotations |
|
|
| import pytest |
|
|
| from graph.states.pipeline_state import IngestedDocument, PageContent |
| from nodes.extract._dummy_extractor import extract_dummy |
| from nodes.extract.quote_validator_node import quote_validator_node |
| from nodes.pipeline.classify_node import classify_node |
| from schemas import flatten_universal, load_schema, pydantic_for |
| from subgraphs.extract_subgraph import build_extract_subgraph |
|
|
|
|
| |
| |
| |
|
|
|
|
@pytest.mark.unit
def test_load_all_schemas():
    """Every one of the 6 document-type schemas loads and carries the meta keys."""
    all_doc_types = (
        "invoice",
        "delivery_note",
        "purchase_order",
        "contract",
        "financial_report",
        "other",
    )
    for doc_type in all_doc_types:
        schema = load_schema(doc_type)
        assert schema["type"] == "object"
        # both extraction-metadata fields must be mandatory in every schema
        for meta_field in ("_quotes", "_confidence"):
            assert meta_field in schema["required"]
|
|
|
|
@pytest.mark.unit
def test_pydantic_mirrors():
    """pydantic_for resolves each doc type to its mirror model; unknown types fall back."""
    from schemas.pydantic_models import (
        ContractModel,
        DeliveryNoteModel,
        FinancialReportModel,
        InvoiceModel,
        PurchaseOrderModel,
        UniversalModel,
    )

    expected_models = {
        "invoice": InvoiceModel,
        "delivery_note": DeliveryNoteModel,
        "purchase_order": PurchaseOrderModel,
        "contract": ContractModel,
        "financial_report": FinancialReportModel,
        "other": UniversalModel,
        # an unrecognized type must fall back to the universal model
        "unknown": UniversalModel,
    }
    for doc_type, model_cls in expected_models.items():
        assert pydantic_for(doc_type) is model_cls
|
|
|
|
@pytest.mark.unit
def test_invoice_pydantic_validation():
    """A well-formed invoice payload validates into an InvoiceModel."""
    from schemas.pydantic_models import InvoiceModel

    payload = {
        "invoice_number": "2026/001",
        "issuer": {"name": "Acme Inc.", "tax_id": "12-3456789"},
        "customer": {"name": "Beta LLC", "tax_id": "98-7654321"},
        "total_net": 20_000.00,
        "total_vat": 4_000.00,
        "total_gross": 24_000.00,
        "_quotes": ["Invoice number: 2026/001"],
        "_confidence": {"invoice_number": "high"},
    }
    inv = InvoiceModel.model_validate(payload)

    assert inv.invoice_number == "2026/001"
    assert inv.issuer is not None
    assert inv.issuer.tax_id == "12-3456789"
    assert inv.total_gross == 24_000.00
|
|
|
|
| |
| |
| |
|
|
|
|
@pytest.mark.unit
def test_dummy_extract_invoice():
    """The dummy extractor pulls key invoice fields out of plain invoice text."""
    # NOTE(review): 2026-02-29 is not a real calendar date (2026 is not a leap
    # year); the test only requires the literal string to round-trip.
    invoice_lines = [
        "INVOICE",
        "",
        "Invoice number: 2026/001",
        "Issue date: 2026-01-31",
        "Fulfillment date: 2026-01-30",
        "Payment due: 2026-02-29",
        "",
        "Issuer: AcmeSoft Inc.",
        "Tax ID: 12-3456789",
        "",
        "Customer: BudaData LLC",
        "Tax ID: 98-7654321",
        "",
        "Total net: $20,000.00",
        "Total VAT: $4,000.00",
        "Total gross: $24,000.00",
    ]
    out = extract_dummy("\n".join(invoice_lines) + "\n", "invoice", "invoice_january.pdf")

    assert out["invoice_number"] == "2026/001"
    assert out["issue_date"] == "2026-01-31"
    assert out["payment_due_date"] == "2026-02-29"
    assert len(out.get("_quotes", [])) > 0
|
|
|
|
@pytest.mark.unit
def test_dummy_extract_contract():
    """The dummy extractor pulls key contract fields out of NDA-style text."""
    text = (
        "Non-Disclosure Agreement (NDA)\n\n"
        "Parties: SmartSensors Inc. (tax id: 13-5792468) "
        "and InfoTech Ltd. (tax id: 86-4201357)\n\n"
        "Effective date: 2026-01-15\n"
        "Expiry date: 2027-01-15\n\n"
        "Penalty: A breach triggers a $50,000 penalty per incident.\n"
        "Governing law: State of Delaware, USA.\n"
    )
    out = extract_dummy(text, "contract", "nda_smartsensors.pdf")

    assert out["contract_type"] == "NDA"
    assert out.get("effective_date") == "2026-01-15"
    assert out.get("expiry_date") == "2027-01-15"
    assert out.get("confidentiality_clause") is True
    # Fix: the previous assertion's substring check
    # (`"delaware" in law.lower() or out.get("governing_law")`) was dead code —
    # a string containing "delaware" is non-empty, so the `or` fallback was
    # already truthy whenever the substring matched.  The effective condition
    # (a non-empty governing_law was extracted) is now stated directly.
    assert out.get("governing_law")
|
|
|
|
| |
| |
| |
|
|
|
|
@pytest.mark.unit
def test_flatten_universal_keeps_flat_dict_unchanged():
    """A typed-shape dict (no universal indicators) passes through untouched."""
    already_flat = {"invoice_number": "X", "_quotes": []}
    result = flatten_universal(already_flat, "invoice")
    assert result["invoice_number"] == "X"
|
|
|
|
@pytest.mark.unit
def test_flatten_universal_unfolds_nested():
    """Universal → flat: dates, amounts, and parties get unfolded."""
    nested = {
        "document_number": "X-001",
        "document_type": "contract",
        "dates": {"effective": "2026-01-01", "expiry": "2027-01-01"},
        "amounts": {
            "total_net": 100,
            "total_vat": 27,
            "total_gross": 127,
            "currency": "USD",
        },
        "parties": [
            {"name": "A Inc.", "role": "supplier", "tax_id": "11-1111111"},
            {"name": "B Corp.", "role": "customer", "tax_id": "22-2222222"},
        ],
        "_quotes": ["source1"],
        "_confidence": {"X": "high"},
    }
    flat = flatten_universal(nested, "contract")

    # document_number lands in invoice_number even for a contract doc_type
    assert flat["invoice_number"] == "X-001"
    assert flat["effective_date"] == "2026-01-01"
    assert flat["total_net"] == 100
    # supplier/customer roles map to issuer/customer respectively
    assert flat["issuer"]["name"] == "A Inc."
    assert flat["customer"]["name"] == "B Corp."
|
|
|
|
| |
| |
| |
|
|
|
|
@pytest.mark.integration
@pytest.mark.asyncio
async def test_classify_node_invoice():
    """classify_node labels an obvious invoice and reports a supported language."""
    invoice_text = "INVOICE\nInvoice number: X"
    ingested = IngestedDocument(
        file_name="invoice_january.pdf",
        file_type="pdf",
        pages=[PageContent(page_number=1, text=invoice_text)],
        full_text=invoice_text,
    )
    out = await classify_node({"ingested": ingested})

    assert "documents" in out
    classified = out["documents"][0]
    assert classified.classification.doc_type == "invoice"
    # detected language must be one of the supported codes
    assert classified.classification.language in ("en", "hu", "de")
|
|
|
|
@pytest.mark.integration
@pytest.mark.asyncio
async def test_extract_subgraph_invoice(sample_pdf_bytes):
    """End-to-end: ingest → classify → extract."""
    from subgraphs.ingest_subgraph import ingest_one_doc

    doc = await ingest_one_doc("invoice_test.pdf", sample_pdf_bytes)
    assert doc is not None

    cls_result = await classify_node({"ingested": doc.ingested})
    classification = cls_result["documents"][0].classification

    subgraph = build_extract_subgraph()
    ext_result = await subgraph.ainvoke({
        "ingested": doc.ingested,
        "classification": classification,
    })

    extracted_doc = ext_result["documents"][0]
    assert extracted_doc.extracted is not None
    # the sample PDF carries the canonical dummy invoice number
    assert extracted_doc.extracted.raw.get("invoice_number") == "2026/001"
|
|
|
|
| |
| |
| |
|
|
|
|
@pytest.mark.integration
@pytest.mark.asyncio
async def test_quote_validator_passes_valid_quotes():
    """Quotes that literally appear in the source text raise no risks."""
    from graph.states.pipeline_state import ExtractedData, ProcessedDocument

    source_text = "Invoice number: 2026/001 Penalty: $50,000"
    good_quotes = ["Invoice number: 2026/001", "Penalty: $50,000"]

    ingested = IngestedDocument(
        file_name="X.pdf",
        file_type="pdf",
        pages=[PageContent(page_number=1, text=source_text)],
        full_text=source_text,
    )
    extracted = ExtractedData(
        raw={"_quotes": list(good_quotes), "_confidence": {"X": "high"}},
        _quotes=list(good_quotes),
        _confidence={"X": "high"},
    )
    doc = ProcessedDocument(ingested=ingested, extracted=extracted)

    out = await quote_validator_node({"documents": [doc]})
    # the validator may omit the key entirely or return an empty list
    assert out.get("risks") in (None, [])
|
|
|
|
@pytest.mark.integration
@pytest.mark.asyncio
async def test_quote_validator_flags_invalid_quotes():
    """A quote absent from the source text yields a validation risk and lowers confidence."""
    from graph.states.pipeline_state import ExtractedData, ProcessedDocument

    source_text = "Just this short text is here."
    bogus_quotes = ["Hallucinated quote that is not in the source"]

    ingested = IngestedDocument(
        file_name="X.pdf",
        file_type="pdf",
        pages=[PageContent(page_number=1, text=source_text)],
        full_text=source_text,
    )
    extracted = ExtractedData(
        raw={"_quotes": list(bogus_quotes), "_confidence": {"X": "high"}},
        _quotes=list(bogus_quotes),
        _confidence={"X": "high"},
    )
    doc = ProcessedDocument(ingested=ingested, extracted=extracted)

    out = await quote_validator_node({"documents": [doc]})

    assert "risks" in out
    assert len(out["risks"]) == 1
    risk = out["risks"][0]
    assert risk.kind == "validation"
    assert risk.source_check_id == "quote_validator"

    # the confidence map should have been downgraded somewhere to "low"
    updated_doc = out["documents"][0]
    assert "low" in str(updated_doc.extracted.raw["_confidence"]).lower()
|
|
|
|
| |
| |
| |
|
|
|
|
@pytest.mark.integration
@pytest.mark.asyncio
async def test_rag_index_subgraph_indexes_chunks(tmp_path):
    """The rag_index_subgraph adds chunks to the HybridStore."""
    from store import HybridStore
    from subgraphs.rag_index_subgraph import build_rag_index_subgraph

    hybrid_store = HybridStore(
        chroma_path=str(tmp_path / "chroma"),
        collection_name="test_collection",
    )
    index_graph = build_rag_index_subgraph(hybrid_store)

    document = IngestedDocument(
        file_name="test.pdf",
        file_type="pdf",
        pages=[],
        full_text=(
            "This is the content of an English business document. "
            "It contains valuable information."
        ),
    )
    result = await index_graph.ainvoke({"ingested": document, "doc_type": "other"})

    # at least one chunk reported by the graph AND visible in the store
    assert result["chunks_indexed"] >= 1
    assert hybrid_store.chunk_count >= 1
|
|
|
|
@pytest.mark.integration
@pytest.mark.asyncio
async def test_hybrid_search_finds_indexed_chunks(tmp_path):
    """HybridStore.search_hybrid finds relevant chunks."""
    from store import HybridStore

    hybrid_store = HybridStore(
        chroma_path=str(tmp_path / "chroma_search"),
        collection_name="test_search",
    )

    invoice_chunk = {
        "text": "The March invoice gross total is $3,000.00 — a price increase pattern.",
        "metadata": {"source": "invoice_march.pdf", "chunk_index": 0, "doc_type": "invoice"},
    }
    contract_chunk = {
        "text": "The January contract has a $50,000 penalty for confidentiality breach.",
        "metadata": {"source": "nda_january.pdf", "chunk_index": 0, "doc_type": "contract"},
    }
    await hybrid_store.add_chunks([invoice_chunk, contract_chunk])

    hits = await hybrid_store.search_hybrid("penalty amount", top_k=2)

    assert len(hits) >= 1
    # the contract chunk mentioning the penalty should surface
    assert any("penalty" in hit["text"].lower() for hit in hits)
|
|