| """ingest_subgraph integration tests. |
| |
| Exercises all three formats (PDF / DOCX / PNG). The nodes are async, so we |
| invoke via the compiled subgraph's ``ainvoke()``. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import pytest |
|
|
| from subgraphs.ingest_subgraph import build_ingest_subgraph, ingest_one_doc |
|
|
|
|
| @pytest.mark.integration |
| @pytest.mark.asyncio |
| async def test_pdf_loader_via_subgraph(sample_pdf_bytes): |
| """Load a minimal English invoice PDF.""" |
| result = await ingest_one_doc("test_invoice.pdf", sample_pdf_bytes) |
|
|
| assert result is not None |
| assert result.ingested is not None |
|
|
| ing = result.ingested |
| assert ing.file_name == "test_invoice.pdf" |
| assert ing.file_type == "pdf" |
| assert len(ing.pages) >= 1 |
| assert "INVOICE" in ing.full_text |
| assert "AcmeSoft" in ing.full_text |
| assert "12-3456789" in ing.full_text |
| assert ing.is_scanned is False |
|
|
|
|
| @pytest.mark.integration |
| @pytest.mark.asyncio |
| async def test_docx_loader_via_subgraph(sample_docx_bytes): |
| """DOCX load (always digital).""" |
| result = await ingest_one_doc("test_contract.docx", sample_docx_bytes) |
|
|
| assert result is not None |
| assert result.ingested is not None |
|
|
| ing = result.ingested |
| assert ing.file_type == "docx" |
| assert ing.is_scanned is False |
| assert "Non-Disclosure" in ing.full_text |
| assert "SmartSensors" in ing.full_text |
|
|
|
|
| @pytest.mark.integration |
| @pytest.mark.asyncio |
| async def test_image_loader_vision_first(sample_png_bytes): |
| """PNG load via vision-first — image_bytes are always preserved.""" |
| result = await ingest_one_doc("test_image.png", sample_png_bytes) |
|
|
| assert result is not None |
| assert result.ingested is not None |
|
|
| ing = result.ingested |
| assert ing.file_type == "png" |
| assert ing.is_scanned is True |
| assert len(ing.pages) == 1 |
| |
| assert ing.pages[0].image_bytes is not None |
| assert ing.pages[0].image_bytes == sample_png_bytes |
|
|
|
|
| @pytest.mark.integration |
| @pytest.mark.asyncio |
| async def test_unknown_format_falls_back_to_txt(): |
| """Unknown suffix → txt loader (best-effort).""" |
| result = await ingest_one_doc("strange.xyz", b"plain text content here") |
| assert result is not None |
| assert result.ingested is not None |
| assert result.ingested.file_type == "txt" |
| assert "plain text content" in result.ingested.full_text |
|
|
|
|
| @pytest.mark.integration |
| @pytest.mark.asyncio |
| async def test_subgraph_compiles_directly(): |
| """The compiled subgraph can be invoked directly.""" |
| graph = build_ingest_subgraph() |
| |
| result = await graph.ainvoke({ |
| "file_name": "empty.txt", |
| "file_bytes": b"", |
| "started_at": __import__("datetime").datetime.now(), |
| }) |
| assert result.get("ingested") is not None |
| assert result["ingested"].full_text == "" |
| assert result.get("error") is None |
|
|