File size: 2,990 Bytes
"""ingest_subgraph integration tests.
Exercises all three formats (PDF / DOCX / PNG). The nodes are async, so we
invoke via the compiled subgraph's ``ainvoke()``.
"""
from __future__ import annotations
import pytest
from subgraphs.ingest_subgraph import build_ingest_subgraph, ingest_one_doc
@pytest.mark.integration
@pytest.mark.asyncio
async def test_pdf_loader_via_subgraph(sample_pdf_bytes):
    """Ingest a minimal English invoice PDF and verify the resulting document.

    Checks the recorded file name and type, that at least one page was
    produced, that the expected invoice markers appear in the full text,
    and that the document is not flagged as scanned.
    """
    outcome = await ingest_one_doc("test_invoice.pdf", sample_pdf_bytes)
    assert outcome is not None
    assert outcome.ingested is not None

    doc = outcome.ingested
    assert doc.file_name == "test_invoice.pdf"
    assert doc.file_type == "pdf"
    assert len(doc.pages) >= 1

    # Each marker is expected somewhere in the extracted invoice text.
    for marker in ("INVOICE", "AcmeSoft", "12-3456789"):
        assert marker in doc.full_text

    # Native text was sufficient, so the scanned flag must be off.
    assert doc.is_scanned is False
@pytest.mark.integration
@pytest.mark.asyncio
async def test_docx_loader_via_subgraph(sample_docx_bytes):
    """Ingest a DOCX contract fixture; DOCX input is always treated as digital.

    Verifies the reported file type, that the scanned flag is off, and that
    the expected contract phrases appear in the extracted text.
    """
    outcome = await ingest_one_doc("test_contract.docx", sample_docx_bytes)
    assert outcome is not None
    assert outcome.ingested is not None

    doc = outcome.ingested
    assert doc.file_type == "docx"
    assert doc.is_scanned is False

    # Phrases expected in the contract fixture's text.
    for phrase in ("Non-Disclosure", "SmartSensors"):
        assert phrase in doc.full_text
@pytest.mark.integration
@pytest.mark.asyncio
async def test_image_loader_vision_first(sample_png_bytes):
    """Ingest a PNG via the vision-first path and check the bytes are preserved.

    The image must be reported as a single scanned page whose
    ``image_bytes`` equal the bytes that were passed in.
    """
    outcome = await ingest_one_doc("test_image.png", sample_png_bytes)
    assert outcome is not None
    assert outcome.ingested is not None

    doc = outcome.ingested
    assert doc.file_type == "png"
    # Images are routed to the vision-extract path.
    assert doc.is_scanned is True
    assert len(doc.pages) == 1

    # The raw bytes must be retained for the downstream vision-extract step.
    first_page = doc.pages[0]
    assert first_page.image_bytes is not None
    assert first_page.image_bytes == sample_png_bytes
@pytest.mark.integration
@pytest.mark.asyncio
async def test_unknown_format_falls_back_to_txt():
    """Check that an unrecognised suffix is handled by the txt loader (best-effort)."""
    outcome = await ingest_one_doc("strange.xyz", b"plain text content here")
    assert outcome is not None
    assert outcome.ingested is not None

    doc = outcome.ingested
    assert doc.file_type == "txt"
    assert "plain text content" in doc.full_text
@pytest.mark.integration
@pytest.mark.asyncio
async def test_subgraph_compiles_directly():
    """Build the ingest subgraph and invoke it directly with ``ainvoke()``.

    Feeds an empty ``.txt`` payload and expects an ingested document with
    empty ``full_text`` and no ``error`` entry in the resulting state.
    """
    # Function-scope import replaces the inline ``__import__("datetime")``
    # call; the timestamp is still a naive ``datetime.now()``.
    from datetime import datetime

    graph = build_ingest_subgraph()

    # Empty input → txt-loader fallback to empty text
    initial_state = {
        "file_name": "empty.txt",
        "file_bytes": b"",
        "started_at": datetime.now(),
    }
    result = await graph.ainvoke(initial_state)

    assert result.get("ingested") is not None
    assert result["ingested"].full_text == ""
    assert result.get("error") is None
|