File size: 2,990 Bytes
7ff7119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
"""ingest_subgraph integration tests.

Exercises all three formats (PDF / DOCX / PNG). The nodes are async, so we
invoke via the compiled subgraph's ``ainvoke()``.
"""

from __future__ import annotations

import pytest

from subgraphs.ingest_subgraph import build_ingest_subgraph, ingest_one_doc


@pytest.mark.integration
@pytest.mark.asyncio
async def test_pdf_loader_via_subgraph(sample_pdf_bytes):
    """Ingest the sample invoice PDF and verify the extracted document.

    Checks the reported file name and type, that at least one page was
    produced, that the expected invoice markers appear in the full text,
    and that the document is not flagged as scanned.
    """
    outcome = await ingest_one_doc("test_invoice.pdf", sample_pdf_bytes)
    assert outcome is not None

    doc = outcome.ingested
    assert doc is not None

    assert doc.file_name == "test_invoice.pdf"
    assert doc.file_type == "pdf"
    assert len(doc.pages) >= 1

    # Strings the sample invoice is expected to contain after extraction.
    for marker in ("INVOICE", "AcmeSoft", "12-3456789"):
        assert marker in doc.full_text

    # Native text was sufficient, so the scanned flag must stay off.
    assert doc.is_scanned is False


@pytest.mark.integration
@pytest.mark.asyncio
async def test_docx_loader_via_subgraph(sample_docx_bytes):
    """Ingest the sample DOCX contract; DOCX is expected to be digital."""
    outcome = await ingest_one_doc("test_contract.docx", sample_docx_bytes)
    assert outcome is not None

    doc = outcome.ingested
    assert doc is not None

    assert doc.file_type == "docx"
    assert doc.is_scanned is False

    # Phrases the sample contract is expected to contain.
    extracted_text = doc.full_text
    assert "Non-Disclosure" in extracted_text
    assert "SmartSensors" in extracted_text


@pytest.mark.integration
@pytest.mark.asyncio
async def test_image_loader_vision_first(sample_png_bytes):
    """Ingest a PNG via the vision-first path and verify bytes are kept.

    The single resulting page must carry the original image bytes so the
    downstream vision-extract step has something to work on.
    """
    outcome = await ingest_one_doc("test_image.png", sample_png_bytes)
    assert outcome is not None

    doc = outcome.ingested
    assert doc is not None

    assert doc.file_type == "png"
    # Images are routed to the vision-extract path, hence flagged as scanned.
    assert doc.is_scanned is True
    assert len(doc.pages) == 1

    # The raw bytes must survive ingestion unchanged.
    first_page = doc.pages[0]
    assert first_page.image_bytes is not None
    assert first_page.image_bytes == sample_png_bytes


@pytest.mark.integration
@pytest.mark.asyncio
async def test_unknown_format_falls_back_to_txt():
    """An unrecognised suffix is handled best-effort by the txt loader."""
    outcome = await ingest_one_doc("strange.xyz", b"plain text content here")
    assert outcome is not None

    doc = outcome.ingested
    assert doc is not None

    # The ".xyz" input is expected to be reported as plain text.
    assert doc.file_type == "txt"
    assert "plain text content" in doc.full_text


@pytest.mark.integration
@pytest.mark.asyncio
async def test_subgraph_compiles_directly():
    """Build the ingest subgraph and invoke it directly via ``ainvoke()``.

    Feeds an empty ``.txt`` payload and checks that the resulting state has
    an ``ingested`` entry with empty ``full_text`` and no ``error``.
    """
    # Imported locally so this test does not depend on module-level imports;
    # replaces the inline ``__import__("datetime")`` call.
    from datetime import datetime

    graph = build_ingest_subgraph()

    # Empty input → txt-loader fallback to empty text
    initial_state = {
        "file_name": "empty.txt",
        "file_bytes": b"",
        "started_at": datetime.now(),
    }
    result = await graph.ainvoke(initial_state)

    assert result.get("ingested") is not None
    assert result["ingested"].full_text == ""
    assert result.get("error") is None