| """ |
| Integration Tests for Document Processing Pipeline |
| |
| Tests the full document processing workflow: |
| - OCR extraction |
| - Layout detection |
| - Reading order reconstruction |
| - Chunking |
| """ |
|
|
| import pytest |
| from pathlib import Path |
| from unittest.mock import Mock, patch, MagicMock |
| import numpy as np |
|
|
| |
@pytest.fixture
def sample_image():
    """Provide an all-zero uint8 array of shape (1000, 800, 3) as a test image."""
    rows, cols, channels = 1000, 800, 3
    return np.zeros(shape=(rows, cols, channels), dtype=np.uint8)
|
|
|
|
@pytest.fixture
def mock_ocr_result():
    """Build a successful page-0 OCRResult holding two "mock"-engine regions."""
    from src.document.ocr import OCRResult
    from src.document.schemas.core import OCRRegion, BoundingBox

    # First region: a short title line near the top of the page.
    title_region = OCRRegion(
        text="Sample Title",
        confidence=0.95,
        bbox=BoundingBox(x_min=100, y_min=50, x_max=700, y_max=100),
        page=0,
        engine="mock",
    )

    # Second region: a paragraph placed below the title.
    paragraph_region = OCRRegion(
        text="This is paragraph text that contains important information.",
        confidence=0.92,
        bbox=BoundingBox(x_min=100, y_min=150, x_max=700, y_max=250),
        page=0,
        engine="mock",
    )

    return OCRResult(
        success=True,
        regions=[title_region, paragraph_region],
        page_num=0,
        processing_time=0.5,
    )
|
|
|
|
class TestDocumentSchemas:
    """Test document schema models."""

    def test_bounding_box_creation(self):
        """Test BoundingBox creation and its derived geometry properties."""
        from src.document.schemas.core import BoundingBox

        bbox = BoundingBox(x_min=10, y_min=20, x_max=100, y_max=80)

        # 90 wide (100 - 10) by 60 tall (80 - 20).
        assert bbox.width == 90
        assert bbox.height == 60
        assert bbox.area == 5400
        assert bbox.center == (55.0, 50.0)

    def test_bounding_box_normalization(self):
        """Test that normalize() flags the box and keeps every coordinate in [0, 1]."""
        from src.document.schemas.core import BoundingBox

        bbox = BoundingBox(x_min=100, y_min=200, x_max=300, y_max=400)

        normalized = bbox.normalize(1000, 800)
        assert normalized.normalized is True

        # Check all four edges, not just one per axis, so a bad scale on
        # either the min or the max side of an axis is caught.
        for coordinate in (
            normalized.x_min,
            normalized.y_min,
            normalized.x_max,
            normalized.y_max,
        ):
            assert 0 <= coordinate <= 1

    def test_bounding_box_iou(self):
        """Test BoundingBox IoU for a partially overlapping and a disjoint pair."""
        from src.document.schemas.core import BoundingBox

        bbox1 = BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100)
        bbox2 = BoundingBox(x_min=50, y_min=50, x_max=150, y_max=150)
        bbox3 = BoundingBox(x_min=200, y_min=200, x_max=300, y_max=300)

        # bbox1 and bbox2 overlap on the 50..100 square: partial overlap.
        iou = bbox1.iou(bbox2)
        assert 0 < iou < 1

        # bbox3 starts at 200 on both axes, so it shares nothing with bbox1.
        iou = bbox1.iou(bbox3)
        assert iou == 0

    def test_ocr_region_creation(self):
        """Test that OCRRegion stores the text and confidence it is given."""
        from src.document.schemas.core import OCRRegion, BoundingBox

        region = OCRRegion(
            text="Sample text",
            confidence=0.95,
            bbox=BoundingBox(x_min=0, y_min=0, x_max=100, y_max=50),
            page=0,
            engine="paddleocr",
        )

        assert region.text == "Sample text"
        assert region.confidence == 0.95

    def test_document_chunk_creation(self):
        """Test that DocumentChunk stores the id and chunk type it is given."""
        from src.document.schemas.core import DocumentChunk, ChunkType, BoundingBox

        chunk = DocumentChunk(
            chunk_id="chunk_001",
            chunk_type=ChunkType.TEXT,
            text="Sample chunk text",
            bbox=BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100),
            page=0,
            document_id="doc_001",
            source_path="/path/to/doc.pdf",
            sequence_index=0,
            confidence=0.9,
        )

        assert chunk.chunk_id == "chunk_001"
        assert chunk.chunk_type == ChunkType.TEXT
|
|
|
|
class TestOCREngines:
    """Test OCR engine configuration and factory wiring."""

    def test_ocr_config_defaults(self):
        """Test OCRConfig default values."""
        from src.document.ocr import OCRConfig

        config = OCRConfig()
        assert config.engine == "paddleocr"
        assert config.language == "en"

    def test_ocr_factory_paddleocr(self):
        """Test that a PaddleOCR config can be built with the backend patched in."""
        from src.document.ocr import get_ocr_engine, OCRConfig

        with patch("src.document.ocr.paddle_ocr.PADDLEOCR_AVAILABLE", True):
            # create=True keeps the patch from raising AttributeError when the
            # module does not define PaddleOCR.
            # NOTE(review): presumably that happens when the optional
            # dependency is not installed - confirm against paddle_ocr.py.
            with patch("src.document.ocr.paddle_ocr.PaddleOCR", create=True):
                config = OCRConfig(engine="paddleocr")

                # The engine itself is deliberately not instantiated here;
                # only the configuration and the factory's presence are checked.
                assert config.engine == "paddleocr"
                assert callable(get_ocr_engine)

    def test_ocr_factory_tesseract(self):
        """Test that a Tesseract config can be built with the backend flagged available."""
        from src.document.ocr import get_ocr_engine, OCRConfig

        with patch("src.document.ocr.tesseract_ocr.TESSERACT_AVAILABLE", True):
            config = OCRConfig(engine="tesseract")

            # As above: check the config and the factory, without building
            # a real engine.
            assert config.engine == "tesseract"
            assert callable(get_ocr_engine)
|
|
|
|
class TestLayoutDetection:
    """Checks for layout detection configuration and layout type values."""

    def test_layout_config_defaults(self):
        """A LayoutConfig built with no arguments uses the "rule_based" method."""
        from src.document.layout import LayoutConfig

        default_config = LayoutConfig()
        detection_method = default_config.method
        assert detection_method == "rule_based"

    def test_layout_type_enum(self):
        """TEXT, TITLE and TABLE carry their lowercase string values."""
        from src.document.schemas.core import LayoutType

        expected_pairs = [
            (LayoutType.TEXT, "text"),
            (LayoutType.TITLE, "title"),
            (LayoutType.TABLE, "table"),
        ]
        for member, raw_value in expected_pairs:
            assert member.value == raw_value
|
|
|
|
class TestReadingOrder:
    """Checks for reading order reconstruction configuration."""

    def test_reading_order_config(self):
        """Default ReadingOrderConfig is rule-based and left-to-right ("ltr")."""
        from src.document.reading_order import ReadingOrderConfig

        defaults = ReadingOrderConfig()

        method = defaults.method
        assert method == "rule_based"

        direction = defaults.reading_direction
        assert direction == "ltr"
|
|
|
|
class TestChunking:
    """Checks for chunker configuration and chunker construction."""

    def test_chunker_config(self):
        """Default sizes are positive and the maximum is not below the target."""
        from src.document.chunking import ChunkerConfig

        defaults = ChunkerConfig()

        assert defaults.target_chunk_size > 0
        assert defaults.max_chunk_size >= defaults.target_chunk_size

    def test_semantic_chunker_creation(self):
        """A SemanticChunker exposes the config it was constructed with."""
        from src.document.chunking import SemanticChunker, ChunkerConfig

        requested_size = 256
        chunker = SemanticChunker(ChunkerConfig(target_chunk_size=requested_size))

        assert chunker.config.target_chunk_size == requested_size
|
|
|
|
class TestValidation:
    """Checks for the enums used by the validation components."""

    def test_validation_status_enum(self):
        """VALID, INVALID and ABSTAIN carry their lowercase string values."""
        from src.document.validation.critic import ValidationStatus

        expected_pairs = [
            (ValidationStatus.VALID, "valid"),
            (ValidationStatus.INVALID, "invalid"),
            (ValidationStatus.ABSTAIN, "abstain"),
        ]
        for member, raw_value in expected_pairs:
            assert member.value == raw_value

    def test_evidence_strength_enum(self):
        """STRONG and NONE carry their lowercase string values."""
        from src.document.validation.verifier import EvidenceStrength

        expected_pairs = [
            (EvidenceStrength.STRONG, "strong"),
            (EvidenceStrength.NONE, "none"),
        ]
        for member, raw_value in expected_pairs:
            assert member.value == raw_value
|
|
|
|
class TestPipelineIntegration:
    """Integration tests for full pipeline."""

    def test_pipeline_config_creation(self):
        """Test that PipelineConfig stores the render DPI and page limit it is given."""
        from src.document.pipeline import PipelineConfig
        from src.document.ocr import OCRConfig

        config = PipelineConfig(
            ocr=OCRConfig(engine="paddleocr"),
            render_dpi=300,
            max_pages=10,
        )

        assert config.render_dpi == 300
        assert config.max_pages == 10

    def test_processed_document_structure(self):
        """Test that ProcessedDocument exposes the metadata and chunks it is built from."""
        from src.document.schemas.core import (
            ProcessedDocument,
            DocumentMetadata,
            OCRRegion,
            LayoutRegion,
            DocumentChunk,
            ChunkType,
            BoundingBox,
        )
        from datetime import datetime, timezone

        # datetime.utcnow() is deprecated since Python 3.12. Take the current
        # UTC time from an aware clock and drop tzinfo, so the value passed to
        # the model is still a naive UTC datetime as before.
        processed_at = datetime.now(timezone.utc).replace(tzinfo=None)

        metadata = DocumentMetadata(
            document_id="test_doc",
            source_path="/path/to/doc.pdf",
            filename="doc.pdf",
            file_type="pdf",
            file_size_bytes=1000,
            num_pages=1,
            page_dimensions=[(800, 1000)],
            processed_at=processed_at,
            total_chunks=1,
            total_characters=100,
        )

        chunk = DocumentChunk(
            chunk_id="chunk_1",
            chunk_type=ChunkType.TEXT,
            text="Sample text",
            bbox=BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100),
            page=0,
            document_id="test_doc",
            source_path="/path/to/doc.pdf",
            sequence_index=0,
            confidence=0.9,
        )

        doc = ProcessedDocument(
            metadata=metadata,
            ocr_regions=[],
            layout_regions=[],
            chunks=[chunk],
            full_text="Sample text",
            status="completed",
        )

        assert doc.metadata.document_id == "test_doc"
        assert len(doc.chunks) == 1
        # The single stored chunk is the one that was supplied.
        assert doc.chunks[0].chunk_id == "chunk_1"
|
|