File size: 14,684 Bytes

d2a9d6e

#!/usr/bin/env python3
"""
Test script for the Document Re-enrichment Module.

Creates a sample DOCX with intentionally inconsistent formatting
(the exact problem the module solves), then tests:
1. Paragraph extraction
2. Chunking logic
3. Classification application (mocked LLM)
4. Output document formatting verification
5. Edge cases (empty, whitespace-only, single-paragraph, and 150-paragraph documents)

Run: python test_module.py
Requires: pip install python-docx
Does NOT require Ollama (LLM is mocked).
"""

import os
import sys
import logging
import json
from docx import Document
from docx.shared import Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH

from doc_enricher.handlers.docx_handler import DocxHandler
from doc_enricher.chunker import build_chunks, estimate_tokens
from doc_enricher.base_handler import ParagraphInfo

# Emit INFO-level records with a timestamp and level prefix; every test in
# this script reports its progress through this module-level logger.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger(__name__)


def create_problematic_docx(filepath: str):
    """
    Build the sample DOCX with deliberately misleading formatting and
    save it to ``filepath``.

    Every paragraph is added without an explicit style, and headings
    alternate with body text:
    - the title is explicitly non-bold at the body font size
    - most headings are 11pt and variously bold, non-bold, or underlined
    - one heading is bold 14pt, so the document mixes good and bad cases
    - one body paragraph opens with a bold run
    """
    body_size = 11

    # One entry per paragraph; each paragraph is a list of runs described as
    # (text, run attributes to assign explicitly, font size in points).
    # Attributes missing from the dict are never assigned at all, which is
    # not the same as assigning False.
    layout = [
        # TITLE: not bold, same size as body (a parser would miss it)
        [("Annual Infrastructure Modernization Report 2024", {"bold": False}, body_size)],
        # BODY: intro paragraph
        [(
            "This report provides a comprehensive overview of the infrastructure "
            "modernization initiatives undertaken during the fiscal year 2024. "
            "It covers budget allocation, project milestones, risk assessments, "
            "and recommendations for the upcoming fiscal year.",
            {},
            body_size,
        )],
        # SECTION HEADING: bold but same size as body
        [("Executive Summary", {"bold": True}, body_size)],
        # BODY
        [(
            "The organization successfully completed 85% of planned infrastructure "
            "upgrades. The total expenditure was $4.2 million, coming in 3% under "
            "the approved budget of $4.33 million.",
            {},
            body_size,
        )],
        # SECTION HEADING: not bold at all (a parser would miss it)
        [("Budget Allocation and Expenditure", {"bold": False}, body_size)],
        # BODY whose first run is bold (a parser might take it for a heading)
        [
            (
                "The total approved budget for FY2024 was $4.33 million. ",
                {"bold": True},
                body_size,
            ),
            (
                "This was distributed across four major project areas: "
                "network infrastructure (35%), server consolidation (25%), "
                "security upgrades (20%), and cloud migration (20%).",
                {},
                body_size,
            ),
        ],
        # SECTION HEADING: underlined but not bold
        [("Network Infrastructure Upgrades", {"bold": False, "underline": True}, body_size)],
        # BODY
        [(
            "The network team replaced all legacy switches across 12 branch "
            "offices, upgraded the core router at headquarters, and implemented "
            "SD-WAN connectivity for remote offices. The project was completed "
            "two weeks ahead of schedule.",
            {},
            body_size,
        )],
        # SECTION HEADING: properly formatted (to test mixed documents)
        [("Security Improvements", {"bold": True}, 14)],
        # BODY
        [(
            "Zero-trust architecture was deployed across all critical systems. "
            "Multi-factor authentication enrollment reached 98% of employees. "
            "The security operations center now monitors 24/7 with automated "
            "threat response capabilities.",
            {},
            body_size,
        )],
        # SECTION HEADING: plain text, no distinguishing formatting
        [("Recommendations for FY2025", {"bold": False}, body_size)],
        # BODY
        [(
            "Based on the outcomes of FY2024, the committee recommends: "
            "1) Increasing the cloud migration budget by 15%, "
            "2) Implementing AI-driven monitoring tools, "
            "3) Establishing a dedicated DevOps team, and "
            "4) Conducting quarterly security audits.",
            {},
            body_size,
        )],
    ]

    doc = Document()
    for run_specs in layout:
        paragraph = doc.add_paragraph()
        for text, attrs, size_pt in run_specs:
            run = paragraph.add_run(text)
            # Explicit attributes first, then the font size.
            for attr_name, value in attrs.items():
                setattr(run, attr_name, value)
            run.font.size = Pt(size_pt)

    doc.save(filepath)
    logger.info(f"Created problematic DOCX: {filepath}")


def test_extraction():
    """Extract paragraphs from the sample document and sanity-check the title.

    Logs one summary line per extracted paragraph, asserts that something
    was extracted and that the first paragraph is the (non-bold) title,
    then returns the paragraph list so later tests can reuse it.
    """
    banner = "=" * 60
    logger.info(banner)
    logger.info("TEST 1: Paragraph Extraction")
    logger.info(banner)

    paragraphs = DocxHandler().extract_paragraphs("test_data/sample.docx")

    logger.info(f"Extracted {len(paragraphs)} paragraphs:\n")
    for para in paragraphs:
        columns = (
            f"  [{para.index:2d}] bold={para.is_bold!s:5s}  ",
            f"size={para.avg_font_size_pt!s:6s}  ",
            f"style={para.style_name:15s}  ",
            f"len={para.text_length:3d}  ",
            f'text="{para.text[:60]}..."',
        )
        logger.info("".join(columns))

    assert len(paragraphs) > 0, "Should extract at least some paragraphs"

    # The title is expected at position 0 and must still carry the
    # formatting problem (not bold) that the module is meant to repair.
    first = paragraphs[0]
    assert "Annual Infrastructure" in first.text
    assert first.is_bold is not True, \
        "Title should NOT be bold (that's the formatting problem)"

    logger.info("\n✅ Extraction test passed!")
    return paragraphs


def test_chunking(paragraphs):
    """Check chunk counts, full coverage, and uniqueness of classify indices.

    Uses a generous token budget (expects one chunk), then a tight one
    (expects several chunks that together classify every paragraph exactly
    once), and finally sanity-checks the token estimator on a short string.
    """
    banner = "=" * 60
    logger.info("\n" + banner)
    logger.info("TEST 2: Chunking Logic")
    logger.info(banner)

    def paragraph_indices():
        return {para.index for para in paragraphs}

    # Generous budget: the whole document fits in a single chunk.
    chunks = build_chunks(paragraphs, max_tokens_per_chunk=5000, overlap=3)
    logger.info(f"  Small budget test: {len(chunks)} chunk(s)")
    assert len(chunks) == 1, "Small doc should fit in one chunk"
    assert set(chunks[0].classify_indices) == paragraph_indices()

    # Tight budget: the document must be split.
    chunks = build_chunks(paragraphs, max_tokens_per_chunk=200, overlap=2)
    logger.info(f"  Tight budget test: {len(chunks)} chunk(s)")
    assert len(chunks) > 1, "Tight budget should create multiple chunks"

    # Together the chunks must classify every paragraph...
    all_classify = set().union(*(chunk.classify_indices for chunk in chunks))
    expected = paragraph_indices()
    assert all_classify == expected, \
        f"All paragraphs must be classified. Missing: {expected - all_classify}"

    # ...and no paragraph may be owned by more than one chunk.
    seen = set()
    for idx in (i for chunk in chunks for i in chunk.classify_indices):
        assert idx not in seen, f"Paragraph {idx} classified by multiple chunks!"
        seen.add(idx)

    # Token estimation sanity check on a short sentence.
    estimated = estimate_tokens(
        "This is a sample paragraph with about thirty words in it to test."
    )
    assert 10 < estimated < 30, f"Token estimate {estimated} seems off for short text"

    logger.info("\n✅ Chunking test passed!")


def test_classification_application(paragraphs):
    """Apply mocked labels to the sample document and verify the output.

    Labels are derived from a simple text heuristic standing in for the
    LLM. The handler writes an enriched copy; the first run of each
    labelled paragraph in that copy is checked for the expected bold flag
    and font size, and the original file's title is checked to confirm it
    is still not bold.
    """
    source_path = "test_data/sample.docx"
    enriched_path = "test_data/sample_enriched.docx"

    banner = "=" * 60
    logger.info("\n" + banner)
    logger.info("TEST 3: Classification Application")
    logger.info(banner)

    def mock_label(text):
        # Stand-in for the LLM: the known title, then short text without a
        # trailing period counts as a heading, everything else as body.
        if "Annual Infrastructure" in text:
            return "TITLE"
        if len(text) < 40 and not text.endswith("."):
            return "SECTION_HEADING"
        return "BODY"

    mock_classifications = {}
    text_by_index = {}
    for source_para in paragraphs:
        mock_classifications[source_para.index] = mock_label(source_para.text)
        # Keep the first paragraph seen for each index for the log preview.
        text_by_index.setdefault(source_para.index, source_para.text)

    logger.info("  Mock classifications:")
    for idx, label in sorted(mock_classifications.items()):
        logger.info(f'    [{idx:2d}] {label:17s} → "{text_by_index[idx][:50]}..."')

    # Apply to document
    output_path = DocxHandler().apply_classifications(
        source_path,
        enriched_path,
        mock_classifications,
    )

    assert os.path.exists(output_path), "Output file should exist"

    # label -> (name used in assertion messages, expected bold, expected pt)
    expected_format = {
        "TITLE": ("Title", True, 20),
        "SECTION_HEADING": ("Heading", True, 14),
        "BODY": ("Body", False, 11),
    }

    logger.info("\n  Verifying enriched document formatting:")
    for i, para in enumerate(Document(output_path).paragraphs):
        label = mock_classifications.get(i)
        if label is None:
            continue

        text = para.text.strip()
        if not text:
            continue

        style = para.style.name if para.style else "None"
        bold = None
        size = None
        if para.runs:
            first_run = para.runs[0]
            bold = first_run.bold
            font_size = first_run.font.size
            size = font_size.pt if font_size else None

        logger.info(
            f"    [{i:2d}] label={label:17s}  style={style:15s}  "
            f"bold={bold!s:5s}  size={size!s:6s}  "
            f'text="{text[:40]}..."'
        )

        if label in expected_format:
            noun, want_bold, want_size = expected_format[label]
            bold_clause = "be bold" if want_bold else "NOT be bold"
            assert bold is want_bold, f"{noun} para {i} should {bold_clause}"
            assert size == want_size, \
                f"{noun} para {i} should be {want_size}pt, got {size}"

    # Verify original is unchanged
    original_title_runs = Document(source_path).paragraphs[0].runs
    if original_title_runs:
        assert original_title_runs[0].bold is not True, \
            "Original title should still NOT be bold (unchanged)"

    logger.info("\n✅ Classification application test passed!")
    logger.info(f"  Original: {source_path} (UNCHANGED)")
    logger.info(f"  Enriched: {output_path} (formatting corrected)")


def test_edge_cases():
    """Exercise extraction and chunking on degenerate and large documents.

    Four fixture documents are generated under ``test_data/``:
    - an empty document (expects 0 extracted paragraphs)
    - a document with only whitespace paragraphs (expects 0)
    - a single-paragraph document (expects 1 paragraph and 1 chunk)
    - a 150-paragraph document (expects multiple chunks that together
      classify every paragraph exactly once)

    Raises:
        AssertionError: if any expectation above is not met.
    """
    logger.info("\n" + "=" * 60)
    logger.info("TEST 4: Edge Cases")
    logger.info("=" * 60)

    # This test creates its own fixtures, so it must not depend on a caller
    # having created the output directory first; exist_ok makes this a
    # no-op when main() has already done so.
    os.makedirs("test_data", exist_ok=True)

    handler = DocxHandler()

    # Test: Empty document
    empty_path = "test_data/empty.docx"
    doc = Document()
    doc.save(empty_path)
    paragraphs = handler.extract_paragraphs(empty_path)
    assert len(paragraphs) == 0, "Empty doc should have 0 paragraphs"
    logger.info("  ✅ Empty document handled correctly")

    # Test: Document with only whitespace paragraphs
    ws_path = "test_data/whitespace.docx"
    doc = Document()
    doc.add_paragraph("   ")
    doc.add_paragraph("\t\n")
    doc.add_paragraph("")
    doc.save(ws_path)
    paragraphs = handler.extract_paragraphs(ws_path)
    assert len(paragraphs) == 0, "Whitespace-only doc should have 0 paragraphs"
    logger.info("  ✅ Whitespace-only document handled correctly")

    # Test: Single paragraph document
    single_path = "test_data/single.docx"
    doc = Document()
    doc.add_paragraph("Just one paragraph here.")
    doc.save(single_path)
    paragraphs = handler.extract_paragraphs(single_path)
    assert len(paragraphs) == 1
    chunks = build_chunks(paragraphs, max_tokens_per_chunk=3000)
    assert len(chunks) == 1
    assert chunks[0].classify_indices == [paragraphs[0].index]
    logger.info("  ✅ Single paragraph document handled correctly")

    # Test: Large document (simulate 100+ paragraphs)
    large_path = "test_data/large.docx"
    doc = Document()
    for i in range(150):
        if i % 15 == 0:
            # Every 15th paragraph is a short heading-like line.
            doc.add_paragraph(f"Section {i // 15 + 1}")
        else:
            doc.add_paragraph(
                f"This is body paragraph {i}. It contains enough text to be "
                f"realistic for token estimation purposes. Lorem ipsum dolor "
                f"sit amet, consectetur adipiscing elit paragraph {i}."
            )
    doc.save(large_path)
    paragraphs = handler.extract_paragraphs(large_path)
    assert len(paragraphs) == 150
    chunks = build_chunks(paragraphs, max_tokens_per_chunk=3000, overlap=3)
    logger.info(f"  Large doc (150 paras): {len(chunks)} chunks")
    assert len(chunks) > 1, "Large doc should require multiple chunks"

    # Verify complete coverage: each index is classified by exactly one
    # chunk, and the union of all chunks equals the full paragraph set.
    all_classify = set()
    for c in chunks:
        for idx in c.classify_indices:
            assert idx not in all_classify, f"Duplicate classify index {idx}"
            all_classify.add(idx)
    assert all_classify == {p.index for p in paragraphs}, "All paragraphs must be covered"
    logger.info(f"  ✅ Large document (150 paragraphs, {len(chunks)} chunks) handled correctly")

    logger.info("\n✅ All edge case tests passed!")


def main():
    """Create the sample document, run every test in order, and log a summary.

    The paragraphs returned by the extraction test are passed on to the
    chunking and classification tests.
    """
    os.makedirs("test_data", exist_ok=True)
    create_problematic_docx("test_data/sample.docx")

    extracted = test_extraction()
    test_chunking(extracted)
    test_classification_application(extracted)
    test_edge_cases()

    banner = "=" * 60
    summary_lines = (
        "\n" + banner,
        "ALL TESTS PASSED ✅",
        banner,
        "\nThe module correctly:",
        "  1. Extracts paragraphs with formatting metadata from DOCX",
        "  2. Chunks large documents with overlap for LLM context",
        "  3. Applies classifications to a COPY (original untouched)",
        "  4. Handles edge cases (empty, single para, 150+ paras)",
        "\nReady to use with Ollama. Run:",
        "  python -m doc_enricher.cli input.docx -o output.docx",
    )
    for line in summary_lines:
        logger.info(line)


# Run the full test sequence only when executed as a script, not on import.
if __name__ == "__main__":
    main()