#!/usr/bin/env python3
"""
Test script for the Document Re-enrichment Module.

Creates a sample DOCX with intentionally inconsistent formatting
(the exact problem the module solves), then tests:
  1. Paragraph extraction
  2. Chunking logic
  3. Classification application (mocked LLM)
  4. Output document formatting verification

Run: python test_module.py
Requires: pip install python-docx
Does NOT require Ollama (LLM is mocked).
"""

import os
import sys
import logging
import json

from docx import Document
from docx.shared import Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH

from doc_enricher.handlers.docx_handler import DocxHandler
from doc_enricher.chunker import build_chunks, estimate_tokens
from doc_enricher.base_handler import ParagraphInfo

logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger(__name__)

# Locations of the generated fixtures. Forward slashes are used on purpose so
# that the paths echoed in the log output are the same on every platform.
TEST_DATA_DIR = "test_data"
SAMPLE_DOCX = f"{TEST_DATA_DIR}/sample.docx"
ENRICHED_DOCX = f"{TEST_DATA_DIR}/sample_enriched.docx"

# Font size (in points) shared by the body text and by the "badly formatted"
# headings of the sample document.
BODY_FONT_PT = 11


def _log_banner(title, *, leading_blank=True):
    """Log *title* framed by two 60-character rules.

    When *leading_blank* is true the first rule is prefixed with a newline so
    that consecutive test sections are visually separated in the log.
    """
    rule = "=" * 60
    logger.info(("\n" if leading_blank else "") + rule)
    logger.info(title)
    logger.info(rule)


def _add_run(paragraph, text, *, size_pt=BODY_FONT_PT, bold=None, underline=None):
    """Append a run containing *text* to *paragraph* and return it.

    ``bold`` / ``underline`` are only assigned when a value is given, so a run
    created without them keeps the attribute unset instead of explicitly off.
    The font size is always set, to ``size_pt`` points.
    """
    run = paragraph.add_run(text)
    if bold is not None:
        run.bold = bold
    if underline is not None:
        run.underline = underline
    run.font.size = Pt(size_pt)
    return run


def create_problematic_docx(filepath: str):
    """
    Create a DOCX that mimics the user's problem and save it to *filepath*:
      - The title is NOT bold and same font size as body
      - Section headings have inconsistent formatting
      - Some body text is bold (causing misclassification)
      - Some headings look like body text

    The parent directory of *filepath* must already exist.
    """
    doc = Document()

    # --- TITLE: Not bold, same size as body (parser would miss it) ---
    _add_run(
        doc.add_paragraph(),
        "Annual Infrastructure Modernization Report 2024",
        bold=False,  # Problem: title is NOT bold (and same size as body text)
    )

    # --- BODY: Intro paragraph ---
    _add_run(
        doc.add_paragraph(),
        "This report provides a comprehensive overview of the infrastructure "
        "modernization initiatives undertaken during the fiscal year 2024. "
        "It covers budget allocation, project milestones, risk assessments, "
        "and recommendations for the upcoming fiscal year.",
    )

    # --- SECTION HEADING: Bold but same size as body ---
    _add_run(doc.add_paragraph(), "Executive Summary", bold=True)

    # --- BODY ---
    _add_run(
        doc.add_paragraph(),
        "The organization successfully completed 85% of planned infrastructure "
        "upgrades. The total expenditure was $4.2 million, coming in 3% under "
        "the approved budget of $4.33 million.",
    )

    # --- SECTION HEADING: NOT bold at all (parser would miss it) ---
    _add_run(doc.add_paragraph(), "Budget Allocation and Expenditure", bold=False)

    # --- BODY with bold (parser might think it's a heading) ---
    mixed_body = doc.add_paragraph()
    _add_run(
        mixed_body,
        "The total approved budget for FY2024 was $4.33 million. ",
        bold=True,  # Problem: body text is bold
    )
    _add_run(
        mixed_body,
        "This was distributed across four major project areas: "
        "network infrastructure (35%), server consolidation (25%), "
        "security upgrades (20%), and cloud migration (20%).",
    )

    # --- SECTION HEADING: Underlined but not bold ---
    _add_run(
        doc.add_paragraph(),
        "Network Infrastructure Upgrades",
        bold=False,
        underline=True,  # Different formatting pattern
    )

    # --- BODY ---
    _add_run(
        doc.add_paragraph(),
        "The network team replaced all legacy switches across 12 branch "
        "offices, upgraded the core router at headquarters, and implemented "
        "SD-WAN connectivity for remote offices. The project was completed "
        "two weeks ahead of schedule.",
    )

    # --- SECTION HEADING: Has proper formatting (to test mixed docs) ---
    _add_run(doc.add_paragraph(), "Security Improvements", bold=True, size_pt=14)

    # --- BODY ---
    _add_run(
        doc.add_paragraph(),
        "Zero-trust architecture was deployed across all critical systems. "
        "Multi-factor authentication enrollment reached 98% of employees. "
        "The security operations center now monitors 24/7 with automated "
        "threat response capabilities.",
    )

    # --- SECTION HEADING: Plain text, no formatting ---
    _add_run(doc.add_paragraph(), "Recommendations for FY2025", bold=False)

    # --- BODY ---
    _add_run(
        doc.add_paragraph(),
        "Based on the outcomes of FY2024, the committee recommends: "
        "1) Increasing the cloud migration budget by 15%, "
        "2) Implementing AI-driven monitoring tools, "
        "3) Establishing a dedicated DevOps team, and "
        "4) Conducting quarterly security audits.",
    )

    doc.save(filepath)
    logger.info(f"Created problematic DOCX: {filepath}")


def test_extraction(docx_path=SAMPLE_DOCX):
    """Test that paragraphs are correctly extracted with metadata.

    Reads *docx_path* (which must already exist) through ``DocxHandler`` and
    returns whatever ``extract_paragraphs`` produced so later tests can reuse
    it.
    """
    _log_banner("TEST 1: Paragraph Extraction", leading_blank=False)

    paragraphs = DocxHandler().extract_paragraphs(docx_path)

    logger.info(f"Extracted {len(paragraphs)} paragraphs:\n")
    for p in paragraphs:
        logger.info(
            f" [{p.index:2d}] bold={str(p.is_bold):5s} "
            f"size={str(p.avg_font_size_pt):6s} "
            f"style={p.style_name:15s} "
            f"len={p.text_length:3d} "
            f"text=\"{p.text[:60]}...\""
        )

    assert len(paragraphs) > 0, "Should extract at least some paragraphs"

    # Check that the title paragraph was extracted (index 0)
    title_para = paragraphs[0]
    assert "Annual Infrastructure" in title_para.text

    # The title is NOT bold — this is the problem we're fixing
    assert title_para.is_bold is not True, \
        "Title should NOT be bold (that's the formatting problem)"

    logger.info("\n✅ Extraction test passed!")
    return paragraphs


def test_chunking(paragraphs):
    """Test that chunking works correctly for various document sizes.

    *paragraphs* is the sequence returned by ``test_extraction``; each item
    must expose an ``index`` attribute.
    """
    _log_banner("TEST 2: Chunking Logic")

    expected = {p.index for p in paragraphs}

    # Test 1: Small doc — should be a single chunk
    chunks = build_chunks(paragraphs, max_tokens_per_chunk=5000, overlap=3)
    logger.info(f" Small budget test: {len(chunks)} chunk(s)")
    assert len(chunks) == 1, "Small doc should fit in one chunk"
    assert set(chunks[0].classify_indices) == expected

    # Test 2: Very tight budget — force multiple chunks
    chunks = build_chunks(paragraphs, max_tokens_per_chunk=200, overlap=2)
    logger.info(f" Tight budget test: {len(chunks)} chunk(s)")
    assert len(chunks) > 1, "Tight budget should create multiple chunks"

    # Verify all paragraphs are covered
    all_classify = set()
    for chunk in chunks:
        all_classify.update(chunk.classify_indices)
    assert all_classify == expected, \
        f"All paragraphs must be classified. Missing: {expected - all_classify}"

    # Verify no duplicates in classify_indices across chunks
    seen = set()
    for chunk in chunks:
        for idx in chunk.classify_indices:
            assert idx not in seen, f"Paragraph {idx} classified by multiple chunks!"
            seen.add(idx)

    # Test 3: Token estimation
    sample_text = "This is a sample paragraph with about thirty words in it to test."
    estimated = estimate_tokens(sample_text)
    assert 10 < estimated < 30, f"Token estimate {estimated} seems off for short text"

    logger.info("\n✅ Chunking test passed!")


def test_classification_application(
    paragraphs,
    source_path=SAMPLE_DOCX,
    enriched_path=ENRICHED_DOCX,
):
    """Test applying mock classifications to the document.

    Builds a label per extracted paragraph with a simple heuristic (standing
    in for the LLM), asks ``DocxHandler.apply_classifications`` to write an
    enriched copy of *source_path* to *enriched_path*, then checks the first
    run of every labelled paragraph in the output and that the source file's
    title is still not bold.
    """
    _log_banner("TEST 3: Classification Application")

    # Mock classifications (what the LLM would produce)
    mock_classifications = {}
    for p in paragraphs:
        if "Annual Infrastructure" in p.text:
            label = "TITLE"
        elif len(p.text) < 40 and not p.text.endswith("."):
            # Short text without period → likely a heading
            label = "SECTION_HEADING"
        else:
            label = "BODY"
        mock_classifications[p.index] = label

    # One lookup table instead of re-scanning `paragraphs` for every label.
    text_by_index = {p.index: p.text for p in paragraphs}

    logger.info(" Mock classifications:")
    for idx, label in sorted(mock_classifications.items()):
        logger.info(f" [{idx:2d}] {label:17s} → \"{text_by_index[idx][:50]}...\"")

    # Apply to document
    handler = DocxHandler()
    output_path = handler.apply_classifications(
        source_path,
        enriched_path,
        mock_classifications,
    )

    assert os.path.exists(output_path), "Output file should exist"

    # Verify the output document formatting
    enriched = Document(output_path)
    logger.info("\n Verifying enriched document formatting:")

    # NOTE(review): this assumes the handler's paragraph `index` equals the
    # paragraph's position in `Document.paragraphs` — confirm against
    # DocxHandler.extract_paragraphs.
    for i, para in enumerate(enriched.paragraphs):
        if i not in mock_classifications:
            continue

        label = mock_classifications[i]
        text = para.text.strip()
        if not text:
            continue

        style = para.style.name if para.style else "None"

        # Only the first run is inspected.
        if para.runs:
            run = para.runs[0]
            bold = run.bold
            size = run.font.size.pt if run.font.size else None
        else:
            bold = None
            size = None

        logger.info(
            f" [{i:2d}] label={label:17s} style={style:15s} "
            f"bold={str(bold):5s} size={str(size):6s} "
            f"text=\"{text[:40]}...\""
        )

        if label == "TITLE":
            assert bold is True, f"Title para {i} should be bold"
            assert size == 20.0, f"Title para {i} should be 20pt, got {size}"
        elif label == "SECTION_HEADING":
            assert bold is True, f"Heading para {i} should be bold"
            assert size == 14.0, f"Heading para {i} should be 14pt, got {size}"
        elif label == "BODY":
            assert bold is False, f"Body para {i} should NOT be bold"
            assert size == 11.0, f"Body para {i} should be 11pt, got {size}"

    # Verify original is unchanged
    original = Document(source_path)
    orig_title = original.paragraphs[0]
    if orig_title.runs:
        assert orig_title.runs[0].bold is not True, \
            "Original title should still NOT be bold (unchanged)"

    logger.info("\n✅ Classification application test passed!")
    logger.info(f" Original: {source_path} (UNCHANGED)")
    logger.info(f" Enriched: {output_path} (formatting corrected)")


def test_edge_cases(data_dir=TEST_DATA_DIR):
    """Test edge cases: empty doc, single paragraph, etc.

    Fixture documents are written into *data_dir*, which is created if it
    does not exist so this test can run without ``main()`` having prepared
    the directory first.
    """
    _log_banner("TEST 4: Edge Cases")

    os.makedirs(data_dir, exist_ok=True)
    handler = DocxHandler()

    # Test: Empty document
    empty_path = f"{data_dir}/empty.docx"
    doc = Document()
    doc.save(empty_path)

    paragraphs = handler.extract_paragraphs(empty_path)
    assert len(paragraphs) == 0, "Empty doc should have 0 paragraphs"
    logger.info(" ✅ Empty document handled correctly")

    # Test: Document with only whitespace paragraphs
    ws_path = f"{data_dir}/whitespace.docx"
    doc = Document()
    doc.add_paragraph(" ")
    doc.add_paragraph("\t\n")
    doc.add_paragraph("")
    doc.save(ws_path)

    paragraphs = handler.extract_paragraphs(ws_path)
    assert len(paragraphs) == 0, "Whitespace-only doc should have 0 paragraphs"
    logger.info(" ✅ Whitespace-only document handled correctly")

    # Test: Single paragraph document
    single_path = f"{data_dir}/single.docx"
    doc = Document()
    doc.add_paragraph("Just one paragraph here.")
    doc.save(single_path)

    paragraphs = handler.extract_paragraphs(single_path)
    assert len(paragraphs) == 1
    chunks = build_chunks(paragraphs, max_tokens_per_chunk=3000)
    assert len(chunks) == 1
    assert chunks[0].classify_indices == [paragraphs[0].index]
    logger.info(" ✅ Single paragraph document handled correctly")

    # Test: Large document (simulate 100+ paragraphs)
    large_path = f"{data_dir}/large.docx"
    doc = Document()
    for i in range(150):
        if i % 15 == 0:
            # Every 15th paragraph is a short heading-like line.
            doc.add_paragraph(f"Section {i // 15 + 1}")
        else:
            doc.add_paragraph(
                f"This is body paragraph {i}. It contains enough text to be "
                f"realistic for token estimation purposes. Lorem ipsum dolor "
                f"sit amet, consectetur adipiscing elit paragraph {i}."
            )
    doc.save(large_path)

    paragraphs = handler.extract_paragraphs(large_path)
    assert len(paragraphs) == 150

    chunks = build_chunks(paragraphs, max_tokens_per_chunk=3000, overlap=3)
    logger.info(f" Large doc (150 paras): {len(chunks)} chunks")
    assert len(chunks) > 1, "Large doc should require multiple chunks"

    # Verify complete coverage
    all_classify = set()
    for chunk in chunks:
        for idx in chunk.classify_indices:
            assert idx not in all_classify, f"Duplicate classify index {idx}"
            all_classify.add(idx)
    assert all_classify == {p.index for p in paragraphs}, "All paragraphs must be covered"
    logger.info(f" ✅ Large document (150 paragraphs, {len(chunks)} chunks) handled correctly")

    logger.info("\n✅ All edge case tests passed!")


def main():
    """Generate the sample document and run every test in order.

    A failing assertion propagates as ``AssertionError`` and stops the run.
    """
    os.makedirs(TEST_DATA_DIR, exist_ok=True)
    create_problematic_docx(SAMPLE_DOCX)

    paragraphs = test_extraction()
    test_chunking(paragraphs)
    test_classification_application(paragraphs)
    test_edge_cases()

    _log_banner("ALL TESTS PASSED ✅")
    logger.info("\nThe module correctly:")
    logger.info(" 1. Extracts paragraphs with formatting metadata from DOCX")
    logger.info(" 2. Chunks large documents with overlap for LLM context")
    logger.info(" 3. Applies classifications to a COPY (original untouched)")
    logger.info(" 4. Handles edge cases (empty, single para, 150+ paras)")
    logger.info("\nReady to use with Ollama. Run:")
    logger.info(" python -m doc_enricher.cli input.docx -o output.docx")


if __name__ == "__main__":
    main()