"""
Test script for the Document Re-enrichment Module.

Creates a sample DOCX with intentionally inconsistent formatting
(the exact problem the module solves), then tests:
1. Paragraph extraction
2. Chunking logic
3. Classification application (mocked LLM)
4. Output document formatting verification

Run: python test_module.py
Requires: pip install python-docx
Does NOT require Ollama (LLM is mocked).
"""
|
|
| import os |
| import sys |
| import logging |
| import json |
| from docx import Document |
| from docx.shared import Pt, RGBColor |
| from docx.enum.text import WD_ALIGN_PARAGRAPH |
|
|
| from doc_enricher.handlers.docx_handler import DocxHandler |
| from doc_enricher.chunker import build_chunks, estimate_tokens |
| from doc_enricher.base_handler import ParagraphInfo |
|
|
# Configure root logging once for the whole test run (timestamped, INFO level),
# then grab this module's logger for all test output below.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger(__name__)
|
|
|
|
def create_problematic_docx(filepath: str):
    """
    Create a DOCX that mimics the user's problem:
    - The title is NOT bold and same font size as body
    - Section headings have inconsistent formatting
    - Some body text is bold (causing misclassification)
    - Some headings look like body text

    Args:
        filepath: Destination path for the generated .docx file.
    """

    def _styled_run(paragraph, text, *, bold=None, underline=None, size=11):
        """Append a run to *paragraph* with explicit (mis)formatting.

        bold/underline are assigned only when given, so unset attributes keep
        python-docx's inherited default (None), exactly as the original code
        left them.
        """
        run = paragraph.add_run(text)
        if bold is not None:
            run.bold = bold
        if underline is not None:
            run.underline = underline
        run.font.size = Pt(size)
        return run

    doc = Document()

    # Title: deliberately NOT bold and body-sized -- the core formatting problem.
    _styled_run(
        doc.add_paragraph(),
        "Annual Infrastructure Modernization Report 2024",
        bold=False,
    )

    # Intro body text.
    _styled_run(
        doc.add_paragraph(),
        "This report provides a comprehensive overview of the infrastructure "
        "modernization initiatives undertaken during the fiscal year 2024. "
        "It covers budget allocation, project milestones, risk assessments, "
        "and recommendations for the upcoming fiscal year.",
    )

    # Heading that is bold but only body-sized.
    _styled_run(doc.add_paragraph(), "Executive Summary", bold=True)

    _styled_run(
        doc.add_paragraph(),
        "The organization successfully completed 85% of planned infrastructure "
        "upgrades. The total expenditure was $4.2 million, coming in 3% under "
        "the approved budget of $4.33 million.",
    )

    # Heading that looks exactly like body text (not bold, 11pt).
    _styled_run(doc.add_paragraph(), "Budget Allocation and Expenditure", bold=False)

    # Body paragraph whose FIRST run is bold (classic misclassification bait).
    mixed_body = doc.add_paragraph()
    _styled_run(
        mixed_body,
        "The total approved budget for FY2024 was $4.33 million. ",
        bold=True,
    )
    _styled_run(
        mixed_body,
        "This was distributed across four major project areas: "
        "network infrastructure (35%), server consolidation (25%), "
        "security upgrades (20%), and cloud migration (20%).",
    )

    # Heading styled with underline instead of bold.
    _styled_run(
        doc.add_paragraph(),
        "Network Infrastructure Upgrades",
        bold=False,
        underline=True,
    )

    _styled_run(
        doc.add_paragraph(),
        "The network team replaced all legacy switches across 12 branch "
        "offices, upgraded the core router at headquarters, and implemented "
        "SD-WAN connectivity for remote offices. The project was completed "
        "two weeks ahead of schedule.",
    )

    # The one heading formatted "properly" (bold, larger size).
    _styled_run(doc.add_paragraph(), "Security Improvements", bold=True, size=14)

    _styled_run(
        doc.add_paragraph(),
        "Zero-trust architecture was deployed across all critical systems. "
        "Multi-factor authentication enrollment reached 98% of employees. "
        "The security operations center now monitors 24/7 with automated "
        "threat response capabilities.",
    )

    # Another heading indistinguishable from body text.
    _styled_run(doc.add_paragraph(), "Recommendations for FY2025", bold=False)

    _styled_run(
        doc.add_paragraph(),
        "Based on the outcomes of FY2024, the committee recommends: "
        "1) Increasing the cloud migration budget by 15%, "
        "2) Implementing AI-driven monitoring tools, "
        "3) Establishing a dedicated DevOps team, and "
        "4) Conducting quarterly security audits.",
    )

    doc.save(filepath)
    logger.info(f"Created problematic DOCX: {filepath}")
|
|
|
|
def test_extraction():
    """Verify paragraph extraction returns per-paragraph formatting metadata.

    Returns:
        The extracted ParagraphInfo list so downstream tests can reuse it.
    """
    banner = "=" * 60
    logger.info(banner)
    logger.info("TEST 1: Paragraph Extraction")
    logger.info(banner)

    extracted = DocxHandler().extract_paragraphs("test_data/sample.docx")

    logger.info(f"Extracted {len(extracted)} paragraphs:\n")
    for info in extracted:
        logger.info(
            f"  [{info.index:2d}] bold={str(info.is_bold):5s} "
            f"size={str(info.avg_font_size_pt):6s} "
            f"style={info.style_name:15s} "
            f"len={info.text_length:3d} "
            f"text=\"{info.text[:60]}...\""
        )

    assert extracted, "Should extract at least some paragraphs"

    # The first paragraph is the (deliberately mis-formatted) title.
    first = extracted[0]
    assert "Annual Infrastructure" in first.text
    assert first.is_bold is not True, (
        "Title should NOT be bold (that's the formatting problem)"
    )

    logger.info("\n✅ Extraction test passed!")
    return extracted
|
|
|
|
def test_chunking(paragraphs):
    """Test that chunking works correctly for various document sizes.

    Args:
        paragraphs: ParagraphInfo list extracted from the fixture document.

    Exercises three properties of build_chunks():
      * a generous token budget keeps a small document in a single chunk,
      * a tight budget splits it into multiple chunks,
      * every paragraph is classified by exactly one chunk (overlap
        paragraphs serve as context only).
    Also sanity-checks estimate_tokens() on a short sample sentence.
    """
    logger.info("\n" + "=" * 60)
    logger.info("TEST 2: Chunking Logic")
    logger.info("=" * 60)

    # Generous budget: the whole small doc must fit in one chunk.
    # (Label fixed: 5000 tokens is a generous budget, not a "small" one --
    # the 200-token case below is the tight-budget scenario.)
    chunks = build_chunks(paragraphs, max_tokens_per_chunk=5000, overlap=3)
    logger.info(f"  Generous budget test: {len(chunks)} chunk(s)")
    assert len(chunks) == 1, "Small doc should fit in one chunk"
    assert set(chunks[0].classify_indices) == {p.index for p in paragraphs}

    # Tight budget: must split into several chunks.
    chunks = build_chunks(paragraphs, max_tokens_per_chunk=200, overlap=2)
    logger.info(f"  Tight budget test: {len(chunks)} chunk(s)")
    assert len(chunks) > 1, "Tight budget should create multiple chunks"

    # Coverage and exclusivity in a single pass: each paragraph must be
    # classified by exactly one chunk, and all paragraphs must be covered.
    seen = set()
    for c in chunks:
        for idx in c.classify_indices:
            assert idx not in seen, f"Paragraph {idx} classified by multiple chunks!"
            seen.add(idx)
    expected = {p.index for p in paragraphs}
    assert seen == expected, \
        f"All paragraphs must be classified. Missing: {expected - seen}"

    # Token estimator sanity check on a ~13-word sentence.
    sample_text = "This is a sample paragraph with about thirty words in it to test."
    estimated = estimate_tokens(sample_text)
    assert 10 < estimated < 30, f"Token estimate {estimated} seems off for short text"

    logger.info("\n✅ Chunking test passed!")
|
|
|
def test_classification_application(paragraphs):
    """Test applying mock classifications to the document.

    Builds a heuristic label map (no LLM involved), asks the handler to
    write a re-formatted copy, then re-opens both files to verify:
    the enriched copy carries the expected formatting per label, and the
    original file is left untouched.

    Args:
        paragraphs: ParagraphInfo list extracted from test_data/sample.docx.
    """
    logger.info("\n" + "=" * 60)
    logger.info("TEST 3: Classification Application")
    logger.info("=" * 60)

    # Heuristic mock of the LLM: title by content match, short non-sentence
    # text as heading, everything else as body.
    mock_classifications = {}
    for p in paragraphs:
        if "Annual Infrastructure" in p.text:
            mock_classifications[p.index] = "TITLE"
        elif len(p.text) < 40 and not p.text.endswith("."):
            mock_classifications[p.index] = "SECTION_HEADING"
        else:
            mock_classifications[p.index] = "BODY"

    logger.info("  Mock classifications:")
    for idx, label in sorted(mock_classifications.items()):
        para = next(p for p in paragraphs if p.index == idx)
        logger.info(f"    [{idx:2d}] {label:17s} → \"{para.text[:50]}...\"")

    # apply_classifications writes a new file and returns its path;
    # the input file is not modified (verified at the end of this test).
    handler = DocxHandler()
    output_path = handler.apply_classifications(
        "test_data/sample.docx",
        "test_data/sample_enriched.docx",
        mock_classifications,
    )

    assert os.path.exists(output_path), "Output file should exist"

    enriched = Document(output_path)
    logger.info("\n  Verifying enriched document formatting:")

    # NOTE(review): this assumes ParagraphInfo.index lines up with the
    # enumerate() position over enriched.paragraphs -- i.e. the handler
    # indexes paragraphs by document position. TODO confirm against
    # DocxHandler.extract_paragraphs.
    for i, para in enumerate(enriched.paragraphs):
        if i not in mock_classifications:
            continue

        label = mock_classifications[i]
        text = para.text.strip()
        if not text:
            continue

        # Only the FIRST run is inspected; presumably the handler normalizes
        # formatting across all runs in a paragraph -- verify if runs differ.
        style = para.style.name if para.style else "None"
        if para.runs:
            run = para.runs[0]
            bold = run.bold
            size = run.font.size.pt if run.font.size else None
        else:
            bold = None
            size = None

        logger.info(
            f"    [{i:2d}] label={label:17s} style={style:15s} "
            f"bold={str(bold):5s} size={str(size):6s} "
            f"text=\"{text[:40]}...\""
        )

        # Expected output formatting per label: TITLE bold 20pt,
        # SECTION_HEADING bold 14pt, BODY non-bold 11pt.
        if label == "TITLE":
            assert bold is True, f"Title para {i} should be bold"
            assert size == 20.0, f"Title para {i} should be 20pt, got {size}"
        elif label == "SECTION_HEADING":
            assert bold is True, f"Heading para {i} should be bold"
            assert size == 14.0, f"Heading para {i} should be 14pt, got {size}"
        elif label == "BODY":
            assert bold is False, f"Body para {i} should NOT be bold"
            assert size == 11.0, f"Body para {i} should be 11pt, got {size}"

    # Re-open the ORIGINAL file and confirm its title is still un-bolded,
    # i.e. enrichment really worked on a copy.
    original = Document("test_data/sample.docx")
    orig_title = original.paragraphs[0]
    if orig_title.runs:
        assert orig_title.runs[0].bold is not True, \
            "Original title should still NOT be bold (unchanged)"

    logger.info("\n✅ Classification application test passed!")
    logger.info(f"  Original: test_data/sample.docx (UNCHANGED)")
    logger.info(f"  Enriched: {output_path} (formatting corrected)")
|
|
|
|
def test_edge_cases():
    """Test edge cases: empty doc, whitespace-only, single paragraph, large doc."""
    logger.info("\n" + "=" * 60)
    logger.info("TEST 4: Edge Cases")
    logger.info("=" * 60)

    handler = DocxHandler()

    def _make_docx(path, texts):
        # Build a docx at `path` containing one paragraph per entry of
        # `texts`, then return the handler's extraction of it.
        document = Document()
        for text in texts:
            document.add_paragraph(text)
        document.save(path)
        return handler.extract_paragraphs(path)

    # Completely empty document.
    extracted = _make_docx("test_data/empty.docx", [])
    assert len(extracted) == 0, "Empty doc should have 0 paragraphs"
    logger.info("  ✅ Empty document handled correctly")

    # Paragraphs containing only whitespace must be skipped.
    extracted = _make_docx("test_data/whitespace.docx", ["   ", "\t\n", ""])
    assert len(extracted) == 0, "Whitespace-only doc should have 0 paragraphs"
    logger.info("  ✅ Whitespace-only document handled correctly")

    # A single paragraph must survive extraction and fit in one chunk.
    extracted = _make_docx("test_data/single.docx", ["Just one paragraph here."])
    assert len(extracted) == 1
    chunks = build_chunks(extracted, max_tokens_per_chunk=3000)
    assert len(chunks) == 1
    assert chunks[0].classify_indices == [extracted[0].index]
    logger.info("  ✅ Single paragraph document handled correctly")

    # Large document: 150 paragraphs, one pseudo-heading every 15th.
    large_texts = []
    for i in range(150):
        if i % 15 == 0:
            large_texts.append(f"Section {i // 15 + 1}")
        else:
            large_texts.append(
                f"This is body paragraph {i}. It contains enough text to be "
                f"realistic for token estimation purposes. Lorem ipsum dolor "
                f"sit amet, consectetur adipiscing elit paragraph {i}."
            )
    extracted = _make_docx("test_data/large.docx", large_texts)
    assert len(extracted) == 150
    chunks = build_chunks(extracted, max_tokens_per_chunk=3000, overlap=3)
    logger.info(f"  Large doc (150 paras): {len(chunks)} chunks")
    assert len(chunks) > 1, "Large doc should require multiple chunks"

    # Every paragraph classified exactly once across all chunks.
    covered = set()
    for chunk in chunks:
        for idx in chunk.classify_indices:
            assert idx not in covered, f"Duplicate classify index {idx}"
            covered.add(idx)
    assert covered == {p.index for p in extracted}, "All paragraphs must be covered"
    logger.info(f"  ✅ Large document (150 paragraphs, {len(chunks)} chunks) handled correctly")

    logger.info("\n✅ All edge case tests passed!")
|
|
|
|
def main():
    """Build the fixture DOCX, run all four test phases, and print a summary."""
    os.makedirs("test_data", exist_ok=True)

    create_problematic_docx("test_data/sample.docx")

    # Test phases share the extraction result from phase 1.
    extracted = test_extraction()
    test_chunking(extracted)
    test_classification_application(extracted)
    test_edge_cases()

    banner = "=" * 60
    logger.info("\n" + banner)
    logger.info("ALL TESTS PASSED ✅")
    logger.info(banner)
    for line in (
        "\nThe module correctly:",
        "  1. Extracts paragraphs with formatting metadata from DOCX",
        "  2. Chunks large documents with overlap for LLM context",
        "  3. Applies classifications to a COPY (original untouched)",
        "  4. Handles edge cases (empty, single para, 150+ paras)",
        "\nReady to use with Ollama. Run:",
        "  python -m doc_enricher.cli input.docx -o output.docx",
    ):
        logger.info(line)
|
|
|
# Script entry point: build the fixture document and run all tests
# when executed directly (not on import).
if __name__ == "__main__":
    main()
|
|