# doc-enricher / test_module.py
# dwijverma2's picture
# Add test suite
# d2a9d6e verified
#!/usr/bin/env python3
"""
Test script for the Document Re-enrichment Module.
Creates a sample DOCX with intentionally inconsistent formatting
(the exact problem the module solves), then tests:
1. Paragraph extraction
2. Chunking logic
3. Classification application (mocked LLM)
4. Output document formatting verification
Run: python test_module.py
Requires: pip install python-docx
Does NOT require Ollama (LLM is mocked).
"""
import os
import sys
import logging
import json
from docx import Document
from docx.shared import Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from doc_enricher.handlers.docx_handler import DocxHandler
from doc_enricher.chunker import build_chunks, estimate_tokens
from doc_enricher.base_handler import ParagraphInfo
# Emit timestamped, level-tagged records at INFO and above for the whole run.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
# Module-level logger shared by the fixture builder and every test below.
logger = logging.getLogger(__name__)
def _add_fixture_run(paragraph, text, size_pt=11, bold=None, underline=None):
    """
    Append *text* to *paragraph* as a single run and return that run.
    ``bold`` and ``underline`` are assigned only when they are not None, so a
    run the fixture leaves unformatted never has those attributes touched.
    The font size is always assigned.
    """
    run = paragraph.add_run(text)
    if bold is not None:
        run.bold = bold
    if underline is not None:
        run.underline = underline
    run.font.size = Pt(size_pt)
    return run


def create_problematic_docx(filepath: str):
    """
    Write a DOCX to *filepath* whose visual formatting contradicts its structure:
    - the title is not bold and uses the body font size
    - section headings are formatted inconsistently (bold, plain, underlined,
      and one properly sized at 14pt)
    - one body paragraph opens with a bold run
    The document is saved with ``Document.save`` and the path is logged.
    """
    # One entry per paragraph, in document order. Each paragraph is a list of
    # (text, formatting-kwargs) pairs, one pair per run.
    outline = [
        # TITLE: explicitly not bold, same size as body (a parser would miss it)
        [("Annual Infrastructure Modernization Report 2024", {"bold": False})],
        # BODY: intro paragraph
        [(
            "This report provides a comprehensive overview of the infrastructure "
            "modernization initiatives undertaken during the fiscal year 2024. "
            "It covers budget allocation, project milestones, risk assessments, "
            "and recommendations for the upcoming fiscal year.",
            {},
        )],
        # SECTION HEADING: bold but same size as body
        [("Executive Summary", {"bold": True})],
        # BODY
        [(
            "The organization successfully completed 85% of planned infrastructure "
            "upgrades. The total expenditure was $4.2 million, coming in 3% under "
            "the approved budget of $4.33 million.",
            {},
        )],
        # SECTION HEADING: not bold at all (a parser would miss it)
        [("Budget Allocation and Expenditure", {"bold": False})],
        # BODY whose first run is bold (a parser might take it for a heading)
        [
            (
                "The total approved budget for FY2024 was $4.33 million. ",
                {"bold": True},
            ),
            (
                "This was distributed across four major project areas: "
                "network infrastructure (35%), server consolidation (25%), "
                "security upgrades (20%), and cloud migration (20%).",
                {},
            ),
        ],
        # SECTION HEADING: underlined but not bold
        [("Network Infrastructure Upgrades", {"bold": False, "underline": True})],
        # BODY
        [(
            "The network team replaced all legacy switches across 12 branch "
            "offices, upgraded the core router at headquarters, and implemented "
            "SD-WAN connectivity for remote offices. The project was completed "
            "two weeks ahead of schedule.",
            {},
        )],
        # SECTION HEADING: properly formatted (bold, 14pt) to make the doc mixed
        [("Security Improvements", {"bold": True, "size_pt": 14})],
        # BODY
        [(
            "Zero-trust architecture was deployed across all critical systems. "
            "Multi-factor authentication enrollment reached 98% of employees. "
            "The security operations center now monitors 24/7 with automated "
            "threat response capabilities.",
            {},
        )],
        # SECTION HEADING: plain text, no distinguishing formatting
        [("Recommendations for FY2025", {"bold": False})],
        # BODY
        [(
            "Based on the outcomes of FY2024, the committee recommends: "
            "1) Increasing the cloud migration budget by 15%, "
            "2) Implementing AI-driven monitoring tools, "
            "3) Establishing a dedicated DevOps team, and "
            "4) Conducting quarterly security audits.",
            {},
        )],
    ]

    doc = Document()
    for run_specs in outline:
        paragraph = doc.add_paragraph()
        for text, formatting in run_specs:
            _add_fixture_run(paragraph, text, **formatting)

    doc.save(filepath)
    logger.info(f"Created problematic DOCX: {filepath}")
def test_extraction():
    """
    Extract paragraphs from test_data/sample.docx, log one summary line per
    paragraph, and check that the first one is the (non-bold) title.
    Returns the extracted paragraph list so later tests can reuse it.
    """
    banner = "=" * 60
    for line in (banner, "TEST 1: Paragraph Extraction", banner):
        logger.info(line)

    extracted = DocxHandler().extract_paragraphs("test_data/sample.docx")
    logger.info(f"Extracted {len(extracted)} paragraphs:\n")

    for para in extracted:
        # Build the row from its columns, then emit it as a single record.
        columns = (
            f" [{para.index:2d}] bold={str(para.is_bold):5s} ",
            f"size={str(para.avg_font_size_pt):6s} ",
            f"style={para.style_name:15s} ",
            f"len={para.text_length:3d} ",
            f"text=\"{para.text[:60]}...\"",
        )
        logger.info("".join(columns))

    assert len(extracted) >= 1, "Should extract at least some paragraphs"

    # The fixture's first paragraph is the title.
    first = extracted[0]
    assert "Annual Infrastructure" in first.text
    # The title being non-bold is exactly the formatting problem under test.
    assert not (first.is_bold is True), \
        "Title should NOT be bold (that's the formatting problem)"

    logger.info("\n✅ Extraction test passed!")
    return extracted
def test_chunking(paragraphs):
    """
    Check build_chunks under a generous and a tight token budget, plus a
    sanity range for estimate_tokens.
    *paragraphs* is the list returned by test_extraction; each item must
    expose an ``index`` attribute.
    """
    logger.info("\n" + "=" * 60)
    logger.info("TEST 2: Chunking Logic")
    logger.info("=" * 60)

    # Generous budget: the whole document must land in a single chunk.
    roomy = build_chunks(paragraphs, max_tokens_per_chunk=5000, overlap=3)
    logger.info(f" Small budget test: {len(roomy)} chunk(s)")
    assert len(roomy) == 1, "Small doc should fit in one chunk"
    assert set(roomy[0].classify_indices) == {p.index for p in paragraphs}

    # Tight budget: the document must be split.
    tight = build_chunks(paragraphs, max_tokens_per_chunk=200, overlap=2)
    logger.info(f" Tight budget test: {len(tight)} chunk(s)")
    assert len(tight) > 1, "Tight budget should create multiple chunks"

    # Coverage: the union of classify_indices must equal the paragraph indices.
    covered = set().union(*(chunk.classify_indices for chunk in tight))
    expected = {p.index for p in paragraphs}
    assert covered == expected, \
        f"All paragraphs must be classified. Missing: {expected - covered}"

    # Exclusivity: no index may be owned by more than one chunk.
    claimed = set()
    for chunk in tight:
        for index in chunk.classify_indices:
            assert index not in claimed, f"Paragraph {index} classified by multiple chunks!"
            claimed.add(index)

    # Token estimate for one short sentence must fall in a plausible range.
    sample_text = "This is a sample paragraph with about thirty words in it to test."
    estimated = estimate_tokens(sample_text)
    assert estimated > 10 and estimated < 30, f"Token estimate {estimated} seems off for short text"

    logger.info("\n✅ Chunking test passed!")
def test_classification_application(paragraphs):
    """
    Apply mocked LLM labels to test_data/sample.docx and verify the output.

    Labels are derived from the paragraph text with a simple heuristic
    (title by content, short text without a trailing period as a heading,
    everything else as body), written to test_data/sample_enriched.docx via
    DocxHandler.apply_classifications, and the first run of every classified
    paragraph is checked for the expected bold flag and font size. Finally the
    source document is re-opened to confirm its title is still not bold.

    *paragraphs* is the list returned by test_extraction; each item must
    expose ``index`` and ``text``.
    """
    logger.info("\n" + "=" * 60)
    logger.info("TEST 3: Classification Application")
    logger.info("=" * 60)

    source_path = "test_data/sample.docx"

    # Mock classifications (what the LLM would produce)
    mock_classifications = {}
    for p in paragraphs:
        if "Annual Infrastructure" in p.text:
            mock_classifications[p.index] = "TITLE"
        elif len(p.text) < 40 and not p.text.endswith("."):
            # Short text without period → likely a heading
            mock_classifications[p.index] = "SECTION_HEADING"
        else:
            mock_classifications[p.index] = "BODY"

    # Index the paragraphs once so the log loop is a lookup, not a scan per label.
    by_index = {p.index: p for p in paragraphs}
    logger.info(" Mock classifications:")
    for idx, label in sorted(mock_classifications.items()):
        logger.info(f" [{idx:2d}] {label:17s} → \"{by_index[idx].text[:50]}...\"")

    # Apply to document
    handler = DocxHandler()
    output_path = handler.apply_classifications(
        source_path,
        "test_data/sample_enriched.docx",
        mock_classifications,
    )
    assert os.path.exists(output_path), "Output file should exist"

    # Verify the output document formatting.
    # NOTE(review): this matches labels to paragraphs by position in
    # Document.paragraphs, i.e. it assumes ParagraphInfo.index is that
    # position — confirm against DocxHandler.
    enriched = Document(output_path)
    logger.info("\n Verifying enriched document formatting:")
    verified = 0
    for i, para in enumerate(enriched.paragraphs):
        if i not in mock_classifications:
            continue
        label = mock_classifications[i]
        text = para.text.strip()
        if not text:
            continue
        style = para.style.name if para.style else "None"
        if para.runs:
            run = para.runs[0]
            bold = run.bold
            size = run.font.size.pt if run.font.size else None
        else:
            bold = None
            size = None
        logger.info(
            f" [{i:2d}] label={label:17s} style={style:15s} "
            f"bold={str(bold):5s} size={str(size):6s} "
            f"text=\"{text[:40]}...\""
        )
        if label == "TITLE":
            assert bold is True, f"Title para {i} should be bold"
            assert size == 20.0, f"Title para {i} should be 20pt, got {size}"
        elif label == "SECTION_HEADING":
            assert bold is True, f"Heading para {i} should be bold"
            assert size == 14.0, f"Heading para {i} should be 14pt, got {size}"
        elif label == "BODY":
            assert bold is False, f"Body para {i} should NOT be bold"
            assert size == 11.0, f"Body para {i} should be 11pt, got {size}"
        verified += 1

    # Guard against a vacuous pass: if no enriched paragraph lined up with a
    # classified index, the loop above asserted nothing at all.
    assert verified or not mock_classifications, \
        "No classified paragraph was verified in the enriched document"

    # Verify original is unchanged
    original = Document(source_path)
    orig_title = original.paragraphs[0]
    if orig_title.runs:
        assert orig_title.runs[0].bold is not True, \
            "Original title should still NOT be bold (unchanged)"

    logger.info("\n✅ Classification application test passed!")
    logger.info(f" Original: {source_path} (UNCHANGED)")
    logger.info(f" Enriched: {output_path} (formatting corrected)")
def test_edge_cases():
    """
    Exercise extraction and chunking on degenerate and oversized documents.

    Writes four throw-away DOCX files under test_data/ (empty,
    whitespace-only, single-paragraph, and 150-paragraph) and checks the
    paragraph counts returned by DocxHandler.extract_paragraphs and the
    chunk coverage produced by build_chunks.
    """
    logger.info("\n" + "=" * 60)
    logger.info("TEST 4: Edge Cases")
    logger.info("=" * 60)

    # Every fixture below is saved into test_data/. Create it here so this
    # test also works when invoked on its own instead of through main().
    os.makedirs("test_data", exist_ok=True)

    handler = DocxHandler()

    # Test: Empty document
    empty_path = "test_data/empty.docx"
    doc = Document()
    doc.save(empty_path)
    paragraphs = handler.extract_paragraphs(empty_path)
    assert len(paragraphs) == 0, "Empty doc should have 0 paragraphs"
    logger.info(" ✅ Empty document handled correctly")

    # Test: Document with only whitespace paragraphs
    ws_path = "test_data/whitespace.docx"
    doc = Document()
    doc.add_paragraph(" ")
    doc.add_paragraph("\t\n")
    doc.add_paragraph("")
    doc.save(ws_path)
    paragraphs = handler.extract_paragraphs(ws_path)
    assert len(paragraphs) == 0, "Whitespace-only doc should have 0 paragraphs"
    logger.info(" ✅ Whitespace-only document handled correctly")

    # Test: Single paragraph document
    single_path = "test_data/single.docx"
    doc = Document()
    doc.add_paragraph("Just one paragraph here.")
    doc.save(single_path)
    paragraphs = handler.extract_paragraphs(single_path)
    assert len(paragraphs) == 1
    chunks = build_chunks(paragraphs, max_tokens_per_chunk=3000)
    assert len(chunks) == 1
    assert chunks[0].classify_indices == [paragraphs[0].index]
    logger.info(" ✅ Single paragraph document handled correctly")

    # Test: Large document (simulate 100+ paragraphs)
    large_path = "test_data/large.docx"
    doc = Document()
    for i in range(150):
        if i % 15 == 0:
            # Every 15th paragraph is a short heading-like line.
            doc.add_paragraph(f"Section {i // 15 + 1}")
        else:
            doc.add_paragraph(
                f"This is body paragraph {i}. It contains enough text to be "
                f"realistic for token estimation purposes. Lorem ipsum dolor "
                f"sit amet, consectetur adipiscing elit paragraph {i}."
            )
    doc.save(large_path)
    paragraphs = handler.extract_paragraphs(large_path)
    assert len(paragraphs) == 150
    chunks = build_chunks(paragraphs, max_tokens_per_chunk=3000, overlap=3)
    logger.info(f" Large doc (150 paras): {len(chunks)} chunks")
    assert len(chunks) > 1, "Large doc should require multiple chunks"

    # Verify complete coverage: each index owned by exactly one chunk.
    all_classify = set()
    for c in chunks:
        for idx in c.classify_indices:
            assert idx not in all_classify, f"Duplicate classify index {idx}"
            all_classify.add(idx)
    assert all_classify == {p.index for p in paragraphs}, "All paragraphs must be covered"
    logger.info(f" ✅ Large document (150 paragraphs, {len(chunks)} chunks) handled correctly")

    logger.info("\n✅ All edge case tests passed!")
def main():
    """
    Build the sample fixture, run the four test stages in order, and log a
    closing summary. The paragraph list returned by test_extraction is passed
    on to the chunking and classification stages.
    """
    sample_path = "test_data/sample.docx"
    os.makedirs(os.path.dirname(sample_path), exist_ok=True)
    create_problematic_docx(sample_path)

    paragraphs = test_extraction()
    test_chunking(paragraphs)
    test_classification_application(paragraphs)
    test_edge_cases()

    # Closing report, one log record per line.
    closing_report = (
        "\n" + "=" * 60,
        "ALL TESTS PASSED ✅",
        "=" * 60,
        "\nThe module correctly:",
        " 1. Extracts paragraphs with formatting metadata from DOCX",
        " 2. Chunks large documents with overlap for LLM context",
        " 3. Applies classifications to a COPY (original untouched)",
        " 4. Handles edge cases (empty, single para, 150+ paras)",
        "\nReady to use with Ollama. Run:",
        " python -m doc_enricher.cli input.docx -o output.docx",
    )
    for line in closing_report:
        logger.info(line)


if __name__ == "__main__":
    main()