doc-enricher / test_module.py

Add test suite

d2a9d6e verified 10 days ago

14.7 kB

	#!/usr/bin/env python3
	"""
	Test script for the Document Re-enrichment Module.

	Creates a sample DOCX with intentionally inconsistent formatting
	(the exact problem the module solves), then tests:
	1. Paragraph extraction
	2. Chunking logic
	3. Classification application (mocked LLM), including verification of
	   the output document's formatting
	4. Edge cases (empty, whitespace-only, single-paragraph, and large documents)

	Run: python test_module.py
	Requires: pip install python-docx (the doc_enricher package must also be importable)
	Does NOT require Ollama (LLM is mocked).
	"""

	import os
	import sys
	import logging
	import json
	from docx import Document
	from docx.shared import Pt, RGBColor
	from docx.enum.text import WD_ALIGN_PARAGRAPH

	from doc_enricher.handlers.docx_handler import DocxHandler
	from doc_enricher.chunker import build_chunks, estimate_tokens
	from doc_enricher.base_handler import ParagraphInfo

	# Configure logging once at import time: INFO level, with timestamp and level in each record.
	logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
	# Module-level logger shared by every test function in this file.
	logger = logging.getLogger(__name__)


	def _add_formatted_paragraph(doc, run_specs):
	    """
	    Append one paragraph to ``doc`` built from a sequence of run specs.

	    Each spec is a ``(text, bold, underline, size_pt)`` tuple. ``bold`` and
	    ``underline`` are only assigned when they are not None, so None leaves
	    that attribute exactly as ``add_run`` created it. The font size is
	    always set. Returns the new paragraph.
	    """
	    paragraph = doc.add_paragraph()
	    for text, bold, underline, size_pt in run_specs:
	        run = paragraph.add_run(text)
	        if bold is not None:
	            run.bold = bold
	        if underline is not None:
	            run.underline = underline
	        run.font.size = Pt(size_pt)
	    return paragraph


	def create_problematic_docx(filepath: str):
	    """
	    Create a DOCX that mimics the user's problem:
	    - The title is NOT bold and same font size as body
	    - Section headings have inconsistent formatting
	    - Some body text is bold (causing misclassification)
	    - Some headings look like body text

	    The document is described declaratively below: one entry per paragraph,
	    each entry a list of ``(text, bold, underline, size_pt)`` runs. The
	    parent directory of ``filepath`` is created if it does not exist, so the
	    function can be called without any prior setup.
	    """
	    body_pt = 11  # every "problem" paragraph shares the body text size

	    paragraph_specs = [
	        # --- TITLE: Not bold, same size as body (parser would miss it) ---
	        [("Annual Infrastructure Modernization Report 2024", False, None, body_pt)],
	        # --- BODY: Intro paragraph ---
	        [(
	            "This report provides a comprehensive overview of the infrastructure "
	            "modernization initiatives undertaken during the fiscal year 2024. "
	            "It covers budget allocation, project milestones, risk assessments, "
	            "and recommendations for the upcoming fiscal year.",
	            None, None, body_pt,
	        )],
	        # --- SECTION HEADING: Bold but same size as body ---
	        [("Executive Summary", True, None, body_pt)],
	        # --- BODY ---
	        [(
	            "The organization successfully completed 85% of planned infrastructure "
	            "upgrades. The total expenditure was $4.2 million, coming in 3% under "
	            "the approved budget of $4.33 million.",
	            None, None, body_pt,
	        )],
	        # --- SECTION HEADING: NOT bold at all (parser would miss it) ---
	        [("Budget Allocation and Expenditure", False, None, body_pt)],
	        # --- BODY with a bold first run (parser might think it's a heading) ---
	        [
	            (
	                "The total approved budget for FY2024 was $4.33 million. ",
	                True, None, body_pt,
	            ),
	            (
	                "This was distributed across four major project areas: "
	                "network infrastructure (35%), server consolidation (25%), "
	                "security upgrades (20%), and cloud migration (20%).",
	                None, None, body_pt,
	            ),
	        ],
	        # --- SECTION HEADING: Underlined but not bold ---
	        [("Network Infrastructure Upgrades", False, True, body_pt)],
	        # --- BODY ---
	        [(
	            "The network team replaced all legacy switches across 12 branch "
	            "offices, upgraded the core router at headquarters, and implemented "
	            "SD-WAN connectivity for remote offices. The project was completed "
	            "two weeks ahead of schedule.",
	            None, None, body_pt,
	        )],
	        # --- SECTION HEADING: Has proper formatting (to test mixed docs) ---
	        [("Security Improvements", True, None, 14)],
	        # --- BODY ---
	        [(
	            "Zero-trust architecture was deployed across all critical systems. "
	            "Multi-factor authentication enrollment reached 98% of employees. "
	            "The security operations center now monitors 24/7 with automated "
	            "threat response capabilities.",
	            None, None, body_pt,
	        )],
	        # --- SECTION HEADING: Plain text, no formatting ---
	        [("Recommendations for FY2025", False, None, body_pt)],
	        # --- BODY ---
	        [(
	            "Based on the outcomes of FY2024, the committee recommends: "
	            "1) Increasing the cloud migration budget by 15%, "
	            "2) Implementing AI-driven monitoring tools, "
	            "3) Establishing a dedicated DevOps team, and "
	            "4) Conducting quarterly security audits.",
	            None, None, body_pt,
	        )],
	    ]

	    doc = Document()
	    for run_specs in paragraph_specs:
	        _add_formatted_paragraph(doc, run_specs)

	    # Saving fails if the target directory is missing; create it up front.
	    parent_dir = os.path.dirname(filepath)
	    if parent_dir:
	        os.makedirs(parent_dir, exist_ok=True)

	    doc.save(filepath)
	    logger.info(f"Created problematic DOCX: {filepath}")


	def test_extraction():
	    """
	    Extract paragraphs from the sample document and sanity-check the title.

	    Reads ``test_data/sample.docx`` (main() generates it beforehand), logs
	    one line of metadata per extracted paragraph, and asserts that the first
	    paragraph is the title and is not reported as bold.

	    Returns the extracted paragraphs so later tests can reuse them.
	    """
	    rule = "=" * 60
	    logger.info(rule)
	    logger.info("TEST 1: Paragraph Extraction")
	    logger.info(rule)

	    extractor = DocxHandler()
	    paragraphs = extractor.extract_paragraphs("test_data/sample.docx")
	    logger.info(f"Extracted {len(paragraphs)} paragraphs:\n")

	    for info in paragraphs:
	        # Build the metadata line piece by piece, then emit it as one record.
	        position = f" [{info.index:2d}] "
	        bold_part = f"bold={str(info.is_bold):5s} "
	        size_part = f"size={str(info.avg_font_size_pt):6s} "
	        style_part = f"style={info.style_name:15s} "
	        length_part = f"len={info.text_length:3d} "
	        preview_part = f"text=\"{info.text[:60]}...\""
	        logger.info(
	            position + bold_part + size_part + style_part + length_part + preview_part
	        )

	    assert len(paragraphs) > 0, "Should extract at least some paragraphs"

	    # The first extracted paragraph must be the document title.
	    first = paragraphs[0]
	    assert "Annual Infrastructure" in first.text
	    # A non-bold title is precisely the formatting problem under test.
	    assert not (first.is_bold is True), \
	        "Title should NOT be bold (that's the formatting problem)"

	    logger.info("\n✅ Extraction test passed!")
	    return paragraphs


	def test_chunking(paragraphs):
	    """
	    Exercise build_chunks and estimate_tokens on the extracted paragraphs.

	    Checks that a generous token budget yields a single chunk covering every
	    paragraph, that a tight budget yields several chunks which together
	    classify every paragraph exactly once, and that the token estimate for
	    a short sentence falls in a plausible range.
	    """
	    logger.info("\n" + "=" * 60)
	    logger.info("TEST 2: Chunking Logic")
	    logger.info("=" * 60)

	    # Scenario 1: generous budget, so everything fits in one chunk.
	    roomy = build_chunks(paragraphs, max_tokens_per_chunk=5000, overlap=3)
	    logger.info(f" Small budget test: {len(roomy)} chunk(s)")
	    assert len(roomy) == 1, "Small doc should fit in one chunk"
	    assert set(roomy[0].classify_indices) == {para.index for para in paragraphs}

	    # Scenario 2: tight budget, so the document must be split.
	    tight = build_chunks(paragraphs, max_tokens_per_chunk=200, overlap=2)
	    logger.info(f" Tight budget test: {len(tight)} chunk(s)")
	    assert len(tight) > 1, "Tight budget should create multiple chunks"

	    # Coverage: the union of all classify_indices is every paragraph index.
	    covered = set().union(*(chunk.classify_indices for chunk in tight))
	    wanted = {para.index for para in paragraphs}
	    assert covered == wanted, \
	        f"All paragraphs must be classified. Missing: {wanted - covered}"

	    # Uniqueness: no index is assigned to more than one chunk.
	    already_assigned = set()
	    for idx in (i for chunk in tight for i in chunk.classify_indices):
	        assert idx not in already_assigned, f"Paragraph {idx} classified by multiple chunks!"
	        already_assigned.add(idx)

	    # Scenario 3: token estimate for one short sentence.
	    estimated = estimate_tokens(
	        "This is a sample paragraph with about thirty words in it to test."
	    )
	    assert 10 < estimated < 30, f"Token estimate {estimated} seems off for short text"

	    logger.info("\n✅ Chunking test passed!")


	def test_classification_application(paragraphs):
	    """
	    Test applying mock classifications to the document.

	    Builds a label for every extracted paragraph with a simple heuristic
	    (standing in for the LLM), asks DocxHandler to write an enriched copy of
	    ``test_data/sample.docx``, then checks the first run of each classified
	    paragraph in the copy for the expected bold flag and font size:
	    TITLE bold/20pt, SECTION_HEADING bold/14pt, BODY not bold/11pt.
	    Finally confirms the original file's title is still not bold.

	    Args:
	        paragraphs: the paragraph records returned by test_extraction().
	    """
	    logger.info("\n" + "=" * 60)
	    logger.info("TEST 3: Classification Application")
	    logger.info("=" * 60)

	    # Mock classifications (what the LLM would produce)
	    mock_classifications = {}
	    paragraph_by_index = {}
	    for p in paragraphs:
	        # Keep the first paragraph seen for each index, for the log preview.
	        paragraph_by_index.setdefault(p.index, p)
	        if "Annual Infrastructure" in p.text:
	            mock_classifications[p.index] = "TITLE"
	        elif len(p.text) < 40 and not p.text.endswith("."):
	            # Short text without period → likely a heading
	            mock_classifications[p.index] = "SECTION_HEADING"
	        else:
	            mock_classifications[p.index] = "BODY"

	    logger.info(" Mock classifications:")
	    for idx, label in sorted(mock_classifications.items()):
	        para = paragraph_by_index[idx]
	        logger.info(f" [{idx:2d}] {label:17s} → \"{para.text[:50]}...\"")

	    # Apply to document
	    handler = DocxHandler()
	    output_path = handler.apply_classifications(
	        "test_data/sample.docx",
	        "test_data/sample_enriched.docx",
	        mock_classifications,
	    )

	    assert os.path.exists(output_path), "Output file should exist"

	    # Verify the output document formatting
	    enriched = Document(output_path)
	    logger.info("\n Verifying enriched document formatting:")

	    # NOTE(review): this lookup assumes ParagraphInfo.index equals the
	    # paragraph's position in Document.paragraphs — confirm against DocxHandler.
	    verified_count = 0
	    for i, para in enumerate(enriched.paragraphs):
	        if i not in mock_classifications:
	            continue

	        label = mock_classifications[i]
	        text = para.text.strip()
	        if not text:
	            continue

	        style = para.style.name if para.style else "None"
	        if para.runs:
	            run = para.runs[0]
	            bold = run.bold
	            size = run.font.size.pt if run.font.size else None
	        else:
	            bold = None
	            size = None

	        logger.info(
	            f" [{i:2d}] label={label:17s} style={style:15s} "
	            f"bold={str(bold):5s} size={str(size):6s} "
	            f"text=\"{text[:40]}...\""
	        )

	        if label == "TITLE":
	            assert bold is True, f"Title para {i} should be bold"
	            assert size == 20.0, f"Title para {i} should be 20pt, got {size}"
	        elif label == "SECTION_HEADING":
	            assert bold is True, f"Heading para {i} should be bold"
	            assert size == 14.0, f"Heading para {i} should be 14pt, got {size}"
	        elif label == "BODY":
	            assert bold is False, f"Body para {i} should NOT be bold"
	            assert size == 11.0, f"Body para {i} should be 11pt, got {size}"
	        verified_count += 1

	    # Without this guard the loop above could skip everything and still pass.
	    assert verified_count > 0, \
	        "No enriched paragraph was checked against its classification"

	    # Verify original is unchanged
	    original = Document("test_data/sample.docx")
	    orig_title = original.paragraphs[0]
	    if orig_title.runs:
	        assert orig_title.runs[0].bold is not True, \
	            "Original title should still NOT be bold (unchanged)"

	    logger.info("\n✅ Classification application test passed!")
	    logger.info(" Original: test_data/sample.docx (UNCHANGED)")
	    logger.info(f" Enriched: {output_path} (formatting corrected)")


	def test_edge_cases():
	    """
	    Test edge cases: empty doc, whitespace-only doc, single paragraph, and a
	    150-paragraph document.

	    Each fixture is written under ``test_data/``. The directory is created
	    here so the test can run on its own, without main() having run first.
	    """
	    logger.info("\n" + "=" * 60)
	    logger.info("TEST 4: Edge Cases")
	    logger.info("=" * 60)

	    # Saving a fixture fails if the directory is missing.
	    os.makedirs("test_data", exist_ok=True)

	    handler = DocxHandler()

	    # Test: Empty document
	    empty_path = "test_data/empty.docx"
	    doc = Document()
	    doc.save(empty_path)
	    paragraphs = handler.extract_paragraphs(empty_path)
	    assert len(paragraphs) == 0, "Empty doc should have 0 paragraphs"
	    logger.info(" ✅ Empty document handled correctly")

	    # Test: Document with only whitespace paragraphs
	    ws_path = "test_data/whitespace.docx"
	    doc = Document()
	    doc.add_paragraph(" ")
	    doc.add_paragraph("\t\n")
	    doc.add_paragraph("")
	    doc.save(ws_path)
	    paragraphs = handler.extract_paragraphs(ws_path)
	    assert len(paragraphs) == 0, "Whitespace-only doc should have 0 paragraphs"
	    logger.info(" ✅ Whitespace-only document handled correctly")

	    # Test: Single paragraph document
	    single_path = "test_data/single.docx"
	    doc = Document()
	    doc.add_paragraph("Just one paragraph here.")
	    doc.save(single_path)
	    paragraphs = handler.extract_paragraphs(single_path)
	    assert len(paragraphs) == 1, \
	        f"Single-paragraph doc should yield 1 paragraph, got {len(paragraphs)}"
	    chunks = build_chunks(paragraphs, max_tokens_per_chunk=3000)
	    assert len(chunks) == 1, \
	        f"Single-paragraph doc should yield 1 chunk, got {len(chunks)}"
	    assert chunks[0].classify_indices == [paragraphs[0].index], \
	        "The only chunk should classify exactly the only paragraph"
	    logger.info(" ✅ Single paragraph document handled correctly")

	    # Test: Large document (simulate 100+ paragraphs)
	    large_path = "test_data/large.docx"
	    doc = Document()
	    for i in range(150):
	        if i % 15 == 0:
	            # Every 15th paragraph is a short heading-like line.
	            doc.add_paragraph(f"Section {i // 15 + 1}")
	        else:
	            doc.add_paragraph(
	                f"This is body paragraph {i}. It contains enough text to be "
	                f"realistic for token estimation purposes. Lorem ipsum dolor "
	                f"sit amet, consectetur adipiscing elit paragraph {i}."
	            )
	    doc.save(large_path)
	    paragraphs = handler.extract_paragraphs(large_path)
	    assert len(paragraphs) == 150, \
	        f"Large doc should yield 150 paragraphs, got {len(paragraphs)}"
	    chunks = build_chunks(paragraphs, max_tokens_per_chunk=3000, overlap=3)
	    logger.info(f" Large doc (150 paras): {len(chunks)} chunks")
	    assert len(chunks) > 1, "Large doc should require multiple chunks"

	    # Verify complete coverage: every index classified exactly once
	    all_classify = set()
	    for c in chunks:
	        for idx in c.classify_indices:
	            assert idx not in all_classify, f"Duplicate classify index {idx}"
	            all_classify.add(idx)
	    assert all_classify == {p.index for p in paragraphs}, "All paragraphs must be covered"
	    logger.info(f" ✅ Large document (150 paragraphs, {len(chunks)} chunks) handled correctly")

	    logger.info("\n✅ All edge case tests passed!")


	def main():
	    """
	    Run the whole suite in order and log a closing summary.

	    Creates ``test_data/``, generates the sample document, then runs the
	    extraction, chunking, classification and edge-case tests. Any failing
	    assertion propagates and stops the run before the summary is logged.
	    """
	    os.makedirs("test_data", exist_ok=True)
	    create_problematic_docx("test_data/sample.docx")

	    # The extraction result feeds the two tests that need paragraph data.
	    extracted = test_extraction()
	    for dependent_test in (test_chunking, test_classification_application):
	        dependent_test(extracted)
	    test_edge_cases()

	    rule = "=" * 60
	    closing_lines = (
	        "\n" + rule,
	        "ALL TESTS PASSED ✅",
	        rule,
	        "\nThe module correctly:",
	        " 1. Extracts paragraphs with formatting metadata from DOCX",
	        " 2. Chunks large documents with overlap for LLM context",
	        " 3. Applies classifications to a COPY (original untouched)",
	        " 4. Handles edge cases (empty, single para, 150+ paras)",
	        "\nReady to use with Ollama. Run:",
	        " python -m doc_enricher.cli input.docx -o output.docx",
	    )
	    for line in closing_lines:
	        logger.info(line)


	# Run the suite only when this file is executed as a script, not when imported.
	if __name__ == "__main__":
	    main()