File size: 14,684 Bytes
d2a9d6e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
#!/usr/bin/env python3
"""
Test script for the Document Re-enrichment Module.

Creates a sample DOCX with intentionally inconsistent formatting
(the exact problem the module solves), then tests:
1. Paragraph extraction
2. Chunking logic
3. Classification application (mocked LLM)
4. Output document formatting verification

Run: python test_module.py
Requires: pip install python-docx
Does NOT require Ollama (LLM is mocked).
"""

import os
import sys
import logging
import json
from docx import Document
from docx.shared import Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH

from doc_enricher.handlers.docx_handler import DocxHandler
from doc_enricher.chunker import build_chunks, estimate_tokens
from doc_enricher.base_handler import ParagraphInfo

# Configure the root logger once for the whole script: INFO level, with each
# record rendered as "<timestamp> [<LEVEL>] <message>".
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
# Module-level logger used by every function in this file.
logger = logging.getLogger(__name__)


def create_problematic_docx(filepath: str):
    """
    Build and save the sample DOCX the tests run against.

    The document deliberately reproduces the formatting mistakes the
    module is meant to repair:
    - the title is not bold and uses the body font size
    - section headings are formatted inconsistently with one another
    - one body paragraph opens with a bold run
    - several headings are visually identical to body text

    The finished document is written to ``filepath``.
    """
    sample = Document()
    body_size = 11  # point size used by everything except one heading

    def styled_run(paragraph, text, *, bold=None, underline=None, size=body_size):
        # Append a run and set only the attributes that were requested, so an
        # omitted flag stays unset instead of being forced to False.
        new_run = paragraph.add_run(text)
        if bold is not None:
            new_run.bold = bold
        if underline is not None:
            new_run.underline = underline
        new_run.font.size = Pt(size)
        return new_run

    def single_run_paragraph(text, **formatting):
        # One new paragraph holding exactly one styled run.
        styled_run(sample.add_paragraph(), text, **formatting)

    # Title: explicitly not bold and body-sized, so a format-based parser misses it.
    single_run_paragraph(
        "Annual Infrastructure Modernization Report 2024",
        bold=False,
    )

    # Body: introduction.
    single_run_paragraph(
        "This report provides a comprehensive overview of the infrastructure "
        "modernization initiatives undertaken during the fiscal year 2024. "
        "It covers budget allocation, project milestones, risk assessments, "
        "and recommendations for the upcoming fiscal year."
    )

    # Heading: bold, but no larger than the body text.
    single_run_paragraph("Executive Summary", bold=True)

    # Body.
    single_run_paragraph(
        "The organization successfully completed 85% of planned infrastructure "
        "upgrades. The total expenditure was $4.2 million, coming in 3% under "
        "the approved budget of $4.33 million."
    )

    # Heading: neither bold nor larger than the body text.
    single_run_paragraph("Budget Allocation and Expenditure", bold=False)

    # Body whose first run is bold (could be mistaken for a heading),
    # followed by a second, plainly formatted run in the same paragraph.
    mixed_body = sample.add_paragraph()
    styled_run(
        mixed_body,
        "The total approved budget for FY2024 was $4.33 million. ",
        bold=True,
    )
    styled_run(
        mixed_body,
        "This was distributed across four major project areas: "
        "network infrastructure (35%), server consolidation (25%), "
        "security upgrades (20%), and cloud migration (20%).",
    )

    # Heading: underlined instead of bold, i.e. yet another formatting pattern.
    single_run_paragraph(
        "Network Infrastructure Upgrades",
        bold=False,
        underline=True,
    )

    # Body.
    single_run_paragraph(
        "The network team replaced all legacy switches across 12 branch "
        "offices, upgraded the core router at headquarters, and implemented "
        "SD-WAN connectivity for remote offices. The project was completed "
        "two weeks ahead of schedule."
    )

    # Heading: the one properly formatted heading (bold, 14pt), so the
    # document mixes correct and incorrect formatting.
    single_run_paragraph("Security Improvements", bold=True, size=14)

    # Body.
    single_run_paragraph(
        "Zero-trust architecture was deployed across all critical systems. "
        "Multi-factor authentication enrollment reached 98% of employees. "
        "The security operations center now monitors 24/7 with automated "
        "threat response capabilities."
    )

    # Heading: plain text with no distinguishing formatting at all.
    single_run_paragraph("Recommendations for FY2025", bold=False)

    # Body.
    single_run_paragraph(
        "Based on the outcomes of FY2024, the committee recommends: "
        "1) Increasing the cloud migration budget by 15%, "
        "2) Implementing AI-driven monitoring tools, "
        "3) Establishing a dedicated DevOps team, and "
        "4) Conducting quarterly security audits."
    )

    sample.save(filepath)
    logger.info(f"Created problematic DOCX: {filepath}")


def test_extraction():
    """
    Test that paragraphs are correctly extracted with metadata.

    Reads ``test_data/sample.docx`` through ``DocxHandler.extract_paragraphs``,
    logs one summary line per extracted paragraph, and checks that the first
    paragraph is the (non-bold) title.

    Returns:
        The extracted paragraphs, so later tests can reuse them.

    Raises:
        AssertionError: if nothing is extracted or the title paragraph does
            not look as expected.
    """
    logger.info("=" * 60)
    logger.info("TEST 1: Paragraph Extraction")
    logger.info("=" * 60)

    handler = DocxHandler()
    paragraphs = handler.extract_paragraphs("test_data/sample.docx")

    logger.info(f"Extracted {len(paragraphs)} paragraphs:\n")
    for p in paragraphs:
        # is_bold and avg_font_size_pt are passed through str() before the
        # string format spec is applied, so non-string values format cleanly.
        logger.info(
            f"  [{p.index:2d}] bold={str(p.is_bold):5s}  "
            f"size={str(p.avg_font_size_pt):6s}  "
            f"style={p.style_name:15s}  "
            f"len={p.text_length:3d}  "
            f"text=\"{p.text[:60]}...\""
        )

    assert len(paragraphs) > 0, "Should extract at least some paragraphs"

    # Check that the title paragraph was extracted (index 0)
    title_para = paragraphs[0]
    assert "Annual Infrastructure" in title_para.text
    # The title is NOT bold — this is the problem we're fixing
    assert title_para.is_bold is not True, \
        "Title should NOT be bold (that's the formatting problem)"

    logger.info("\n✅ Extraction test passed!")
    return paragraphs


def test_chunking(paragraphs):
    """
    Test that chunking works correctly for various document sizes.

    Args:
        paragraphs: Extracted paragraphs (as returned by ``test_extraction``);
            each must expose an ``index`` attribute.

    Raises:
        AssertionError: if the chunk count, coverage, uniqueness, or token
            estimate checks fail.
    """
    logger.info("\n" + "=" * 60)
    logger.info("TEST 2: Chunking Logic")
    logger.info("=" * 60)

    # Test 1: Small doc — should be a single chunk
    chunks = build_chunks(paragraphs, max_tokens_per_chunk=5000, overlap=3)
    logger.info(f"  Small budget test: {len(chunks)} chunk(s)")
    assert len(chunks) == 1, "Small doc should fit in one chunk"
    assert set(chunks[0].classify_indices) == {p.index for p in paragraphs}

    # Test 2: Very tight budget — force multiple chunks
    chunks = build_chunks(paragraphs, max_tokens_per_chunk=200, overlap=2)
    logger.info(f"  Tight budget test: {len(chunks)} chunk(s)")
    assert len(chunks) > 1, "Tight budget should create multiple chunks"

    # Verify all paragraphs are covered
    all_classify = set()
    for c in chunks:
        all_classify.update(c.classify_indices)
    expected = {p.index for p in paragraphs}
    assert all_classify == expected, \
        f"All paragraphs must be classified. Missing: {expected - all_classify}"

    # Verify no duplicates in classify_indices across chunks: every paragraph
    # must be classified by exactly one chunk.
    seen = set()
    for c in chunks:
        for idx in c.classify_indices:
            assert idx not in seen, f"Paragraph {idx} classified by multiple chunks!"
            seen.add(idx)

    # Test 3: Token estimation (loose sanity bounds for one short sentence)
    sample_text = "This is a sample paragraph with about thirty words in it to test."
    estimated = estimate_tokens(sample_text)
    assert 10 < estimated < 30, f"Token estimate {estimated} seems off for short text"

    logger.info("\n✅ Chunking test passed!")


def test_classification_application(paragraphs):
    """
    Test applying mock classifications to the document.

    Builds a label for every paragraph with a simple heuristic (standing in
    for the LLM), writes an enriched copy of ``test_data/sample.docx`` via
    ``DocxHandler.apply_classifications``, and verifies both the formatting
    of the copy and that the original file is left unchanged.

    Args:
        paragraphs: Extracted paragraphs of ``test_data/sample.docx``; each
            must expose ``index`` and ``text``.

    Raises:
        AssertionError: if the output file is missing, a paragraph's
            formatting does not match its label, or the original changed.
    """
    logger.info("\n" + "=" * 60)
    logger.info("TEST 3: Classification Application")
    logger.info("=" * 60)

    source_path = "test_data/sample.docx"
    enriched_path = "test_data/sample_enriched.docx"

    # Mock classifications (what the LLM would produce)
    mock_classifications = {}
    # First-seen text per index, so the log loop below does not have to
    # rescan the paragraph list for every row.
    text_by_index = {}
    for p in paragraphs:
        text_by_index.setdefault(p.index, p.text)
        if "Annual Infrastructure" in p.text:
            mock_classifications[p.index] = "TITLE"
        elif len(p.text) < 40 and not p.text.endswith("."):
            # Short text without period → likely a heading
            mock_classifications[p.index] = "SECTION_HEADING"
        else:
            mock_classifications[p.index] = "BODY"

    logger.info("  Mock classifications:")
    for idx, label in sorted(mock_classifications.items()):
        logger.info(f"    [{idx:2d}] {label:17s} → \"{text_by_index[idx][:50]}...\"")

    # Apply to document
    handler = DocxHandler()
    output_path = handler.apply_classifications(
        source_path,
        enriched_path,
        mock_classifications,
    )

    assert os.path.exists(output_path), "Output file should exist"

    # Verify the output document formatting
    enriched = Document(output_path)
    logger.info("\n  Verifying enriched document formatting:")

    # NOTE(review): this loop assumes ParagraphInfo.index equals the
    # paragraph's position in Document.paragraphs — confirm against
    # DocxHandler.extract_paragraphs.
    for i, para in enumerate(enriched.paragraphs):
        if i not in mock_classifications:
            continue

        label = mock_classifications[i]
        text = para.text.strip()
        if not text:
            continue

        style = para.style.name if para.style else "None"
        # Only the first run is inspected as a proxy for the paragraph.
        if para.runs:
            run = para.runs[0]
            bold = run.bold
            size = run.font.size.pt if run.font.size else None
        else:
            bold = None
            size = None

        logger.info(
            f"    [{i:2d}] label={label:17s}  style={style:15s}  "
            f"bold={str(bold):5s}  size={str(size):6s}  "
            f"text=\"{text[:40]}...\""
        )

        if label == "TITLE":
            assert bold is True, f"Title para {i} should be bold"
            assert size == 20.0, f"Title para {i} should be 20pt, got {size}"
        elif label == "SECTION_HEADING":
            assert bold is True, f"Heading para {i} should be bold"
            assert size == 14.0, f"Heading para {i} should be 14pt, got {size}"
        elif label == "BODY":
            assert bold is False, f"Body para {i} should NOT be bold"
            assert size == 11.0, f"Body para {i} should be 11pt, got {size}"

    # Verify original is unchanged
    original = Document(source_path)
    orig_title = original.paragraphs[0]
    if orig_title.runs:
        assert orig_title.runs[0].bold is not True, \
            "Original title should still NOT be bold (unchanged)"

    logger.info("\n✅ Classification application test passed!")
    logger.info(f"  Original: {source_path} (UNCHANGED)")
    logger.info(f"  Enriched: {output_path} (formatting corrected)")


def test_edge_cases():
    """
    Test edge cases: empty doc, whitespace-only doc, single paragraph, large doc.

    Each case writes its own fixture under ``test_data/`` and then runs
    extraction (and, where relevant, chunking) against it.

    Raises:
        AssertionError: if any edge case produces an unexpected paragraph
            count, chunk count, or chunk coverage.
    """
    logger.info("\n" + "=" * 60)
    logger.info("TEST 4: Edge Cases")
    logger.info("=" * 60)

    # The fixtures below are saved into test_data/; create it here so this
    # function does not depend on a caller having done so first.
    os.makedirs("test_data", exist_ok=True)

    handler = DocxHandler()

    # Test: Empty document
    empty_path = "test_data/empty.docx"
    doc = Document()
    doc.save(empty_path)
    paragraphs = handler.extract_paragraphs(empty_path)
    assert len(paragraphs) == 0, "Empty doc should have 0 paragraphs"
    logger.info("  ✅ Empty document handled correctly")

    # Test: Document with only whitespace paragraphs
    ws_path = "test_data/whitespace.docx"
    doc = Document()
    doc.add_paragraph("   ")
    doc.add_paragraph("\t\n")
    doc.add_paragraph("")
    doc.save(ws_path)
    paragraphs = handler.extract_paragraphs(ws_path)
    assert len(paragraphs) == 0, "Whitespace-only doc should have 0 paragraphs"
    logger.info("  ✅ Whitespace-only document handled correctly")

    # Test: Single paragraph document
    single_path = "test_data/single.docx"
    doc = Document()
    doc.add_paragraph("Just one paragraph here.")
    doc.save(single_path)
    paragraphs = handler.extract_paragraphs(single_path)
    assert len(paragraphs) == 1
    chunks = build_chunks(paragraphs, max_tokens_per_chunk=3000)
    assert len(chunks) == 1
    assert chunks[0].classify_indices == [paragraphs[0].index]
    logger.info("  ✅ Single paragraph document handled correctly")

    # Test: Large document (simulate 100+ paragraphs)
    large_path = "test_data/large.docx"
    doc = Document()
    for i in range(150):
        # Every 15th paragraph is a short "Section N" line; the rest are body text.
        if i % 15 == 0:
            doc.add_paragraph(f"Section {i // 15 + 1}")
        else:
            doc.add_paragraph(
                f"This is body paragraph {i}. It contains enough text to be "
                f"realistic for token estimation purposes. Lorem ipsum dolor "
                f"sit amet, consectetur adipiscing elit paragraph {i}."
            )
    doc.save(large_path)
    paragraphs = handler.extract_paragraphs(large_path)
    assert len(paragraphs) == 150
    chunks = build_chunks(paragraphs, max_tokens_per_chunk=3000, overlap=3)
    logger.info(f"  Large doc (150 paras): {len(chunks)} chunks")
    assert len(chunks) > 1, "Large doc should require multiple chunks"

    # Verify complete coverage: each paragraph index appears in exactly one
    # chunk's classify_indices.
    all_classify = set()
    for c in chunks:
        for idx in c.classify_indices:
            assert idx not in all_classify, f"Duplicate classify index {idx}"
            all_classify.add(idx)
    assert all_classify == {p.index for p in paragraphs}, "All paragraphs must be covered"
    logger.info(f"  ✅ Large document (150 paragraphs, {len(chunks)} chunks) handled correctly")

    logger.info("\n✅ All edge case tests passed!")


def main():
    """
    Run the whole test sequence in order and log a summary.

    Creates ``test_data/`` and the sample document, then runs extraction,
    chunking, classification application, and the edge-case tests. Any
    failing assertion propagates as ``AssertionError`` and stops the run
    before the summary is logged.
    """
    os.makedirs("test_data", exist_ok=True)

    create_problematic_docx("test_data/sample.docx")

    # The extracted paragraphs are reused by the next two tests.
    paragraphs = test_extraction()
    test_chunking(paragraphs)
    test_classification_application(paragraphs)
    test_edge_cases()

    logger.info("\n" + "=" * 60)
    logger.info("ALL TESTS PASSED ✅")
    logger.info("=" * 60)
    logger.info("\nThe module correctly:")
    logger.info("  1. Extracts paragraphs with formatting metadata from DOCX")
    logger.info("  2. Chunks large documents with overlap for LLM context")
    logger.info("  3. Applies classifications to a COPY (original untouched)")
    logger.info("  4. Handles edge cases (empty, single para, 150+ paras)")
    logger.info("\nReady to use with Ollama. Run:")
    logger.info("  python -m doc_enricher.cli input.docx -o output.docx")


# Run the test sequence only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()