dwijverma2 committed on
Commit
dc20f63
·
verified ·
1 Parent(s): e5a883e

Add DOCX handler

Browse files
Files changed (1) hide show
  1. doc_enricher/handlers/docx_handler.py +114 -0
doc_enricher/handlers/docx_handler.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ DOCX format handler.
3
+
4
+ Reads .docx files using python-docx, extracts paragraph metadata,
5
+ and applies heading formatting to a binary copy of the original.
6
+ """
7
+
8
+ import shutil
9
+ import logging
10
+ from docx import Document
11
+ from docx.shared import Pt
12
+
13
+ from ..base_handler import BaseHandler, ParagraphInfo
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class DocxHandler(BaseHandler):
    """Handler for .docx files using python-docx.

    Paragraphs are identified by their position in ``doc.paragraphs``
    (empty paragraphs are counted). ``extract_paragraphs`` reports that
    position as ``ParagraphInfo.index`` and ``apply_classifications``
    looks paragraphs up by the same position.
    """

    # Formatting rules for each classification label.
    # The downstream parser expects:
    #   level 2 (document title)  = bold + large font
    #   level 1 (section heading) = bold + medium font
    #   level 0 (body text)       = normal
    STYLE_MAP = {
        "TITLE": {"style": "Title", "bold": True, "font_size_pt": 20},
        "SECTION_HEADING": {"style": "Heading 1", "bold": True, "font_size_pt": 14},
        "BODY": {"style": "Normal", "bold": False, "font_size_pt": 11},
    }

    def extract_paragraphs(self, filepath: str) -> list[ParagraphInfo]:
        """Extract all non-empty paragraphs with formatting metadata.

        Args:
            filepath: Path of the .docx file to read.

        Returns:
            One ``ParagraphInfo`` per paragraph whose stripped text is
            non-empty, in document order. ``index`` is the paragraph's
            position in ``doc.paragraphs`` (skipped empty paragraphs still
            consume an index). ``avg_font_size_pt`` is the unweighted mean
            of the explicitly-set run sizes and ``is_bold`` is True if any
            run is explicitly bold; both are None when no run sets the
            property explicitly.
        """
        doc = Document(filepath)
        paragraphs = []

        for i, para in enumerate(doc.paragraphs):
            text = para.text.strip()
            if not text:
                continue  # Skip empty/whitespace-only paragraphs

            # Gather run-level formatting info. Runs reporting None for a
            # property are ignored rather than treated as "not bold" or
            # "size 0".
            sizes = [
                run.font.size.pt for run in para.runs if run.font.size is not None
            ]
            bolds = [run.bold for run in para.runs if run.bold is not None]

            avg_size = sum(sizes) / len(sizes) if sizes else None
            any_bold = any(bolds) if bolds else None

            paragraphs.append(ParagraphInfo(
                index=i,
                text=text,
                style_name=para.style.name if para.style else None,
                is_bold=any_bold,
                avg_font_size_pt=avg_size,
                text_length=len(text),
            ))

        logger.info(
            "Extracted %d non-empty paragraphs from %s", len(paragraphs), filepath
        )
        return paragraphs

    def apply_classifications(
        self,
        src_path: str,
        dst_path: str,
        classifications: dict[int, str],
    ) -> str:
        """
        Create a binary copy of src_path, then apply formatting based on
        the LLM classifications.

        Strategy:
        1. Try to assign the named style (Title / Heading 1 / Normal).
           This sets the correct outline level in the XML so downstream
           parsers that check para.style.name will work.
        2. Also apply run-level bold + font-size overrides as a belt-and-
           suspenders approach — handles cases where styles are missing or
           the parser checks run formatting directly.

        Args:
            src_path: Path of the original .docx file; it is only read.
            dst_path: Path the formatted copy is written to.
            classifications: Maps a paragraph's position in
                ``doc.paragraphs`` to a ``STYLE_MAP`` label. Labels are
                matched case-insensitively and ignoring surrounding
                whitespace; unrecognised labels are logged and formatted
                as ``"BODY"``. Paragraphs without an entry are left
                untouched.

        Returns:
            ``dst_path``.
        """
        # Step 1: Binary copy preserves everything (images, tables, headers, etc.)
        shutil.copy2(src_path, dst_path)

        # Step 2: Open the copy and modify
        doc = Document(dst_path)

        # Collect available styles
        available_styles = {s.name for s in doc.styles}

        for i, para in enumerate(doc.paragraphs):
            if i not in classifications:
                continue  # Paragraph wasn't classified (empty, skipped)

            label = classifications[i]
            # LLM output is not guaranteed to match the STYLE_MAP keys
            # exactly; normalise so "title" / " Title " are not silently
            # demoted to body text.
            key = label.strip().upper() if isinstance(label, str) else label
            fmt = self.STYLE_MAP.get(key)
            if fmt is None:
                logger.warning(
                    "Unknown classification %r on para %d; formatting as BODY",
                    label,
                    i,
                )
                fmt = self.STYLE_MAP["BODY"]

            # Apply named style if available
            if fmt["style"] in available_styles:
                try:
                    para.style = doc.styles[fmt["style"]]
                except Exception as e:
                    # Best effort: the run-level overrides below still apply.
                    logger.warning(
                        "Failed to set style '%s' on para %d: %s",
                        fmt["style"],
                        i,
                        e,
                    )

            # Apply run-level formatting (belt-and-suspenders)
            for run in para.runs:
                run.bold = fmt["bold"]
                run.font.size = Pt(fmt["font_size_pt"])

        doc.save(dst_path)
        logger.info("Re-enriched document saved to %s", dst_path)
        return dst_path