Add DOCX handler
Browse files
doc_enricher/handlers/docx_handler.py
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
DOCX format handler.
|
| 3 |
+
|
| 4 |
+
Reads .docx files using python-docx, extracts paragraph metadata,
|
| 5 |
+
and applies heading formatting to a binary copy of the original.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import shutil
|
| 9 |
+
import logging
|
| 10 |
+
from docx import Document
|
| 11 |
+
from docx.shared import Pt
|
| 12 |
+
|
| 13 |
+
from ..base_handler import BaseHandler, ParagraphInfo
|
| 14 |
+
|
| 15 |
+
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class DocxHandler(BaseHandler):
    """Handler for .docx files using python-docx.

    ``extract_paragraphs`` reads paragraph text and run-level formatting
    metadata; ``apply_classifications`` writes heading/body formatting into a
    copy of the source document. Both methods identify a paragraph by its
    position in ``doc.paragraphs`` (empty paragraphs included in the count),
    so indices produced by the former can be used as keys for the latter.
    """

    # Formatting rules for each classification label.
    # The downstream parser expects:
    #   level 2 (document title)  = bold + large font
    #   level 1 (section heading) = bold + medium font
    #   level 0 (body text)       = normal
    STYLE_MAP = {
        "TITLE": {"style": "Title", "bold": True, "font_size_pt": 20},
        "SECTION_HEADING": {"style": "Heading 1", "bold": True, "font_size_pt": 14},
        "BODY": {"style": "Normal", "bold": False, "font_size_pt": 11},
    }

    def extract_paragraphs(self, filepath: str) -> list[ParagraphInfo]:
        """Extract all non-empty paragraphs with formatting metadata.

        Args:
            filepath: Path of the .docx file to read.

        Returns:
            One ``ParagraphInfo`` per paragraph whose stripped text is
            non-empty. ``index`` is the paragraph's position in
            ``doc.paragraphs`` (skipped empty paragraphs still consume an
            index). ``is_bold`` is ``None`` when no run sets bold explicitly,
            otherwise ``True`` if any run is bold. ``avg_font_size_pt`` is the
            unweighted mean of the explicit run sizes, or ``None`` when no run
            sets a size.
        """
        doc = Document(filepath)
        paragraphs = []

        for i, para in enumerate(doc.paragraphs):
            text = para.text.strip()
            if not text:
                continue  # Skip empty/whitespace-only paragraphs

            # Only explicitly set run attributes are collected; a value of
            # None on a run is ignored rather than counted.
            runs = para.runs
            sizes = [run.font.size.pt for run in runs if run.font.size is not None]
            bolds = [run.bold for run in runs if run.bold is not None]

            avg_size = sum(sizes) / len(sizes) if sizes else None
            any_bold = any(bolds) if bolds else None

            paragraphs.append(ParagraphInfo(
                index=i,
                text=text,
                style_name=para.style.name if para.style else None,
                is_bold=any_bold,
                avg_font_size_pt=avg_size,
                text_length=len(text),
            ))

        logger.info(
            "Extracted %d non-empty paragraphs from %s", len(paragraphs), filepath
        )
        return paragraphs

    def _resolve_format(self, label: str, para_index: int) -> dict:
        """Return the STYLE_MAP entry for a classification label.

        String labels are matched case-insensitively and with surrounding
        whitespace ignored. A label that still matches no STYLE_MAP key falls
        back to the BODY entry, and a warning is logged so the fallback is
        visible instead of silent.
        """
        key = label.strip().upper() if isinstance(label, str) else label
        fmt = self.STYLE_MAP.get(key)
        if fmt is None:
            logger.warning(
                "Unknown classification %r on para %d; formatting as BODY",
                label,
                para_index,
            )
            fmt = self.STYLE_MAP["BODY"]
        return fmt

    def apply_classifications(
        self,
        src_path: str,
        dst_path: str,
        classifications: dict[int, str],
    ) -> str:
        """
        Create a binary copy of src_path, then apply formatting based on
        the LLM classifications.

        Strategy:
        1. Try to assign the named style (Title / Heading 1 / Normal) when
           the document defines a style with that name, so downstream
           parsers that check para.style.name will work.
        2. Also apply run-level bold + font-size overrides to every run of
           the paragraph, which covers documents where the named style is
           missing and parsers that check run formatting directly.

        Args:
            src_path: Path of the original .docx file; it is only read.
            dst_path: Path the formatted copy is written to.
            classifications: Maps a paragraph's position in doc.paragraphs
                to its label (a STYLE_MAP key). Labels are matched
                case-insensitively; unknown labels are formatted as BODY
                with a warning. Paragraphs without an entry are untouched.

        Returns:
            dst_path, after the modified document has been saved there.
        """
        # Step 1: Binary copy preserves everything (images, tables, headers, etc.)
        shutil.copy2(src_path, dst_path)

        # Step 2: Open the copy and modify
        doc = Document(dst_path)

        # Collect the style names defined in this document once, up front.
        available_styles = {s.name for s in doc.styles}

        for i, para in enumerate(doc.paragraphs):
            if i not in classifications:
                continue  # Paragraph wasn't classified (empty, skipped)

            fmt = self._resolve_format(classifications[i], i)
            style_name = fmt["style"]

            # Apply named style if available. Failure here is deliberately
            # non-fatal: the run-level overrides below are still applied.
            if style_name in available_styles:
                try:
                    para.style = doc.styles[style_name]
                except Exception as e:
                    logger.warning(
                        "Failed to set style '%s' on para %d: %s", style_name, i, e
                    )

            # Apply run-level formatting (belt-and-suspenders). A paragraph
            # with no runs is left with only the named style, if one was set.
            for run in para.runs:
                run.bold = fmt["bold"]
                run.font.size = Pt(fmt["font_size_pt"])

        doc.save(dst_path)
        logger.info("Re-enriched document saved to %s", dst_path)
        return dst_path
|