dwijverma2
/

doc-enricher

ml-intern

Model card Files Files and versions

xet

Community

dwijverma2 committed 12 days ago

Commit

dc20f63

verified ·

1 Parent(s): e5a883e

Add DOCX handler

Browse files

Files changed (1) hide show

doc_enricher/handlers/docx_handler.py +114 -0

doc_enricher/handlers/docx_handler.py ADDED Viewed

	@@ -0,0 +1,114 @@

+"""
+DOCX format handler.
+Reads .docx files using python-docx, extracts paragraph metadata,
+and applies heading formatting to a binary copy of the original.
+"""
+import shutil
+import logging
+from docx import Document
+from docx.shared import Pt
+from ..base_handler import BaseHandler, ParagraphInfo
+logger = logging.getLogger(__name__)
+class DocxHandler(BaseHandler):
+    """Handler for .docx files using python-docx."""
+    # Formatting rules for each classification label.
+    # The downstream parser expects:
+    #   level 2 (document title) = bold + large font
+    #   level 1 (section heading) = bold + medium font
+    #   level 0 (body text)       = normal
+    STYLE_MAP = {
+        "TITLE":           {"style": "Title",     "bold": True,  "font_size_pt": 20},
+        "SECTION_HEADING": {"style": "Heading 1", "bold": True,  "font_size_pt": 14},
+        "BODY":            {"style": "Normal",     "bold": False, "font_size_pt": 11},
+    }
+    def extract_paragraphs(self, filepath: str) -> list[ParagraphInfo]:
+        """Extract all non-empty paragraphs with formatting metadata."""
+        doc = Document(filepath)
+        paragraphs = []
+        for i, para in enumerate(doc.paragraphs):
+            text = para.text.strip()
+            if not text:
+                continue  # Skip empty/whitespace-only paragraphs
+            # Gather run-level formatting info
+            sizes = []
+            bolds = []
+            for run in para.runs:
+                if run.font.size is not None:
+                    sizes.append(run.font.size.pt)
+                if run.bold is not None:
+                    bolds.append(run.bold)
+            avg_size = sum(sizes) / len(sizes) if sizes else None
+            any_bold = any(bolds) if bolds else None
+            paragraphs.append(ParagraphInfo(
+                index=i,
+                text=text,
+                style_name=para.style.name if para.style else None,
+                is_bold=any_bold,
+                avg_font_size_pt=avg_size,
+                text_length=len(text),
+            ))
+        logger.info(f"Extracted {len(paragraphs)} non-empty paragraphs from {filepath}")
+        return paragraphs
+    def apply_classifications(
+        self,
+        src_path: str,
+        dst_path: str,
+        classifications: dict[int, str],
+    ) -> str:
+        """
+        Create a binary copy of src_path, then apply formatting based on
+        the LLM classifications.
+        Strategy:
+        1. Try to assign the named style (Title / Heading 1 / Normal).
+           This sets the correct outline level in the XML so downstream
+           parsers that check para.style.name will work.
+        2. Also apply run-level bold + font-size overrides as a belt-and-
+           suspenders approach — handles cases where styles are missing or
+           the parser checks run formatting directly.
+        """
+        # Step 1: Binary copy preserves everything (images, tables, headers, etc.)
+        shutil.copy2(src_path, dst_path)
+        # Step 2: Open the copy and modify
+        doc = Document(dst_path)
+        # Collect available styles
+        available_styles = {s.name for s in doc.styles}
+        for i, para in enumerate(doc.paragraphs):
+            if i not in classifications:
+                continue  # Paragraph wasn't classified (empty, skipped)
+            label = classifications[i]
+            fmt = self.STYLE_MAP.get(label, self.STYLE_MAP["BODY"])
+            # Apply named style if available
+            if fmt["style"] in available_styles:
+                try:
+                    para.style = doc.styles[fmt["style"]]
+                except Exception as e:
+                    logger.warning(f"Failed to set style '{fmt['style']}' on para {i}: {e}")
+            # Apply run-level formatting (belt-and-suspenders)
+            for run in para.runs:
+                run.bold = fmt["bold"]
+                run.font.size = Pt(fmt["font_size_pt"])
+        doc.save(dst_path)
+        logger.info(f"Re-enriched document saved to {dst_path}")
+        return dst_path