| """ | |
| Base handler interface for document format handlers. | |
| Each format (DOCX, PDF, HTML, etc.) implements this interface. | |
| """ | |
| from abc import ABC, abstractmethod | |
| from dataclasses import dataclass | |
| from typing import Optional | |
@dataclass
class ParagraphInfo:
    """A paragraph extracted from a document, with metadata.

    ``index`` and ``text`` are required; every other field is optional
    and defaults to "unknown" (``None``) or ``0``.
    """

    # Integer identifier of the paragraph.
    # NOTE(review): presumably its position within the source document — confirm.
    index: int
    # Text content of the paragraph.
    text: str
    # Name of the style applied to the paragraph, or None when not available.
    style_name: Optional[str] = None
    # Whether the paragraph is bold, or None when not determined.
    is_bold: Optional[bool] = None
    # Average font size; the "_pt" suffix suggests points — TODO confirm unit.
    avg_font_size_pt: Optional[float] = None
    # Defaults to 0 and is NOT derived from ``text`` automatically;
    # presumably callers set it to len(text) — verify against handlers.
    text_length: int = 0
class BaseHandler(ABC):
    """Interface that every document format handler must implement.

    Both methods are abstract: a concrete handler has to override them
    before it can be instantiated.
    """

    @abstractmethod
    def extract_paragraphs(self, filepath: str) -> list[ParagraphInfo]:
        """Read the document and return all non-empty paragraphs with metadata.

        Args:
            filepath: Path of the document to read.

        Returns:
            A list of ``ParagraphInfo`` records, one per non-empty paragraph.
        """
        ...

    @abstractmethod
    def apply_classifications(
        self,
        src_path: str,
        dst_path: str,
        classifications: dict[int, str],
    ) -> str:
        """Create a re-enriched copy with proper heading formatting.

        Args:
            src_path: Path of the source document.
            dst_path: Path of the copy to create.
            classifications: Mapping from an integer key to a string label.
                NOTE(review): presumably paragraph index -> classification
                label; confirm against the concrete handlers.

        Returns:
            A string; presumably the path of the written copy — TODO confirm.
        """
        ...