""" Base handler interface for document format handlers. Each format (DOCX, PDF, HTML, etc.) implements this interface. """ from abc import ABC, abstractmethod from dataclasses import dataclass from typing import Optional @dataclass class ParagraphInfo: """A paragraph extracted from a document, with metadata.""" index: int text: str style_name: Optional[str] = None is_bold: Optional[bool] = None avg_font_size_pt: Optional[float] = None text_length: int = 0 class BaseHandler(ABC): """Interface that every document format handler must implement.""" @abstractmethod def extract_paragraphs(self, filepath: str) -> list[ParagraphInfo]: """Read the document and return all non-empty paragraphs with metadata.""" ... @abstractmethod def apply_classifications( self, src_path: str, dst_path: str, classifications: dict[int, str], ) -> str: """Create a re-enriched copy with proper heading formatting.""" ...