File size: 1,029 Bytes
"""
Base handler interface for document format handlers.
Each format (DOCX, PDF, HTML, etc.) implements this interface.
"""
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Optional
@dataclass
class ParagraphInfo:
    """A paragraph extracted from a document, with metadata.

    Only ``index`` and ``text`` are required; every other field has a
    default so handlers can leave out metadata they cannot determine.
    """

    # Identifier of the paragraph within its document (presumably its
    # ordinal position -- confirm against the concrete handlers).
    index: int
    # Text content of the paragraph.
    text: str
    # Name of the style attached to the paragraph; None when not provided.
    style_name: Optional[str] = None
    # Bold flag for the paragraph; None when not provided.
    is_bold: Optional[bool] = None
    # Average font size in points (per the ``_pt`` suffix); None when not
    # provided.
    avg_font_size_pt: Optional[float] = None
    # NOTE: not derived from ``text`` automatically -- it stays 0 unless the
    # caller passes a value explicitly.
    text_length: int = 0
class BaseHandler(ABC):
    """Interface that every document format handler must implement.

    Both methods are abstract, so a subclass can only be instantiated
    once it overrides ``extract_paragraphs`` and ``apply_classifications``.
    """

    @abstractmethod
    def extract_paragraphs(self, filepath: str) -> list[ParagraphInfo]:
        """Read the document and return all non-empty paragraphs with metadata.

        Args:
            filepath: Location of the document to read.

        Returns:
            One ``ParagraphInfo`` per non-empty paragraph.
        """
        ...

    @abstractmethod
    def apply_classifications(
        self,
        src_path: str,
        dst_path: str,
        classifications: dict[int, str],
    ) -> str:
        """Create a re-enriched copy with proper heading formatting.

        Args:
            src_path: Location of the source document.
            dst_path: Location for the re-enriched copy.
            classifications: Mapping from an integer key to a label
                (presumably ``ParagraphInfo.index`` -> heading class;
                confirm against the concrete handlers).

        Returns:
            A string (presumably the path of the written copy -- confirm
            against the concrete handlers).
        """
        ...
|