File size: 1,029 Bytes
2735587
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
"""
Base handler interface for document format handlers.

Each format (DOCX, PDF, HTML, etc.) implements this interface.
"""

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Optional


@dataclass
class ParagraphInfo:
    """Record describing one paragraph pulled out of a document.

    Only ``index`` and ``text`` are required; every other field has a
    default, so a handler may omit metadata it does not supply.

    Attributes:
        index: Integer identifying the paragraph (presumably its position
            in the source document -- confirm against the handlers).
        text: The paragraph's text.
        style_name: Name of the paragraph's style, or ``None`` when not set.
        is_bold: Bold flag, or ``None`` when not set.
        avg_font_size_pt: Font size as a float (the name suggests an
            average, in points), or ``None`` when not set.
        text_length: Integer length value; defaults to ``0`` and is NOT
            computed from ``text`` by this class -- the caller must pass it.
    """

    # Required fields (no defaults).
    index: int
    text: str

    # Optional formatting metadata; None is the "not provided" default.
    style_name: Optional[str] = None
    is_bold: Optional[bool] = None
    avg_font_size_pt: Optional[float] = None

    # Stays 0 unless the caller supplies a value explicitly.
    text_length: int = 0


class BaseHandler(ABC):
    """Abstract contract shared by all document format handlers.

    Both methods are abstract, so a subclass can only be instantiated once
    it overrides ``extract_paragraphs`` and ``apply_classifications``.
    """

    @abstractmethod
    def extract_paragraphs(self, filepath: str) -> list[ParagraphInfo]:
        """Return every non-empty paragraph of the document, with metadata.

        Args:
            filepath: Location of the document to read, as a string.

        Returns:
            A list of ``ParagraphInfo`` records, one per non-empty paragraph.
        """

    @abstractmethod
    def apply_classifications(
        self, src_path: str, dst_path: str, classifications: dict[int, str]
    ) -> str:
        """Produce a re-enriched copy of the document with proper headings.

        Args:
            src_path: Path of the source document.
            dst_path: Path for the re-enriched copy.
            classifications: Mapping from int keys to string labels
                (presumably paragraph index -> classification; confirm
                against the concrete handlers).

        Returns:
            A string (presumably the path of the written copy -- confirm
            against the concrete handlers).
        """