# doc-enricher / doc_enricher / base_handler.py
# dwijverma2's picture
# Add base handler interface
# 2735587 verified
"""
Base handler interface for document format handlers.
Each format (DOCX, PDF, HTML, etc.) implements this interface.
"""
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Optional
@dataclass
class ParagraphInfo:
    """A paragraph extracted from a document, with metadata.

    Only ``index`` and ``text`` are required. The three formatting fields
    default to ``None`` and ``text_length`` defaults to ``0``. Every value is
    stored exactly as given: in particular ``text_length`` is NOT computed
    from ``text``, so the code that builds the instance must supply it.
    """

    # Integer identifying the paragraph within its document.
    # NOTE(review): numbering base (0 or 1) is decided by the handler -- confirm.
    index: int
    # Text content of the paragraph.
    text: str
    # Name of the paragraph's style; None when not provided.
    style_name: Optional[str] = None
    # Bold flag for the paragraph; None when not provided.
    # NOTE(review): presumably None means "unknown" rather than "not bold" -- confirm.
    is_bold: Optional[bool] = None
    # Font size, presumably the average across the paragraph, in points
    # (going by the field name) -- confirm; None when not provided.
    avg_font_size_pt: Optional[float] = None
    # Length of the text as reported by the producer; stays 0 unless set.
    text_length: int = 0
class BaseHandler(ABC):
    """Interface that every document format handler must implement.

    Both methods are abstract, so a subclass can only be instantiated once it
    overrides ``extract_paragraphs`` and ``apply_classifications``.
    """

    @abstractmethod
    def extract_paragraphs(self, filepath: str) -> list[ParagraphInfo]:
        """Read the document and return all non-empty paragraphs with metadata.

        Args:
            filepath: Path of the document to read.

        Returns:
            A list of ``ParagraphInfo`` objects, one per non-empty paragraph.
        """
        ...

    @abstractmethod
    def apply_classifications(
        self,
        src_path: str,
        dst_path: str,
        classifications: dict[int, str],
    ) -> str:
        """Create a re-enriched copy with proper heading formatting.

        Args:
            src_path: Path of the source document.
            dst_path: Path for the copy.
                NOTE(review): presumably the location the copy is written
                to -- confirm against the concrete handlers.
            classifications: Mapping from an int key to a string label.
                NOTE(review): presumably ``ParagraphInfo.index`` -> assigned
                classification -- confirm against the concrete handlers.

        Returns:
            A string.
            NOTE(review): presumably the path of the created copy -- confirm.
        """
        ...