| """ |
| Text cleaning and normalization module |
| Removes headers, footers, page numbers, and fixes formatting |
| """ |
|
|
import logging
import re
from typing import Any, Dict, List

from .config import CLEANING_PATTERNS
|
|
| logger = logging.getLogger(__name__) |
|
|
|
|
class TextCleaner:
    """Cleans and normalizes extracted text.

    Applies the regex categories configured in ``CLEANING_PATTERNS``
    (page numbers, headers/footers, table-of-contents lines), then
    normalizes whitespace, re-joins hyphenated/wrapped lines, and maps
    common Unicode punctuation to ASCII equivalents.
    """

    # One-pass translation table for smart quotes and dashes;
    # str.translate replaces six chained .replace() calls.
    _PUNCTUATION_MAP = str.maketrans({
        '\u2019': "'",   # right single quotation mark
        '\u2018': "'",   # left single quotation mark
        '\u201c': '"',   # left double quotation mark
        '\u201d': '"',   # right double quotation mark
        '\u2013': '-',   # en dash
        '\u2014': '--',  # em dash
    })

    # Compiled once at class-creation time so per-call cleaning does not
    # pay the re-module cache lookup on every invocation.
    _EXCESS_BLANK_LINES = re.compile(r'\n\s*\n\s*\n+')
    _SPACE_RUNS = re.compile(r'[ \t]+')
    _HYPHEN_LINE_BREAK = re.compile(r'-\s*\n\s*')
    _NUMBER_ONLY_LINE = re.compile(r'\n\s*\d+\s*\n')

    # Sentence-ending characters: a line ending with one of these is kept
    # as its own line instead of being joined with the next.
    _SENTENCE_ENDINGS = ('.', '!', '?', ':', ';')

    def __init__(self) -> None:
        """Initialize text cleaner with compiled patterns."""
        self.patterns = self._compile_patterns()

    def _compile_patterns(self) -> Dict[str, List[re.Pattern]]:
        """Compile all configured regex patterns once for efficiency.

        Returns:
            Mapping of category name to its list of compiled patterns.
        """
        return {
            category: [re.compile(p, re.MULTILINE | re.IGNORECASE) for p in patterns]
            for category, patterns in CLEANING_PATTERNS.items()
        }

    def clean_text(self, text: str) -> str:
        """
        Apply all cleaning operations to text.

        Args:
            text: Raw text to clean

        Returns:
            Cleaned text (empty string for falsy input)
        """
        if not text:
            return ""

        text = self._remove_page_numbers(text)
        text = self._remove_headers_footers(text)
        text = self._remove_toc_patterns(text)
        text = self._normalize_whitespace(text)
        text = self._additional_cleaning(text)

        return text.strip()

    def _strip_category(self, category: str, text: str) -> str:
        """Delete every match of the compiled patterns in *category*."""
        for pattern in self.patterns[category]:
            text = pattern.sub('', text)
        return text

    def _remove_page_numbers(self, text: str) -> str:
        """Remove page numbers."""
        return self._strip_category('page_numbers', text)

    def _remove_headers_footers(self, text: str) -> str:
        """Remove common headers and footers."""
        return self._strip_category('headers_footers', text)

    def _remove_toc_patterns(self, text: str) -> str:
        """Remove table of contents patterns."""
        return self._strip_category('toc_patterns', text)

    def _normalize_whitespace(self, text: str) -> str:
        """Fix excessive whitespace and re-flow wrapped lines.

        Collapses 3+ consecutive line breaks to one blank line, collapses
        runs of spaces/tabs, re-joins words hyphenated across a line
        break, then merges wrapped lines: a non-empty line not ending in
        sentence punctuation is joined (with a space) to the following
        non-empty line.
        """
        text = self._EXCESS_BLANK_LINES.sub('\n\n', text)
        text = self._SPACE_RUNS.sub(' ', text)
        text = self._HYPHEN_LINE_BREAK.sub('', text)

        lines = text.split('\n')
        pieces: List[str] = []
        last = len(lines) - 1
        for i, raw in enumerate(lines):
            stripped = raw.strip()
            # Join with the next line only when both are non-empty and
            # this one does not already end a sentence.
            joinable = (
                stripped
                and i < last
                and lines[i + 1].strip()
                and not stripped.endswith(self._SENTENCE_ENDINGS)
            )
            pieces.append(stripped + (' ' if joinable else '\n'))
        return ''.join(pieces)

    def _additional_cleaning(self, text: str) -> str:
        """Final cleanup pass.

        Removes number-only lines (residual page numbers), drops very
        short non-empty lines (<= 3 chars, usually extraction artifacts),
        and maps smart quotes/dashes to ASCII.
        """
        text = self._NUMBER_ONLY_LINE.sub('\n', text)

        # Keep blank lines (paragraph breaks) and lines with real content.
        kept = [
            line for line in text.split('\n')
            if len(line.strip()) > 3 or line.strip() == ''
        ]
        text = '\n'.join(kept)

        return text.translate(self._PUNCTUATION_MAP)

    def clean_pages(self, pages_data: List[Dict[str, Any]]) -> str:
        """
        Clean text from multiple pages and combine.

        Args:
            pages_data: List of dicts with 'page_number' and 'text'

        Returns:
            Combined cleaned text, pages separated by blank lines
        """
        combined_text: List[str] = []

        for page_data in pages_data:
            page_text = page_data.get('text', '')
            if page_text:
                cleaned = self.clean_text(page_text)
                if cleaned:
                    combined_text.append(cleaned)

        full_text = '\n\n'.join(combined_text)

        # Lazy %-args: the message is only built if INFO is enabled.
        logger.info("Cleaned %d pages into %d characters",
                    len(pages_data), len(full_text))

        return full_text
|
|