| """ |
| PDF text extraction module |
| Handles extraction from legal PDF documents |
| """ |
|
|
import logging
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
|
|
| try: |
| import pdfplumber |
| PDFPLUMBER_AVAILABLE = True |
| except ImportError: |
| PDFPLUMBER_AVAILABLE = False |
|
|
| try: |
| from PyPDF2 import PdfReader |
| PYPDF2_AVAILABLE = True |
| except ImportError: |
| PYPDF2_AVAILABLE = False |
|
|
| from .config import PDF_EXTRACTION_METHOD, PDF_FALLBACK_METHOD |
|
|
| logger = logging.getLogger(__name__) |
|
|
|
|
class PDFExtractor:
    """Extract per-page text from PDF files.

    Two backends are supported, pdfplumber and PyPDF2. Whichever one is
    selected, the other is used as a fallback when it is installed.
    """

    def __init__(self, method: str = PDF_EXTRACTION_METHOD):
        """
        Initialize PDF extractor

        Args:
            method: Extraction method ('pdfplumber' or 'pypdf2'). Any other
                value is handled by the PyPDF2 code path.

        Raises:
            ImportError: If neither pdfplumber nor PyPDF2 is installed.
        """
        self.method = method
        self._validate_dependencies()

    def _validate_dependencies(self) -> None:
        """Check that a backend is importable, switching ``self.method`` if needed.

        Raises:
            ImportError: If neither pdfplumber nor PyPDF2 is installed.
        """
        if not (PDFPLUMBER_AVAILABLE or PYPDF2_AVAILABLE):
            raise ImportError(
                "No PDF extraction library available. Install pdfplumber or PyPDF2"
            )

        if self.method == "pdfplumber":
            if not PDFPLUMBER_AVAILABLE:
                logger.warning("pdfplumber not available, falling back to PyPDF2")
                self.method = "pypdf2"
        elif not PYPDF2_AVAILABLE:
            # Every non-pdfplumber method runs through PyPDF2, so without it
            # the only usable backend is pdfplumber (known to be installed
            # because of the guard above).
            logger.warning("PyPDF2 not available, falling back to pdfplumber")
            self.method = "pdfplumber"

    def extract_from_file(self, pdf_path: Path) -> List[Dict[str, Any]]:
        """
        Extract text from PDF file

        The configured backend is tried first. If it raises and the other
        backend is installed, the other backend is tried once.

        Args:
            pdf_path: Path to PDF file (a plain string is accepted too)

        Returns:
            List of dicts with 'page_number' and 'text' keys. Pages that
            yield no text are omitted.

        Raises:
            Exception: Whatever the last attempted backend raised.
        """
        pdf_path = Path(pdf_path)
        logger.info("Extracting text from %s using %s", pdf_path.name, self.method)

        if self.method == "pdfplumber":
            primary = self._extract_with_pdfplumber
            fallback = self._extract_with_pypdf2
            fallback_label = "PyPDF2"
            fallback_available = PYPDF2_AVAILABLE
        else:
            primary = self._extract_with_pypdf2
            fallback = self._extract_with_pdfplumber
            fallback_label = "pdfplumber"
            fallback_available = PDFPLUMBER_AVAILABLE

        try:
            return primary(pdf_path)
        except Exception as e:
            # Deliberately broad: the catch exists so that any failure of the
            # primary backend gives the alternative one a chance.
            logger.error("Extraction failed with %s: %s", self.method, e)
            if not fallback_available:
                raise
            logger.info("Trying fallback method: %s", fallback_label)
            return fallback(pdf_path)

    @staticmethod
    def _collect_pages(pages, source_name: str) -> List[Dict[str, Any]]:
        """Build the page-data list shared by both backends.

        Args:
            pages: Iterable of page objects exposing ``extract_text()``.
            source_name: File name used in the summary log message.

        Returns:
            One dict per page that produced text, with a 1-based
            'page_number' and the extracted 'text'.
        """
        pages_data = []
        for page_num, page in enumerate(pages, start=1):
            text = page.extract_text()
            if text:
                pages_data.append({
                    'page_number': page_num,
                    'text': text,
                })
            else:
                logger.warning("No text extracted from page %s", page_num)

        logger.info("Extracted %s pages from %s", len(pages_data), source_name)
        return pages_data

    def _extract_with_pdfplumber(self, pdf_path: Path) -> List[Dict[str, Any]]:
        """Extract using pdfplumber (better for complex layouts)"""
        # Pages are read while the document is still open.
        with pdfplumber.open(pdf_path) as pdf:
            return self._collect_pages(pdf.pages, pdf_path.name)

    def _extract_with_pypdf2(self, pdf_path: Path) -> List[Dict[str, Any]]:
        """Extract using PyPDF2 (fallback method)"""
        # The file handle must stay open while pages are being read.
        with open(pdf_path, 'rb') as file:
            pdf_reader = PdfReader(file)
            return self._collect_pages(pdf_reader.pages, pdf_path.name)

    def extract_from_directory(self, directory: Path) -> Dict[str, List[Dict[str, Any]]]:
        """
        Extract text from all PDFs in a directory

        Only entries directly inside ``directory`` matching ``*.pdf`` are
        processed, in sorted order. A file that fails to extract is logged
        and mapped to an empty list; it does not abort the run.

        Args:
            directory: Path to directory containing PDFs (a plain string is
                accepted too)

        Returns:
            Dict mapping filename to list of page data
        """
        directory = Path(directory)
        results: Dict[str, List[Dict[str, Any]]] = {}
        # Sorted so processing order does not depend on filesystem order.
        pdf_files = sorted(directory.glob("*.pdf"))

        logger.info("Found %s PDF files in %s", len(pdf_files), directory)

        for pdf_file in pdf_files:
            try:
                results[pdf_file.name] = self.extract_from_file(pdf_file)
            except Exception as e:
                # Best effort: record the failure and continue with the rest.
                logger.error("Failed to extract %s: %s", pdf_file.name, e)
                results[pdf_file.name] = []

        return results
|
|