| """ |
| File Parser Tool - Multi-format file reading |
| Author: @mangubee |
| Date: 2026-01-02 |
| |
| Provides file parsing for: |
| - PDF files (.pdf) using PyPDF2 |
| - Excel files (.xlsx) using openpyxl; legacy .xls is also routed to openpyxl, which cannot read that format, so it fails with ValueError |
| - Word documents (.docx) using python-docx |
| - Text files (.txt, .csv) using built-in open() |
| |
| All parsers include retry logic and error handling. |
| """ |
|
|
import logging
from pathlib import Path
from typing import Dict, List, Optional

from tenacity import (
    retry,
    retry_if_exception_type,
    retry_if_not_exception_type,
    stop_after_attempt,
    wait_exponential,
)
|
|
| |
| |
| |
# Retry policy shared by every parser's @retry decorator.
MAX_RETRIES: int = 3      # total attempts before the last error is re-raised
RETRY_MIN_WAIT: int = 1   # lower bound passed to wait_exponential(min=...)
RETRY_MAX_WAIT: int = 5   # upper bound passed to wait_exponential(max=...)

# Lower-case file suffix -> human-readable type label used in log messages.
# Insertion order is significant: it is the order in which the suffixes are
# listed in the "Unsupported file type" error message.
SUPPORTED_EXTENSIONS: Dict[str, str] = {
    ".pdf": "PDF",
    ".xlsx": "Excel",
    ".xls": "Excel",
    ".docx": "Word",
    ".txt": "Text",
    ".csv": "CSV",
}

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
| |
| |
| |
|
|
@retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
    # FileNotFoundError is a subclass of OSError. Exclude it explicitly so a
    # missing file fails immediately instead of sleeping through every retry.
    retry=(
        retry_if_exception_type((IOError, OSError))
        & retry_if_not_exception_type(FileNotFoundError)
    ),
    reraise=True,
)
def parse_pdf(file_path: str) -> Dict:
    """
    Parse PDF file and extract text content.

    Pages whose extracted text is empty or whitespace-only are omitted from
    "content"; "pages" still reports the total page count of the document.

    Args:
        file_path: Path to PDF file

    Returns:
        Dict with structure: {
            "content": str,      # Extracted text, one "--- Page N ---" section per page
            "pages": int,        # Number of pages
            "file_type": "PDF",
            "file_path": str     # The path exactly as passed in
        }

    Raises:
        FileNotFoundError: If file doesn't exist (not retried)
        ValueError: If file is corrupted or invalid. Any other non-OSError
            failure inside the function is also reported this way, including
            a failed PyPDF2 import, because the import happens inside the try.
        IOError: For file reading errors (triggers retry)
    """
    try:
        from PyPDF2 import PdfReader

        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"PDF file not found: {file_path}")

        logger.info(f"Parsing PDF: {file_path}")

        reader = PdfReader(str(path))
        num_pages = len(reader.pages)

        content = []
        for page_num, page in enumerate(reader.pages, 1):
            # Treat a missing extraction result as empty text so one image-only
            # page cannot fail the whole document.
            # NOTE(review): presumably only some PyPDF2 releases can return None here.
            text = page.extract_text() or ""
            if text.strip():
                content.append(f"--- Page {page_num} ---\n{text}")

        full_content = "\n\n".join(content)

        logger.info(f"PDF parsed successfully: {num_pages} pages, {len(full_content)} chars")

        return {
            "content": full_content,
            "pages": num_pages,
            "file_type": "PDF",
            "file_path": file_path,
        }

    except FileNotFoundError as e:
        logger.error(f"PDF file not found: {e}")
        raise
    except (IOError, OSError) as e:
        logger.warning(f"PDF IO error (will retry): {e}")
        raise
    except Exception as e:
        logger.error(f"PDF parsing error: {e}")
        # Chain the cause so the original traceback is preserved for debugging.
        raise ValueError(f"Failed to parse PDF: {str(e)}") from e
|
|
|
|
| |
| |
| |
|
|
@retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
    # FileNotFoundError is a subclass of OSError. Exclude it explicitly so a
    # missing file fails immediately instead of sleeping through every retry.
    retry=(
        retry_if_exception_type((IOError, OSError))
        & retry_if_not_exception_type(FileNotFoundError)
    ),
    reraise=True,
)
def parse_excel(file_path: str) -> Dict:
    """
    Parse Excel file and extract data from all sheets.

    Each non-empty row becomes one tab-separated line (None cells rendered as
    empty strings); rows whose cells are all None are skipped, as are sheets
    that end up with no rows. The workbook is opened with data_only=True.

    Args:
        file_path: Path to Excel file (.xlsx or .xls)

    Returns:
        Dict with structure: {
            "content": str,       # "=== Sheet: <name> ===" sections of tab-separated rows
            "sheets": List[str],  # All sheet names, including sheets with no rows
            "file_type": "Excel",
            "file_path": str      # The path exactly as passed in
        }

    Raises:
        FileNotFoundError: If file doesn't exist (not retried)
        ValueError: If file is corrupted or invalid. Any other non-OSError
            failure inside the function is also reported this way, including
            a failed openpyxl import, because the import happens inside the try.
        IOError: For file reading errors (triggers retry)
    """
    try:
        from openpyxl import load_workbook

        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"Excel file not found: {file_path}")

        logger.info(f"Parsing Excel: {file_path}")

        workbook = load_workbook(str(path), data_only=True)
        sheet_names = workbook.sheetnames

        content_parts = []
        for sheet_name in sheet_names:
            sheet = workbook[sheet_name]

            # sheetnames also lists chart-only sheets, which hold no cell grid
            # and expose no iter_rows(); skip them instead of failing the file.
            if not hasattr(sheet, "iter_rows"):
                continue

            rows = [
                "\t".join("" if cell is None else str(cell) for cell in row)
                for row in sheet.iter_rows(values_only=True)
                if any(cell is not None for cell in row)
            ]

            if rows:
                sheet_content = f"=== Sheet: {sheet_name} ===\n" + "\n".join(rows)
                content_parts.append(sheet_content)

        full_content = "\n\n".join(content_parts)

        logger.info(f"Excel parsed successfully: {len(sheet_names)} sheets")

        return {
            "content": full_content,
            "sheets": sheet_names,
            "file_type": "Excel",
            "file_path": file_path,
        }

    except FileNotFoundError as e:
        logger.error(f"Excel file not found: {e}")
        raise
    except (IOError, OSError) as e:
        logger.warning(f"Excel IO error (will retry): {e}")
        raise
    except Exception as e:
        logger.error(f"Excel parsing error: {e}")
        # Chain the cause so the original traceback is preserved for debugging.
        raise ValueError(f"Failed to parse Excel: {str(e)}") from e
|
|
|
|
| |
| |
| |
|
|
@retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
    # FileNotFoundError is a subclass of OSError. Exclude it explicitly so a
    # missing file fails immediately instead of sleeping through every retry.
    retry=(
        retry_if_exception_type((IOError, OSError))
        & retry_if_not_exception_type(FileNotFoundError)
    ),
    reraise=True,
)
def parse_word(file_path: str) -> Dict:
    """
    Parse Word document and extract text content.

    Only `doc.paragraphs` is read; paragraphs whose text is empty or
    whitespace-only are dropped and the rest are joined with blank lines.

    Args:
        file_path: Path to Word file (.docx)

    Returns:
        Dict with structure: {
            "content": str,      # Extracted text
            "paragraphs": int,   # Number of non-empty paragraphs
            "file_type": "Word",
            "file_path": str     # The path exactly as passed in
        }

    Raises:
        FileNotFoundError: If file doesn't exist (not retried)
        ValueError: If file is corrupted or invalid. Any other non-OSError
            failure inside the function is also reported this way, including
            a failed python-docx import, because the import happens inside the try.
        IOError: For file reading errors (triggers retry)
    """
    try:
        from docx import Document

        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"Word file not found: {file_path}")

        logger.info(f"Parsing Word document: {file_path}")

        doc = Document(str(path))

        paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]
        full_content = "\n\n".join(paragraphs)

        logger.info(f"Word parsed successfully: {len(paragraphs)} paragraphs")

        return {
            "content": full_content,
            "paragraphs": len(paragraphs),
            "file_type": "Word",
            "file_path": file_path,
        }

    except FileNotFoundError as e:
        logger.error(f"Word file not found: {e}")
        raise
    except (IOError, OSError) as e:
        logger.warning(f"Word IO error (will retry): {e}")
        raise
    except Exception as e:
        logger.error(f"Word parsing error: {e}")
        # Chain the cause so the original traceback is preserved for debugging.
        raise ValueError(f"Failed to parse Word document: {str(e)}") from e
|
|
|
|
| |
| |
| |
|
|
@retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
    # FileNotFoundError is a subclass of OSError. Exclude it explicitly so a
    # missing file fails immediately instead of sleeping through every retry.
    retry=(
        retry_if_exception_type((IOError, OSError))
        & retry_if_not_exception_type(FileNotFoundError)
    ),
    reraise=True,
)
def parse_text(file_path: str) -> Dict:
    """
    Parse plain text or CSV file.

    The file is read in full as UTF-8 and returned unmodified; CSV content is
    not split into rows or columns.

    Args:
        file_path: Path to text file (.txt or .csv)

    Returns:
        Dict with structure: {
            "content": str,              # Entire file contents
            "lines": int,                # Number of lines (0 for an empty file)
            "file_type": "Text" or "CSV",
            "file_path": str             # The path exactly as passed in
        }

    Raises:
        FileNotFoundError: If file doesn't exist (not retried)
        ValueError: If the file is not valid UTF-8
        IOError: For file reading errors (triggers retry)
    """
    try:
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"Text file not found: {file_path}")

        logger.info(f"Parsing text file: {file_path}")

        with open(path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Count newline-terminated lines, plus one more for a final line that
        # has no trailing newline. A plain count('\n') + 1 reports 1 for an
        # empty file and over-counts by one when the file ends with '\n'.
        lines = content.count('\n')
        if content and not content.endswith('\n'):
            lines += 1

        # Compare case-insensitively, matching how parse_file lower-cases the
        # suffix before dispatching; otherwise "DATA.CSV" is labelled "Text".
        file_type = "CSV" if path.suffix.lower() == '.csv' else "Text"

        logger.info(f"{file_type} file parsed successfully: {lines} lines")

        return {
            "content": content,
            "lines": lines,
            "file_type": file_type,
            "file_path": file_path,
        }

    except FileNotFoundError as e:
        logger.error(f"Text file not found: {e}")
        raise
    except (IOError, OSError) as e:
        logger.warning(f"Text file IO error (will retry): {e}")
        raise
    except UnicodeDecodeError as e:
        # UnicodeDecodeError is a ValueError subclass, not an OSError, so it is
        # not caught by the clause above and is never retried.
        logger.error(f"Text file encoding error: {e}")
        raise ValueError(f"Failed to decode text file (try UTF-8): {str(e)}") from e
|
|
|
|
| |
| |
| |
|
|
def parse_file(file_path: str) -> Dict:
    """
    Route a file to the parser that matches its (case-insensitive) extension.

    Args:
        file_path: Path to file

    Returns:
        Dict with parsed content and metadata, exactly as produced by the
        selected parser

    Raises:
        ValueError: If file type is not supported
        FileNotFoundError: If file doesn't exist
        Exception: For parsing errors
    """
    extension = Path(file_path).suffix.lower()

    # Guard clause: reject anything outside the supported table up front.
    if extension not in SUPPORTED_EXTENSIONS:
        supported = ', '.join(SUPPORTED_EXTENSIONS.keys())
        raise ValueError(
            f"Unsupported file type: {extension}. "
            f"Supported: {supported}"
        )

    type_label = SUPPORTED_EXTENSIONS[extension]
    logger.info(f"Dispatching parser for {type_label} file: {file_path}")

    # Suffix -> parser lookup table.
    parsers = {
        '.pdf': parse_pdf,
        '.xlsx': parse_excel,
        '.xls': parse_excel,
        '.docx': parse_word,
        '.txt': parse_text,
        '.csv': parse_text,
    }

    parser = parsers.get(extension)
    if parser is None:
        # Only reachable if SUPPORTED_EXTENSIONS lists a suffix with no parser here.
        raise ValueError(f"No parser for extension: {extension}")

    return parser(file_path)
|
|