agentbee

Sleeping

App Files Files Community

agentbee / src /tools /file_parser.py

mangubee

fix: correct author name formatting in multiple files

e7b4937 3 months ago

raw

history blame contribute delete

11.1 kB

	"""
	File Parser Tool - Multi-format file reading
	Author: @mangubee
	Date: 2026-01-02

	Provides file parsing for:
	- PDF files (.pdf) using PyPDF2
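	- Excel files (.xlsx, .xls) using openpyxl (note: openpyxl only reads the .xlsx family; legacy .xls files are rejected and surface as ValueError)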
	- Word documents (.docx) using python-docx
	- Text files (.txt, .csv) using built-in open()

	All parsers include retry logic and error handling.
	"""

	import logging
	from pathlib import Path
	from typing import Dict, List, Optional

	from tenacity import (
	    retry,
	    retry_if_exception_type,
	    retry_if_not_exception_type,
	    stop_after_attempt,
	    wait_exponential,
	)

	# ============================================================================
	# CONFIG
	# ============================================================================
	# Retry policy shared by every parser's @retry decorator.
	MAX_RETRIES = 3  # attempts passed to stop_after_attempt()
	RETRY_MIN_WAIT = 1  # lower bound of the exponential wait, in seconds
	RETRY_MAX_WAIT = 5  # upper bound of the exponential wait, in seconds

	# Lower-case file suffix -> human-readable label of the format.
	# Insertion order is significant: it is the order in which the supported
	# suffixes are listed in the "Unsupported file type" error message.
	SUPPORTED_EXTENSIONS = {
	    ".pdf": "PDF",
	    ".xlsx": "Excel",
	    ".xls": "Excel",
	    ".docx": "Word",
	    ".txt": "Text",
	    ".csv": "CSV",
	}

	# ============================================================================
	# Logging Setup
	# ============================================================================
	# Module-level logger; handlers and levels are left to the application.
	logger = logging.getLogger(__name__)


	# ============================================================================
	# PDF Parser
	# ============================================================================

	@retry(
	    stop=stop_after_attempt(MAX_RETRIES),
	    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
	    # IOError is an alias of OSError, so OSError alone covers both.
	    # FileNotFoundError is also an OSError subclass; exclude it explicitly so a
	    # missing file fails immediately instead of being retried with back-off.
	    retry=(
	        retry_if_exception_type(OSError)
	        & retry_if_not_exception_type(FileNotFoundError)
	    ),
	    reraise=True,
	)
	def parse_pdf(file_path: str) -> Dict:
	    """
	    Parse a PDF file and extract its text content.

	    Text is extracted page by page with PyPDF2's ``PdfReader``. Pages whose
	    extracted text is empty or whitespace-only are left out of ``content``
	    but are still counted in ``pages``.

	    Args:
	        file_path: Path to PDF file

	    Returns:
	        Dict with structure: {
	            "content": str,    # Extracted text, one "--- Page N ---" section per page
	            "pages": int,      # Total number of pages in the document
	            "file_type": "PDF",
	            "file_path": str   # The path exactly as passed in
	        }

	    Raises:
	        FileNotFoundError: If file doesn't exist (not retried)
	        ValueError: For any other non-I/O failure while parsing, e.g. a
	            corrupted or invalid file (the original error is chained)
	        IOError: For file reading errors (triggers retry)
	    """
	    try:
	        # Imported lazily so the module can be imported without PyPDF2;
	        # an import failure is reported through the generic handler below.
	        from PyPDF2 import PdfReader

	        path = Path(file_path)
	        if not path.exists():
	            raise FileNotFoundError(f"PDF file not found: {file_path}")

	        logger.info("Parsing PDF: %s", file_path)

	        reader = PdfReader(str(path))
	        num_pages = len(reader.pages)

	        # Extract text from all pages (page numbers are 1-based in the output)
	        sections = []
	        for page_num, page in enumerate(reader.pages, 1):
	            # Defensive: treat a None extraction result as empty text
	            text = page.extract_text() or ""
	            if text.strip():
	                sections.append(f"--- Page {page_num} ---\n{text}")

	        full_content = "\n\n".join(sections)

	        logger.info(
	            "PDF parsed successfully: %s pages, %s chars",
	            num_pages,
	            len(full_content),
	        )

	        return {
	            "content": full_content,
	            "pages": num_pages,
	            "file_type": "PDF",
	            "file_path": file_path,
	        }

	    except FileNotFoundError as e:
	        # Must come before the OSError handler: FileNotFoundError is an OSError
	        logger.error("PDF file not found: %s", e)
	        raise
	    except OSError as e:
	        logger.warning("PDF IO error (will retry): %s", e)
	        raise
	    except Exception as e:
	        logger.error("PDF parsing error: %s", e)
	        raise ValueError(f"Failed to parse PDF: {str(e)}") from e


	# ============================================================================
	# Excel Parser
	# ============================================================================

	@retry(
	    stop=stop_after_attempt(MAX_RETRIES),
	    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
	    # IOError is an alias of OSError, so OSError alone covers both.
	    # FileNotFoundError is also an OSError subclass; exclude it explicitly so a
	    # missing file fails immediately instead of being retried with back-off.
	    retry=(
	        retry_if_exception_type(OSError)
	        & retry_if_not_exception_type(FileNotFoundError)
	    ),
	    reraise=True,
	)
	def parse_excel(file_path: str) -> Dict:
	    """
	    Parse an Excel file and extract data from all sheets.

	    The workbook is opened with openpyxl using ``data_only=True``, so cells
	    are read as values rather than formulas. Each non-empty row becomes one
	    tab-separated line; ``None`` cells are rendered as empty strings.

	    NOTE: openpyxl documents support for the .xlsx family only; a legacy
	    .xls file is expected to fail and be reported as ValueError.

	    Args:
	        file_path: Path to Excel file (.xlsx or .xls)

	    Returns:
	        Dict with structure: {
	            "content": str,       # Formatted table data, one "=== Sheet: X ===" section per non-empty sheet
	            "sheets": List[str],  # All sheet names, including empty sheets
	            "file_type": "Excel",
	            "file_path": str      # The path exactly as passed in
	        }

	    Raises:
	        FileNotFoundError: If file doesn't exist (not retried)
	        ValueError: For any other non-I/O failure while parsing, e.g. a
	            corrupted or invalid file (the original error is chained)
	        IOError: For file reading errors (triggers retry)
	    """
	    try:
	        # Imported lazily so the module can be imported without openpyxl;
	        # an import failure is reported through the generic handler below.
	        from openpyxl import load_workbook

	        path = Path(file_path)
	        if not path.exists():
	            raise FileNotFoundError(f"Excel file not found: {file_path}")

	        logger.info("Parsing Excel: %s", file_path)

	        workbook = load_workbook(str(path), data_only=True)
	        sheet_names = workbook.sheetnames

	        # Extract data from all sheets
	        content_parts = []
	        for sheet_name in sheet_names:
	            sheet = workbook[sheet_name]

	            # Keep only rows that contain at least one non-None cell
	            rows = [
	                "\t".join(str(cell) if cell is not None else "" for cell in row)
	                for row in sheet.iter_rows(values_only=True)
	                if any(cell is not None for cell in row)
	            ]

	            if rows:
	                content_parts.append(
	                    f"=== Sheet: {sheet_name} ===\n" + "\n".join(rows)
	                )

	        full_content = "\n\n".join(content_parts)

	        logger.info("Excel parsed successfully: %s sheets", len(sheet_names))

	        return {
	            "content": full_content,
	            "sheets": sheet_names,
	            "file_type": "Excel",
	            "file_path": file_path,
	        }

	    except FileNotFoundError as e:
	        # Must come before the OSError handler: FileNotFoundError is an OSError
	        logger.error("Excel file not found: %s", e)
	        raise
	    except OSError as e:
	        logger.warning("Excel IO error (will retry): %s", e)
	        raise
	    except Exception as e:
	        logger.error("Excel parsing error: %s", e)
	        raise ValueError(f"Failed to parse Excel: {str(e)}") from e


	# ============================================================================
	# Word Document Parser
	# ============================================================================

	@retry(
	    stop=stop_after_attempt(MAX_RETRIES),
	    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
	    # IOError is an alias of OSError, so OSError alone covers both.
	    # FileNotFoundError is also an OSError subclass; exclude it explicitly so a
	    # missing file fails immediately instead of being retried with back-off.
	    retry=(
	        retry_if_exception_type(OSError)
	        & retry_if_not_exception_type(FileNotFoundError)
	    ),
	    reraise=True,
	)
	def parse_word(file_path: str) -> Dict:
	    """
	    Parse a Word document and extract its text content.

	    Only the text of ``doc.paragraphs`` is read; paragraphs that are empty
	    or whitespace-only are skipped and not counted.

	    Args:
	        file_path: Path to Word file (.docx)

	    Returns:
	        Dict with structure: {
	            "content": str,     # Non-empty paragraphs joined by blank lines
	            "paragraphs": int,  # Number of non-empty paragraphs
	            "file_type": "Word",
	            "file_path": str    # The path exactly as passed in
	        }

	    Raises:
	        FileNotFoundError: If file doesn't exist (not retried)
	        ValueError: For any other non-I/O failure while parsing, e.g. a
	            corrupted or invalid file (the original error is chained)
	        IOError: For file reading errors (triggers retry)
	    """
	    try:
	        # Imported lazily so the module can be imported without python-docx;
	        # an import failure is reported through the generic handler below.
	        from docx import Document

	        path = Path(file_path)
	        if not path.exists():
	            raise FileNotFoundError(f"Word file not found: {file_path}")

	        logger.info("Parsing Word document: %s", file_path)

	        doc = Document(str(path))

	        # Extract text from all paragraphs, dropping blank ones
	        paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]
	        full_content = "\n\n".join(paragraphs)

	        logger.info("Word parsed successfully: %s paragraphs", len(paragraphs))

	        return {
	            "content": full_content,
	            "paragraphs": len(paragraphs),
	            "file_type": "Word",
	            "file_path": file_path,
	        }

	    except FileNotFoundError as e:
	        # Must come before the OSError handler: FileNotFoundError is an OSError
	        logger.error("Word file not found: %s", e)
	        raise
	    except OSError as e:
	        logger.warning("Word IO error (will retry): %s", e)
	        raise
	    except Exception as e:
	        logger.error("Word parsing error: %s", e)
	        raise ValueError(f"Failed to parse Word document: {str(e)}") from e


	# ============================================================================
	# Text/CSV Parser
	# ============================================================================

	@retry(
	    stop=stop_after_attempt(MAX_RETRIES),
	    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
	    # IOError is an alias of OSError, so OSError alone covers both.
	    # FileNotFoundError is also an OSError subclass; exclude it explicitly so a
	    # missing file fails immediately instead of being retried with back-off.
	    retry=(
	        retry_if_exception_type(OSError)
	        & retry_if_not_exception_type(FileNotFoundError)
	    ),
	    reraise=True,
	)
	def parse_text(file_path: str) -> Dict:
	    """
	    Parse a plain text or CSV file.

	    The file is read in full as UTF-8 text. CSV content is returned verbatim;
	    it is not split into rows or columns.

	    Args:
	        file_path: Path to text file (.txt or .csv)

	    Returns:
	        Dict with structure: {
	            "content": str,               # Entire file content
	            "lines": int,                 # Line count; 0 for an empty file, and a
	                                          # trailing newline does not add a line
	            "file_type": "Text" or "CSV", # "CSV" when the suffix is .csv (any case)
	            "file_path": str              # The path exactly as passed in
	        }

	    Raises:
	        FileNotFoundError: If file doesn't exist (not retried)
	        ValueError: If the file is not valid UTF-8 (the decode error is chained)
	        IOError: For file reading errors (triggers retry)
	    """
	    try:
	        path = Path(file_path)
	        if not path.exists():
	            raise FileNotFoundError(f"Text file not found: {file_path}")

	        logger.info("Parsing text file: %s", file_path)

	        with open(path, 'r', encoding='utf-8') as f:
	            content = f.read()

	        # Count newline-terminated lines, plus a final unterminated line if
	        # there is one. A plain count('\n') + 1 reports 1 for an empty file
	        # and one line too many when the file ends with a newline.
	        lines = content.count('\n')
	        if content and not content.endswith('\n'):
	            lines += 1

	        # Compare case-insensitively, matching how parse_file dispatches
	        file_type = "CSV" if path.suffix.lower() == '.csv' else "Text"

	        logger.info("%s file parsed successfully: %s lines", file_type, lines)

	        return {
	            "content": content,
	            "lines": lines,
	            "file_type": file_type,
	            "file_path": file_path,
	        }

	    except FileNotFoundError as e:
	        # Must come before the OSError handler: FileNotFoundError is an OSError
	        logger.error("Text file not found: %s", e)
	        raise
	    except OSError as e:
	        logger.warning("Text file IO error (will retry): %s", e)
	        raise
	    except UnicodeDecodeError as e:
	        # UnicodeDecodeError derives from ValueError, not OSError: never retried
	        logger.error("Text file encoding error: %s", e)
	        raise ValueError(f"Failed to decode text file (try UTF-8): {str(e)}") from e


	# ============================================================================
	# Unified File Parser
	# ============================================================================

	def parse_file(file_path: str) -> Dict:
	    """
	    Route a file to the parser that matches its extension.

	    The extension is taken from the path's suffix and compared in lower
	    case, so ".PDF" and ".pdf" are treated alike.

	    Args:
	        file_path: Path to file

	    Returns:
	        Dict produced by the selected parser (content plus metadata)

	    Raises:
	        ValueError: If the extension is not listed in SUPPORTED_EXTENSIONS
	        FileNotFoundError: If file doesn't exist
	        Exception: Whatever else the selected parser raises
	    """
	    extension = Path(file_path).suffix.lower()

	    # Guard clause: reject anything we have no label for
	    if extension not in SUPPORTED_EXTENSIONS:
	        supported = ', '.join(SUPPORTED_EXTENSIONS.keys())
	        raise ValueError(
	            f"Unsupported file type: {extension}. Supported: {supported}"
	        )

	    type_label = SUPPORTED_EXTENSIONS[extension]
	    logger.info(f"Dispatching parser for {type_label} file: {file_path}")

	    # Extension -> parser lookup table (built per call so the parser
	    # functions are resolved at call time)
	    parsers = {
	        '.pdf': parse_pdf,
	        '.xlsx': parse_excel,
	        '.xls': parse_excel,
	        '.docx': parse_word,
	        '.txt': parse_text,
	        '.csv': parse_text,
	    }
	    handler = parsers.get(extension)
	    if handler is None:
	        # Only reachable if SUPPORTED_EXTENSIONS lists a suffix missing above
	        raise ValueError(f"No parser for extension: {extension}")

	    return handler(file_path)