| import os |
| import pandas as pd |
| from langchain_community.document_loaders import PyPDFLoader |
| from langchain.tools import Tool |
| from utils.file_downloader import FileDownloader |
| from dotenv import load_dotenv |
|
|
# Populate os.environ from a local .env file at import time (python-dotenv).
load_dotenv()
|
|
|
|
class DocumentParserTool:
    """A tool for parsing PDF and Excel (.xlsx/.xls) documents.

    Every public method returns a human-readable string — either the
    extracted content or an ``"Error: ..."`` / ``"Warning: ..."`` message.
    The methods deliberately never raise, so their output can be handed
    straight back to an LLM agent as a tool observation.
    """

    def __init__(self):
        """Initialize the DocumentParserTool with a FileDownloader.

        The downloader resolves an input string to a local file path
        (downloading first when the input is a URL) and can report whether
        an input string is a URL.
        """
        self.downloader = FileDownloader()

    def parse_document_from_url_or_path(self, path_or_url: str) -> str:
        """
        Parse a document from URL or file path. Downloads if URL, uses directly if path.

        Args:
            path_or_url (str): URL to download from or file path to use

        Returns:
            str: Provenance header followed by the parsed content, or an
            error message on failure.
        """
        try:
            # Resolve to a local path; downloads the file when given a URL.
            file_path = self.downloader.get_file_path(path_or_url)

            result = self.parse_document(file_path)

            # Prepend provenance so callers can tell where the data came from.
            source_info = f"Source: {'Downloaded from ' + path_or_url if self.downloader.is_url(path_or_url) else 'File at ' + path_or_url}\n"
            source_info += f"Local file path: {file_path}\n\n"

            return source_info + result

        except Exception as e:
            # Tool boundary: always return a string, never propagate.
            return f"Error processing {path_or_url}: {str(e)}"

    def parse_document(self, document_path: str) -> str:
        """
        Parse a document from the given file path.

        Args:
            document_path (str): Path to the document file

        Returns:
            str: Parsed content, or an error message for a missing file,
            an unsupported format, or a parser failure.
        """
        if not os.path.exists(document_path):
            return f"Error: File not found at path: {document_path}"

        try:
            # Dispatch purely on the (case-insensitive) file extension.
            file_extension = os.path.splitext(document_path)[1].lower()

            if file_extension == ".pdf":
                return self._parse_pdf(document_path)
            elif file_extension in (".xlsx", ".xls"):
                return self._parse_excel(document_path)
            else:
                return f"Error: Unsupported file format '{file_extension}'. Supported formats: PDF (.pdf), Excel (.xlsx, .xls)"

        except Exception as e:
            return f"Error parsing document: {str(e)}"

    def _parse_pdf(self, document_path: str) -> str:
        """Parse a PDF document and return its extracted text content."""
        try:
            loader = PyPDFLoader(document_path)
            pages = loader.load_and_split()
            pdf_text = " ".join(page.page_content for page in pages)

            if not pdf_text.strip():
                # Image-only/scanned PDFs yield no extractable text.
                return (
                    "Warning: PDF appears to be empty or contains no extractable text."
                )

            return (
                f"PDF Content (from {os.path.basename(document_path)}):\n\n{pdf_text}"
            )

        except Exception as e:
            return f"Error parsing PDF: {str(e)}"

    def _parse_excel(self, document_path: str) -> str:
        """Parse an Excel workbook and return a structured text summary.

        Reports the sheet names, each sheet's dimensions and column headers,
        a small sample of rows, and summary statistics for numeric columns.
        """
        try:
            # Context manager guarantees the workbook's file handle is closed
            # (the previous version leaked the open pd.ExcelFile).
            with pd.ExcelFile(document_path) as excel_file:
                sheet_names = excel_file.sheet_names

                if not sheet_names:
                    return "Warning: Excel file contains no sheets."

                parsed_content = (
                    f"Excel Content (from {os.path.basename(document_path)}):\n\n"
                )
                parsed_content += f"Number of sheets: {len(sheet_names)}\n"
                parsed_content += f"Sheet names: {', '.join(sheet_names)}\n\n"

                for sheet_name in sheet_names:
                    try:
                        # Parse from the already-open workbook instead of
                        # re-opening the file once per sheet.
                        df = excel_file.parse(sheet_name)

                        parsed_content += f"--- Sheet: {sheet_name} ---\n"
                        parsed_content += (
                            f"Dimensions: {df.shape[0]} rows × {df.shape[1]} columns\n"
                        )

                        if df.empty:
                            parsed_content += "Sheet is empty.\n\n"
                            continue

                        parsed_content += (
                            f"Columns: {', '.join(df.columns.astype(str))}\n\n"
                        )

                        # Preview at most the first five rows.
                        sample_rows = min(5, len(df))
                        parsed_content += f"Sample data (first {sample_rows} rows):\n"
                        parsed_content += df.head(sample_rows).to_string(index=False)
                        parsed_content += "\n\n"

                        # describe() is only meaningful for numeric columns.
                        numeric_cols = df.select_dtypes(include=["number"]).columns
                        if not numeric_cols.empty:
                            parsed_content += "Summary statistics for numeric columns:\n"
                            parsed_content += df[numeric_cols].describe().to_string()
                            parsed_content += "\n\n"

                    except Exception as sheet_error:
                        # One broken sheet should not abort the whole workbook.
                        parsed_content += (
                            f"Error reading sheet '{sheet_name}': {str(sheet_error)}\n\n"
                        )

            return parsed_content

        except Exception as e:
            return f"Error parsing Excel file: {str(e)}"
|
|
|
|
| |
# Single shared parser instance that backs both Tool wrappers below.
document_parser_tool_instance = DocumentParserTool()

# Tool for documents that are already present on the local filesystem.
document_parser_tool = Tool(
    name="document_parser",
    func=document_parser_tool_instance.parse_document,
    description=(
        "Parse PDF and Excel (.xlsx, .xls) documents to extract their content. "
        "For PDFs, extracts all text content. For Excel files, provides structured data "
        "including sheet names, dimensions, column headers, sample data, and summary statistics. "
        "Input should be a file path to the document."
    ),
)

# Tool that additionally accepts URLs, downloading the file before parsing.
document_parser_url_tool = Tool(
    name="document_parser_url",
    func=document_parser_tool_instance.parse_document_from_url_or_path,
    description=(
        "Parse PDF and Excel (.xlsx, .xls) documents from URLs or file paths. "
        "If URL is provided, downloads the file first. If file path is provided, uses it directly. "
        "For PDFs, extracts all text content. For Excel files, provides structured data "
        "including sheet names, dimensions, column headers, sample data, and summary statistics. "
        "Input can be either a URL (http/https) or a local file path."
    ),
)
|
|
if __name__ == "__main__":
    print("Start testing document parser tool with file downloader integration")

    from utils.agent_executor import create_agent_executor

    downloader = FileDownloader()

    # Sources to exercise: URLs are downloaded first, plain paths used as-is.
    test_files = [
        "https://arxiv.org/pdf/2501.00147",
    ]

    # Track only files we downloaded ourselves, so cleanup never touches
    # pre-existing local files.
    downloaded_files = []

    # The agent setup is loop-invariant, so build it once up front instead of
    # once per test input.
    tools = [document_parser_url_tool]
    agent_executor = create_agent_executor(tools=tools)

    for test_input in test_files:
        print(f"\n--- Processing: {test_input} ---")

        try:
            file_path = downloader.get_file_path(test_input)
            print(f"Using file path: {file_path}")

            if downloader.is_url(test_input):
                downloaded_files.append(file_path)

            # Direct tool call first; truncate long output for readability.
            result = document_parser_tool_instance.parse_document(file_path)
            preview = result[:500] + "..." if len(result) > 500 else result
            print(f"Parse result preview: {preview}")

            prompt_with_input = f"""Please analyze the document from this source: {test_input}

Use the document_parser_url tool to download (if URL) and analyze the content.
Provide a comprehensive summary of what you find in the document.

The tool will handle both URLs (by downloading) and file paths (by using directly)."""

            print("\n--- Testing with Agent Executor (URL-capable tool) ---")
            response = agent_executor.invoke({"input": prompt_with_input})
            print("Agent Response:")
            print(response["output"])

        except Exception as e:
            # Keep going: one bad input should not stop the remaining tests.
            print(f"Error processing {test_input}: {str(e)}")

    # Best-effort cleanup of downloaded artifacts.
    print("\n--- Cleanup ---")
    for file_path in downloaded_files:
        try:
            downloader.delete_file(file_path)
        except Exception as e:
            print(f"Warning: Could not delete {file_path}: {e}")

    print(f"Final downloader state: {repr(downloader)}")
|
|