| import os |
| import pandas as pd |
| from langchain_community.document_loaders import PyPDFLoader |
| from langchain.tools import Tool |
| from utils.file_downloader import FileDownloader |
| from dotenv import load_dotenv |
|
|
# Populate os.environ from a local .env file at import time (python-dotenv).
load_dotenv()
|
|
|
|
class DocumentParserTool:
    """A tool for parsing PDF and Excel (.xlsx/.xls) documents.

    Every public method returns a human-readable string — either the
    extracted content or an ``"Error: ..."`` / ``"Warning: ..."`` message.
    The methods deliberately never raise, so their output can be handed
    straight back to an LLM agent as a tool observation.
    """

    def __init__(self):
        """Initialize the DocumentParserTool with a FileDownloader.

        The downloader resolves an input string to a local file path
        (downloading first when the input is a URL) and can report whether
        an input string is a URL.
        """
        self.downloader = FileDownloader()

    def parse_document_from_url_or_path(self, path_or_url: str) -> str:
        """
        Parse a document from URL or file path. Downloads if URL, uses directly if path.

        Args:
            path_or_url (str): URL to download from or file path to use

        Returns:
            str: Provenance header followed by the parsed content, or an
            error message on failure.
        """
        try:
            # Resolve to a local path; downloads the file when given a URL.
            file_path = self.downloader.get_file_path(path_or_url)

            result = self.parse_document(file_path)

            # Prepend provenance so callers can tell where the data came from.
            source_info = f"Source: {'Downloaded from ' + path_or_url if self.downloader.is_url(path_or_url) else 'File at ' + path_or_url}\n"
            source_info += f"Local file path: {file_path}\n\n"

            return source_info + result

        except Exception as e:
            # Tool boundary: always return a string, never propagate.
            return f"Error processing {path_or_url}: {str(e)}"

    def parse_document(self, document_path: str) -> str:
        """
        Parse a document from the given file path.

        Args:
            document_path (str): Path to the document file

        Returns:
            str: Parsed content, or an error message for a missing file,
            an unsupported format, or a parser failure.
        """
        if not os.path.exists(document_path):
            return f"Error: File not found at path: {document_path}"

        try:
            # Dispatch purely on the (case-insensitive) file extension.
            file_extension = os.path.splitext(document_path)[1].lower()

            if file_extension == ".pdf":
                return self._parse_pdf(document_path)
            elif file_extension in (".xlsx", ".xls"):
                return self._parse_excel(document_path)
            else:
                return f"Error: Unsupported file format '{file_extension}'. Supported formats: PDF (.pdf), Excel (.xlsx, .xls)"

        except Exception as e:
            return f"Error parsing document: {str(e)}"

    def _parse_pdf(self, document_path: str) -> str:
        """Parse a PDF document and return its extracted text content."""
        try:
            loader = PyPDFLoader(document_path)
            pages = loader.load_and_split()
            pdf_text = " ".join(page.page_content for page in pages)

            if not pdf_text.strip():
                # Image-only/scanned PDFs yield no extractable text.
                return (
                    "Warning: PDF appears to be empty or contains no extractable text."
                )

            return (
                f"PDF Content (from {os.path.basename(document_path)}):\n\n{pdf_text}"
            )

        except Exception as e:
            return f"Error parsing PDF: {str(e)}"

    def _parse_excel(self, document_path: str) -> str:
        """Parse an Excel workbook and return a structured text summary.

        Reports the sheet names, each sheet's dimensions and column headers,
        a small sample of rows, and summary statistics for numeric columns.
        """
        try:
            # Context manager guarantees the workbook's file handle is closed
            # (the previous version leaked the open pd.ExcelFile).
            with pd.ExcelFile(document_path) as excel_file:
                sheet_names = excel_file.sheet_names

                if not sheet_names:
                    return "Warning: Excel file contains no sheets."

                parsed_content = (
                    f"Excel Content (from {os.path.basename(document_path)}):\n\n"
                )
                parsed_content += f"Number of sheets: {len(sheet_names)}\n"
                parsed_content += f"Sheet names: {', '.join(sheet_names)}\n\n"

                for sheet_name in sheet_names:
                    try:
                        # Parse from the already-open workbook instead of
                        # re-opening the file once per sheet.
                        df = excel_file.parse(sheet_name)

                        parsed_content += f"--- Sheet: {sheet_name} ---\n"
                        parsed_content += (
                            f"Dimensions: {df.shape[0]} rows × {df.shape[1]} columns\n"
                        )

                        if df.empty:
                            parsed_content += "Sheet is empty.\n\n"
                            continue

                        parsed_content += (
                            f"Columns: {', '.join(df.columns.astype(str))}\n\n"
                        )

                        # Preview at most the first five rows.
                        sample_rows = min(5, len(df))
                        parsed_content += f"Sample data (first {sample_rows} rows):\n"
                        parsed_content += df.head(sample_rows).to_string(index=False)
                        parsed_content += "\n\n"

                        # describe() is only meaningful for numeric columns.
                        numeric_cols = df.select_dtypes(include=["number"]).columns
                        if not numeric_cols.empty:
                            parsed_content += "Summary statistics for numeric columns:\n"
                            parsed_content += df[numeric_cols].describe().to_string()
                            parsed_content += "\n\n"

                    except Exception as sheet_error:
                        # One broken sheet should not abort the whole workbook.
                        parsed_content += (
                            f"Error reading sheet '{sheet_name}': {str(sheet_error)}\n\n"
                        )

            return parsed_content

        except Exception as e:
            return f"Error parsing Excel file: {str(e)}"
|
|
|
|
| |
# Single shared parser instance that backs both Tool wrappers below.
document_parser_tool_instance = DocumentParserTool()

# Tool for documents that are already present on the local filesystem.
document_parser_tool = Tool(
    name="document_parser",
    func=document_parser_tool_instance.parse_document,
    description=(
        "Parse PDF and Excel (.xlsx, .xls) documents to extract their content. "
        "For PDFs, extracts all text content. For Excel files, provides structured data "
        "including sheet names, dimensions, column headers, sample data, and summary statistics. "
        "Input should be a file path to the document."
    ),
)

# Tool that additionally accepts URLs, downloading the file before parsing.
document_parser_url_tool = Tool(
    name="document_parser_url",
    func=document_parser_tool_instance.parse_document_from_url_or_path,
    description=(
        "Parse PDF and Excel (.xlsx, .xls) documents from URLs or file paths. "
        "If URL is provided, downloads the file first. If file path is provided, uses it directly. "
        "For PDFs, extracts all text content. For Excel files, provides structured data "
        "including sheet names, dimensions, column headers, sample data, and summary statistics. "
        "Input can be either a URL (http/https) or a local file path."
    ),
)
|
|
if __name__ == "__main__":
    print("Start testing document parser tool with file downloader integration")

    from utils.agent_executor import create_agent_executor

    downloader = FileDownloader()

    # Sources to exercise: URLs are downloaded first, plain paths used as-is.
    test_files = [
        "https://arxiv.org/pdf/2501.00147",
    ]

    # Track only files we downloaded ourselves, so cleanup never touches
    # pre-existing local files.
    downloaded_files = []

    # The agent setup is loop-invariant, so build it once up front instead of
    # once per test input.
    tools = [document_parser_url_tool]
    agent_executor = create_agent_executor(tools=tools)

    for test_input in test_files:
        print(f"\n--- Processing: {test_input} ---")

        try:
            file_path = downloader.get_file_path(test_input)
            print(f"Using file path: {file_path}")

            if downloader.is_url(test_input):
                downloaded_files.append(file_path)

            # Direct tool call first; truncate long output for readability.
            result = document_parser_tool_instance.parse_document(file_path)
            preview = result[:500] + "..." if len(result) > 500 else result
            print(f"Parse result preview: {preview}")

            prompt_with_input = f"""Please analyze the document from this source: {test_input}

Use the document_parser_url tool to download (if URL) and analyze the content.
Provide a comprehensive summary of what you find in the document.

The tool will handle both URLs (by downloading) and file paths (by using directly)."""

            print("\n--- Testing with Agent Executor (URL-capable tool) ---")
            response = agent_executor.invoke({"input": prompt_with_input})
            print("Agent Response:")
            print(response["output"])

        except Exception as e:
            # Keep going: one bad input should not stop the remaining tests.
            print(f"Error processing {test_input}: {str(e)}")

    # Best-effort cleanup of downloaded artifacts.
    print("\n--- Cleanup ---")
    for file_path in downloaded_files:
        try:
            downloader.delete_file(file_path)
        except Exception as e:
            print(f"Warning: Could not delete {file_path}: {e}")

    print(f"Final downloader state: {repr(downloader)}")
|
|