Stardust00 commited on
Commit
0335261
·
1 Parent(s): 71ddb0d

setup tools

Browse files
pyproject.toml CHANGED
@@ -8,12 +8,14 @@ dependencies = [
8
  "gradio[oauth]>=4.0.0",
9
  "requests>=2.25.0",
10
  "pandas>=1.3.0",
 
11
  "python-dotenv>=1.0.0",
12
  "langchain>=0.1.0",
13
  "langchain-community>=0.0.20",
14
  "langchain-core>=0.1.0",
15
  "langchain-openai>=0.0.5",
16
  "langchain-google-community>=1.0.0",
 
17
  ]
18
 
19
  [project.optional-dependencies]
 
8
  "gradio[oauth]>=4.0.0",
9
  "requests>=2.25.0",
10
  "pandas>=1.3.0",
11
+ "pypdf>=5.6.0",
12
  "python-dotenv>=1.0.0",
13
  "langchain>=0.1.0",
14
  "langchain-community>=0.0.20",
15
  "langchain-core>=0.1.0",
16
  "langchain-openai>=0.0.5",
17
  "langchain-google-community>=1.0.0",
18
+ "openpyxl>=3.0.0",
19
  ]
20
 
21
  [project.optional-dependencies]
tools.py DELETED
@@ -1,114 +0,0 @@
1
- import os
2
- from langchain.agents import AgentExecutor, create_react_agent
3
- from langchain_google_community import GoogleSearchRun, GoogleSearchAPIWrapper
4
- from langchain_core.prompts import PromptTemplate
5
- from langchain_openai import ChatOpenAI # Or any other LangChain compatible LLM
6
- from langchain.tools import Tool
7
- from dotenv import load_dotenv
8
- import pandas as pd
9
- import json
10
-
11
- load_dotenv()
12
-
13
- def analyze_file_content(file_path: str) -> str:
14
- """
15
- Analyze file content and provide information about the file.
16
- """
17
- if not os.path.exists(file_path):
18
- return f"File not found: {file_path}"
19
-
20
- try:
21
- file_size = os.path.getsize(file_path)
22
- file_extension = os.path.splitext(file_path)[1].lower()
23
-
24
- # Handle different file types
25
- if file_extension == '.csv':
26
- df = pd.read_csv(file_path)
27
- return f"CSV file with {len(df)} rows and {len(df.columns)} columns. Columns: {list(df.columns)[:10]}. First few rows:\n{df.head().to_string()}"
28
-
29
- elif file_extension == '.json':
30
- with open(file_path, 'r', encoding='utf-8') as f:
31
- data = json.load(f)
32
- return f"JSON file. Keys: {list(data.keys()) if isinstance(data, dict) else 'Array with ' + str(len(data)) + ' items'}"
33
-
34
- elif file_extension in ['.txt', '.md', '.py', '.js', '.html', '.css']:
35
- with open(file_path, 'r', encoding='utf-8') as f:
36
- content = f.read()
37
- return f"Text file ({file_extension}) with {len(content)} characters. Content preview:\n{content[:500]}..."
38
-
39
- elif file_extension in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp']:
40
- return f"Image file ({file_extension}) - {file_size} bytes. Use vision capabilities to analyze this image."
41
-
42
- else:
43
- return f"File: {file_path} ({file_extension}) - {file_size} bytes. Binary or unknown format."
44
-
45
- except Exception as e:
46
- return f"Error analyzing file {file_path}: {str(e)}"
47
-
48
- # 1. Initialize the Tools
49
- # Google Search Tool
50
- search_wrapper = GoogleSearchAPIWrapper()
51
- search_tool = GoogleSearchRun(api_wrapper=search_wrapper)
52
-
53
- # File Analysis Tool
54
- file_analysis_tool = Tool(
55
- name="file_analyzer",
56
- description="Analyze the content of files including CSV, JSON, text files, and images. Input should be a file path.",
57
- func=analyze_file_content
58
- )
59
-
60
- tools = [search_tool, file_analysis_tool]
61
-
62
- # 2. Create a simple prompt template for an agent
63
- template = """
64
- You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
65
-
66
- Answer the following questions as best you can. You have access to the following tools:
67
-
68
- {tools}
69
-
70
- Use the following format:
71
-
72
- Question: the input question you must answer
73
- Thought: you should always think about what to do
74
- Action: the action to take, should be one of [{tool_names}]
75
- Action Input: the input to the action
76
- Observation: the result of the action
77
- ... (this Thought/Action/Action Input/Observation can repeat N times)
78
- Thought: I now know the final answer
79
- Final Answer: the final answer to the original input question
80
-
81
- Begin!
82
-
83
- Question: {input}
84
- {agent_scratchpad}
85
- """
86
- prompt = PromptTemplate.from_template(template)
87
-
88
-
89
- # 3. Set up the LLM and Agent
90
- llm = ChatOpenAI(
91
- model="gpt-4o", # Vision-capable model
92
- temperature=0,
93
- timeout=60, # 60 second timeout for LLM calls
94
- request_timeout=120, # 2 minute timeout for requests
95
- max_retries=2, # Retry failed requests
96
- )
97
-
98
- agent = create_react_agent(llm, tools, prompt)
99
- agent_executor = AgentExecutor(
100
- agent=agent,
101
- tools=tools,
102
- verbose=True,
103
- max_execution_time=60, # 1 minute timeout for entire agent execution
104
- max_iterations=10, # Limit agent iterations to prevent infinite loops
105
- early_stopping_method="generate" # Stop early if final answer is generated
106
- )
107
-
108
- # 4. Run the agent with a question
109
- # response = agent_executor.invoke({
110
- # "input": "What is the current capital of Australia and when was it founded?"
111
- # })
112
-
113
- # print("\nFinal Answer:")
114
- # print(response['output'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/agent_executor.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.agents import AgentExecutor, create_react_agent
2
+ from langchain_openai import ChatOpenAI
3
+ from prompt import prompt_default
4
+
5
+
6
def create_agent_executor(
    llm=None,
    tools=None,
    prompt=None,
    verbose=True,
    max_execution_time=60,
    max_iterations=10,
    early_stopping_method="generate",
):
    """Build a ReAct-style AgentExecutor with sensible defaults.

    Args:
        llm: Chat model driving the agent. Defaults to a gpt-4o ChatOpenAI
            with conservative timeouts and retries.
        tools: Tools exposed to the agent. Defaults to an empty list.
        prompt: Prompt template for the ReAct loop. Defaults to the shared
            ``prompt_default``.
        verbose: Whether the executor logs intermediate steps.
        max_execution_time: Wall-clock cap (seconds) for one agent run.
        max_iterations: Cap on Thought/Action cycles to avoid infinite loops.
        early_stopping_method: Executor behavior when a cap is reached.

    Returns:
        AgentExecutor: configured executor, ready for ``invoke``.
    """
    if llm is None:
        # Vision-capable default model; timeouts/retries keep runs bounded.
        llm = ChatOpenAI(
            model="gpt-4o",
            temperature=0,
            timeout=60,  # 60 second timeout for LLM calls
            request_timeout=120,  # 2 minute timeout for requests
            max_retries=2,  # Retry failed requests
        )

    if tools is None:
        tools = []

    if prompt is None:
        prompt = prompt_default

    react_agent = create_react_agent(llm, tools, prompt)
    return AgentExecutor(
        agent=react_agent,
        tools=tools,
        verbose=verbose,
        max_execution_time=max_execution_time,
        max_iterations=max_iterations,
        early_stopping_method=early_stopping_method,
    )
utils/audio_parser_tool.py ADDED
File without changes
utils/document_parser_tool.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ from langchain_community.document_loaders import PyPDFLoader
4
+ from langchain.tools import Tool
5
+ from agent_executor import create_agent_executor
6
+ from file_downloader import FileDownloader
7
+ from dotenv import load_dotenv
8
+
9
+ load_dotenv()
10
+
11
+
12
class DocumentParserTool:
    """A tool for parsing PDF and XLSX documents."""

    def __init__(self):
        """Initialize the DocumentParserTool with FileDownloader."""
        # FileDownloader resolves URLs to local files and verifies local paths.
        self.downloader = FileDownloader()

    def parse_document_from_url_or_path(self, path_or_url: str) -> str:
        """
        Parse a document from URL or file path. Downloads if URL, uses directly if path.

        Args:
            path_or_url (str): URL to download from or file path to use

        Returns:
            str: Parsed content of the document
        """
        try:
            # Get file path (download if URL, verify if file path)
            file_path = self.downloader.get_file_path(path_or_url)

            # Parse the document
            result = self.parse_document(file_path)

            # Add context about the source
            source_info = f"Source: {'Downloaded from ' + path_or_url if self.downloader.is_url(path_or_url) else 'File at ' + path_or_url}\n"
            source_info += f"Local file path: {file_path}\n\n"

            return source_info + result

        except Exception as e:
            # Broad catch so the agent receives an error string, not a traceback.
            return f"Error processing {path_or_url}: {str(e)}"

    def parse_document(self, document_path: str) -> str:
        """
        Parse a document from the given file path.

        Args:
            document_path (str): Path to the document file

        Returns:
            str: Parsed content of the document
        """
        if not os.path.exists(document_path):
            return f"Error: File not found at path: {document_path}"

        try:
            # Dispatch on the (case-insensitive) file extension.
            file_extension = os.path.splitext(document_path)[1].lower()

            if file_extension == ".pdf":
                return self._parse_pdf(document_path)
            elif file_extension in [".xlsx", ".xls"]:
                return self._parse_excel(document_path)
            else:
                return f"Error: Unsupported file format '{file_extension}'. Supported formats: PDF (.pdf), Excel (.xlsx, .xls)"

        except Exception as e:
            return f"Error parsing document: {str(e)}"

    def _parse_pdf(self, document_path: str) -> str:
        """Parse PDF document and extract text content."""
        try:
            loader = PyPDFLoader(document_path)
            pages = loader.load_and_split()
            # Flatten every page's text into one space-joined string.
            pdf_text = " ".join(page.page_content for page in pages)

            if not pdf_text.strip():
                # e.g. scanned/image-only PDFs yield no extractable text.
                return (
                    "Warning: PDF appears to be empty or contains no extractable text."
                )

            return (
                f"PDF Content (from {os.path.basename(document_path)}):\n\n{pdf_text}"
            )

        except Exception as e:
            return f"Error parsing PDF: {str(e)}"

    def _parse_excel(self, document_path: str) -> str:
        """Parse Excel document and extract structured data."""
        try:
            # Read all sheets from the Excel file
            excel_file = pd.ExcelFile(document_path)
            sheet_names = excel_file.sheet_names

            if not sheet_names:
                return "Warning: Excel file contains no sheets."

            parsed_content = (
                f"Excel Content (from {os.path.basename(document_path)}):\n\n"
            )
            parsed_content += f"Number of sheets: {len(sheet_names)}\n"
            parsed_content += f"Sheet names: {', '.join(sheet_names)}\n\n"

            for sheet_name in sheet_names:
                try:
                    df = pd.read_excel(document_path, sheet_name=sheet_name)

                    parsed_content += f"--- Sheet: {sheet_name} ---\n"
                    parsed_content += (
                        f"Dimensions: {df.shape[0]} rows × {df.shape[1]} columns\n"
                    )

                    if df.empty:
                        parsed_content += "Sheet is empty.\n\n"
                        continue

                    parsed_content += (
                        f"Columns: {', '.join(df.columns.astype(str))}\n\n"
                    )

                    # Include first few rows as sample data
                    sample_rows = min(5, len(df))
                    parsed_content += f"Sample data (first {sample_rows} rows):\n"
                    parsed_content += df.head(sample_rows).to_string(index=False)
                    parsed_content += "\n\n"

                    # Include summary statistics for numeric columns
                    numeric_cols = df.select_dtypes(include=["number"]).columns
                    if not numeric_cols.empty:
                        parsed_content += "Summary statistics for numeric columns:\n"
                        parsed_content += df[numeric_cols].describe().to_string()
                        parsed_content += "\n\n"

                except Exception as sheet_error:
                    # One unreadable sheet should not abort the remaining sheets.
                    parsed_content += (
                        f"Error reading sheet '{sheet_name}': {str(sheet_error)}\n\n"
                    )

            return parsed_content

        except Exception as e:
            return f"Error parsing Excel file: {str(e)}"
145
+
146
+
147
# Create the DocumentParserTool instance
# (shared by both Tool wrappers below)
document_parser_tool_instance = DocumentParserTool()

# Create a LangChain Tool wrapper for the document parser (file paths only)
document_parser_tool = Tool(
    name="document_parser",
    description=(
        "Parse PDF and Excel (.xlsx, .xls) documents to extract their content. "
        "For PDFs, extracts all text content. For Excel files, provides structured data "
        "including sheet names, dimensions, column headers, sample data, and summary statistics. "
        "Input should be a file path to the document."
    ),
    func=document_parser_tool_instance.parse_document,
)

# Create a LangChain Tool wrapper for the document parser with URL/path support
document_parser_url_tool = Tool(
    name="document_parser_url",
    description=(
        "Parse PDF and Excel (.xlsx, .xls) documents from URLs or file paths. "
        "If URL is provided, downloads the file first. If file path is provided, uses it directly. "
        "For PDFs, extracts all text content. For Excel files, provides structured data "
        "including sheet names, dimensions, column headers, sample data, and summary statistics. "
        "Input can be either a URL (http/https) or a local file path."
    ),
    func=document_parser_tool_instance.parse_document_from_url_or_path,
)

if __name__ == "__main__":
    # Manual smoke test (requires network + OpenAI credentials): download a
    # sample PDF, parse it directly, run the URL-capable tool through a ReAct
    # agent, then clean up any downloaded files.
    print("Start testing document parser tool with file downloader integration")

    # Initialize file downloader
    downloader = FileDownloader()

    # Test with both URLs and file paths
    test_files = [
        "https://arxiv.org/pdf/2501.00147",  # URL - should be downloaded
        # "https://agents-course-unit4-scoring.hf.space/files/7bd855d8-463d-4ed5-93ca-5fe35145f733",  # URL - should be downloaded
        # "./test_document.pdf",  # File path - should be used directly (if exists)
    ]

    downloaded_files = []  # Keep track of downloaded files for cleanup

    for test_input in test_files:
        print(f"\n--- Processing: {test_input} ---")

        try:
            # Get file path (download if URL, verify if file path)
            file_path = downloader.get_file_path(test_input)
            print(f"Using file path: {file_path}")

            # Track downloaded files for cleanup
            if downloader.is_url(test_input):
                downloaded_files.append(file_path)

            # Test document parser with the file
            result = document_parser_tool_instance.parse_document(file_path)
            print(
                f"Parse result preview: {result[:500] + '...' if len(result) > 500 else result}"
            )

            # Test with agent executor using the URL-capable tool
            tools = [document_parser_url_tool]
            agent_executor = create_agent_executor(tools=tools)

            # Create a comprehensive prompt that includes the original input
            prompt_with_input = f"""Please analyze the document from this source: {test_input}

Use the document_parser_url tool to download (if URL) and analyze the content.
Provide a comprehensive summary of what you find in the document.

The tool will handle both URLs (by downloading) and file paths (by using directly)."""

            print(f"\n--- Testing with Agent Executor (URL-capable tool) ---")
            response = agent_executor.invoke({"input": prompt_with_input})
            print("Agent Response:")
            print(response["output"])

        except Exception as e:
            # Keep the loop going so remaining test inputs still run.
            print(f"Error processing {test_input}: {str(e)}")

    # Cleanup downloaded files
    print(f"\n--- Cleanup ---")
    for file_path in downloaded_files:
        try:
            downloader.delete_file(file_path)
        except Exception as e:
            # Best-effort cleanup: report but do not fail the test run.
            print(f"Warning: Could not delete {file_path}: {e}")

    print(f"Final downloader state: {repr(downloader)}")
utils/file_downloader.py ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ import shutil
4
+ import tempfile
5
+ import uuid
6
+ from pathlib import Path
7
+ from typing import Optional, List
8
+ from urllib.parse import urlparse
9
+
10
+
11
class FileDownloader:
    """
    A class for downloading files from URLs and managing them in a temporary directory.

    Provides functionality to:
    1. Download files from URLs and save to tmp directory
    2. Delete specific files from tmp directory
    3. Clear all files from tmp directory
    """

    def __init__(self, tmp_dir_name: str = "tmp"):
        """
        Initialize the FileDownloader.

        Args:
            tmp_dir_name (str): Name of the temporary directory to use
        """
        self.tmp_dir_name = tmp_dir_name
        self.tmp_dir_path = Path(tmp_dir_name)
        self._ensure_tmp_directory()

    def _ensure_tmp_directory(self) -> None:
        """Ensure the temporary directory exists."""
        # parents=True so nested tmp dir names (e.g. "cache/tmp") also work.
        self.tmp_dir_path.mkdir(parents=True, exist_ok=True)

    def _get_filename_from_url(self, url: str) -> str:
        """
        Extract filename from URL, with fallback to generated name.

        Args:
            url (str): The URL to extract filename from

        Returns:
            str: The filename
        """
        parsed_url = urlparse(url)
        filename = os.path.basename(parsed_url.path)

        # If no usable filename (empty or extension-less) found in URL, generate one.
        if not filename or '.' not in filename:
            filename = f"downloaded_file_{uuid.uuid4().hex[:8]}"

        return filename

    def _get_unique_filename(self, filename: str) -> str:
        """
        Ensure filename is unique in the tmp directory.

        Args:
            filename (str): Original filename

        Returns:
            str: Unique filename (suffixes "_1", "_2", ... before the extension)
        """
        base_path = self.tmp_dir_path / filename
        if not base_path.exists():
            return filename

        # Split filename into name and extension so the counter goes
        # before the extension: "a.txt" -> "a_1.txt".
        name_part = base_path.stem
        ext_part = base_path.suffix

        counter = 1
        while True:
            new_filename = f"{name_part}_{counter}{ext_part}"
            if not (self.tmp_dir_path / new_filename).exists():
                return new_filename
            counter += 1

    def download(self, url: str, filename: Optional[str] = None,
                 timeout: int = 30, chunk_size: int = 8192) -> str:
        """
        Download a file from URL and save to tmp directory.

        Args:
            url (str): URL to download from
            filename (str, optional): Custom filename. If None, extract from URL
            timeout (int): Request timeout in seconds
            chunk_size (int): Size of chunks for streaming download

        Returns:
            str: Full path to the downloaded file

        Raises:
            requests.RequestException: If download fails
            IOError: If file writing fails
        """
        try:
            # Stream so large files are never held fully in memory.
            response = requests.get(url, stream=True, timeout=timeout)
            response.raise_for_status()

            # Determine filename: explicit arg > Content-Disposition > URL path.
            if filename is None:
                filename = self._get_filename_from_url(url)

                content_disposition = response.headers.get('content-disposition')
                if content_disposition and 'filename=' in content_disposition:
                    try:
                        import re
                        filename_match = re.search(r'filename[*]?=([^;]+)', content_disposition)
                        if filename_match:
                            header_filename = filename_match.group(1).strip().strip('"\'')
                            # RFC 5987 extended form: filename*=UTF-8''name.pdf —
                            # drop the charset/language prefix before the filename.
                            if "''" in header_filename:
                                header_filename = header_filename.split("''", 1)[1]
                            if header_filename:
                                filename = header_filename
                    except Exception:
                        # If header parsing fails, keep the URL-derived filename.
                        pass

            # If still no extension, try to infer one from content-type.
            if '.' not in filename:
                content_type = response.headers.get('content-type', '').lower()
                if 'pdf' in content_type:
                    filename += '.pdf'
                elif 'image/jpeg' in content_type or 'image/jpg' in content_type:
                    filename += '.jpg'
                elif 'image/png' in content_type:
                    filename += '.png'
                elif 'text/plain' in content_type:
                    filename += '.txt'
                elif 'application/json' in content_type:
                    filename += '.json'
                elif 'text/html' in content_type:
                    filename += '.html'

            # Ensure unique filename so parallel/repeat downloads never clobber.
            filename = self._get_unique_filename(filename)
            file_path = self.tmp_dir_path / filename

            # Download and save file in chunks.
            with open(file_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:  # Filter out keep-alive chunks
                        f.write(chunk)

            print(f"Successfully downloaded: {url} -> {file_path}")
            return str(file_path)

        except requests.exceptions.RequestException as e:
            raise requests.RequestException(f"Failed to download {url}: {str(e)}")
        except IOError as e:
            # filename is always bound here: IOError can only occur after
            # the filename was determined and the file opened.
            raise IOError(f"Failed to save file {filename}: {str(e)}")

    def delete_file(self, file_path: str) -> bool:
        """
        Delete a specific file from the tmp directory.

        Args:
            file_path (str): Path to the file to delete (can be full path or just filename)

        Returns:
            bool: True if file was deleted, False if file didn't exist

        Raises:
            ValueError: If file is not in the tmp directory
            OSError: If deletion fails
        """
        path = Path(file_path)

        # If it's just a filename, assume it's in the tmp directory.
        if not path.is_absolute() and len(path.parts) == 1:
            path = self.tmp_dir_path / path

        # Ensure the file is within our tmp directory for security.
        try:
            resolved_path = path.resolve()
            tmp_resolved = self.tmp_dir_path.resolve()
            # Compare path components, not string prefixes: a startswith()
            # check would wrongly accept sibling directories such as
            # "tmp_other/" when the tmp dir is "tmp/".
            if tmp_resolved != resolved_path and tmp_resolved not in resolved_path.parents:
                raise ValueError(f"File {file_path} is not in the tmp directory {self.tmp_dir_path}")
        except (OSError, ValueError) as e:
            raise ValueError(f"Invalid file path {file_path}: {str(e)}")

        # Delete the file.
        if path.exists():
            try:
                path.unlink()
                print(f"Successfully deleted: {path}")
                return True
            except OSError as e:
                raise OSError(f"Failed to delete {path}: {str(e)}")
        else:
            print(f"File not found: {path}")
            return False

    def clear_tmp_directory(self) -> int:
        """
        Clear all files from the tmp directory.

        Returns:
            int: Number of files deleted

        Raises:
            OSError: If clearing fails
        """
        if not self.tmp_dir_path.exists():
            print(f"Tmp directory {self.tmp_dir_path} does not exist")
            return 0

        deleted_count = 0
        errors = []  # collect per-item failures so one bad file doesn't stop the sweep

        try:
            for item in self.tmp_dir_path.iterdir():
                try:
                    if item.is_file():
                        item.unlink()
                        deleted_count += 1
                        print(f"Deleted file: {item}")
                    elif item.is_dir():
                        shutil.rmtree(item)
                        deleted_count += 1
                        print(f"Deleted directory: {item}")
                except OSError as e:
                    errors.append(f"Failed to delete {item}: {str(e)}")

        except OSError as e:
            raise OSError(f"Failed to access tmp directory: {str(e)}")

        if errors:
            error_msg = "; ".join(errors)
            raise OSError(f"Some files could not be deleted: {error_msg}")

        print(f"Successfully cleared tmp directory. Deleted {deleted_count} items.")
        return deleted_count

    def list_files(self) -> List[str]:
        """
        List all files in the tmp directory.

        Returns:
            List[str]: List of file paths in the tmp directory
        """
        if not self.tmp_dir_path.exists():
            return []

        files = []
        try:
            for item in self.tmp_dir_path.iterdir():
                if item.is_file():
                    files.append(str(item))
        except OSError:
            # If we can't read the directory, return empty list.
            pass

        return files

    def get_tmp_dir_size(self) -> int:
        """
        Get the total size of all files in the tmp directory (recursively).

        Returns:
            int: Total size in bytes
        """
        if not self.tmp_dir_path.exists():
            return 0

        total_size = 0
        try:
            for item in self.tmp_dir_path.rglob('*'):
                if item.is_file():
                    total_size += item.stat().st_size
        except OSError:
            # If we can't access some files, return partial size.
            pass

        return total_size

    def is_url(self, path_or_url: str) -> bool:
        """
        Check if the given string is a URL or a file path.

        Args:
            path_or_url (str): String to check

        Returns:
            bool: True if it's a URL, False if it's a file path
        """
        return path_or_url.startswith(('http://', 'https://'))

    def get_file_path(self, path_or_url: str, filename: Optional[str] = None) -> str:
        """
        Get file path - download if URL, return as-is if file path.

        Args:
            path_or_url (str): URL to download or file path to use
            filename (str, optional): Custom filename for downloads

        Returns:
            str: File path to use

        Raises:
            FileNotFoundError: If file path doesn't exist
            requests.RequestException: If URL download fails
        """
        if self.is_url(path_or_url):
            # It's a URL, download it.
            return self.download(path_or_url, filename)
        else:
            # It's a file path, verify it exists.
            if not os.path.exists(path_or_url):
                raise FileNotFoundError(f"File not found: {path_or_url}")
            return path_or_url

    def __str__(self) -> str:
        """String representation of the FileDownloader."""
        return f"FileDownloader(tmp_dir='{self.tmp_dir_path}')"

    def __repr__(self) -> str:
        """Detailed string representation of the FileDownloader."""
        file_count = len(self.list_files())
        size = self.get_tmp_dir_size()
        return f"FileDownloader(tmp_dir='{self.tmp_dir_path}', files={file_count}, size={size} bytes)"
utils/prompt.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from langchain_core.prompts import PromptTemplate

# Default ReAct prompt for the agent.  It enforces the
# Thought/Action/Action Input/Observation loop and a strictly formatted
# "FINAL ANSWER: ..." line.  The placeholders {tools}, {tool_names},
# {input} and {agent_scratchpad} are filled in by create_react_agent /
# AgentExecutor at runtime.
template_default = """
You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.

Answer the following questions as best you can. You have access to the following tools:

{tools}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

Begin!

Question: {input}
{agent_scratchpad}
"""
# Shared, pre-built PromptTemplate used as the default by create_agent_executor.
prompt_default = PromptTemplate.from_template(template_default)
utils/search_tool.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from langchain_google_community import GoogleSearchRun, GoogleSearchAPIWrapper
from dotenv import load_dotenv
from agent_executor import create_agent_executor

# Pull Google API credentials (API key / CSE id) from a local .env file.
load_dotenv()

# Module-level Google search tool, ready to be plugged into an agent.
search_wrapper = GoogleSearchAPIWrapper()
search_tool = GoogleSearchRun(api_wrapper=search_wrapper)

if __name__ == "__main__":
    # Smoke test: run a single factual query through a ReAct agent
    # equipped with only the search tool.
    print("Start testing search tool with an example question")

    executor = create_agent_executor(tools=[search_tool])
    result = executor.invoke(
        {"input": "What is the current capital of Australia and when was it founded?"}
    )

    print("\nFinal Answer:")
    print(result["output"])
utils/tools.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
from search_tool import search_tool
from document_parser_tool import document_parser_tool

# Default toolset for the agent: Google search + PDF/Excel document parsing.
tools = [search_tool, document_parser_tool]
uv.lock CHANGED
@@ -20,7 +20,9 @@ dependencies = [
20
  { name = "langchain-core" },
21
  { name = "langchain-google-community" },
22
  { name = "langchain-openai" },
 
23
  { name = "pandas" },
 
24
  { name = "python-dotenv" },
25
  { name = "requests" },
26
  ]
@@ -49,7 +51,9 @@ requires-dist = [
49
  { name = "langchain-core", specifier = ">=0.1.0" },
50
  { name = "langchain-google-community", specifier = ">=1.0.0" },
51
  { name = "langchain-openai", specifier = ">=0.0.5" },
 
52
  { name = "pandas", specifier = ">=1.3.0" },
 
53
  { name = "pytest", marker = "extra == 'dev'", specifier = ">=7.0" },
54
  { name = "python-dotenv", specifier = ">=1.0.0" },
55
  { name = "requests", specifier = ">=2.25.0" },
@@ -533,6 +537,15 @@ wheels = [
533
  { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277, upload-time = "2023-12-24T09:54:30.421Z" },
534
  ]
535
 
 
 
 
 
 
 
 
 
 
536
  [[package]]
537
  name = "exceptiongroup"
538
  version = "1.3.0"
@@ -1650,6 +1663,18 @@ wheels = [
1650
  { url = "https://files.pythonhosted.org/packages/58/c1/dfb16b3432810fc9758564f9d1a4dbce6b93b7fb763ba57530c7fc48316d/openai-1.86.0-py3-none-any.whl", hash = "sha256:c8889c39410621fe955c230cc4c21bfe36ec887f4e60a957de05f507d7e1f349", size = 730296, upload-time = "2025-06-10T16:50:30.495Z" },
1651
  ]
1652
 
 
 
 
 
 
 
 
 
 
 
 
 
1653
  [[package]]
1654
  name = "orjson"
1655
  version = "3.10.18"
@@ -2184,6 +2209,18 @@ wheels = [
2184
  { url = "https://files.pythonhosted.org/packages/05/e7/df2285f3d08fee213f2d041540fa4fc9ca6c2d44cf36d3a035bf2a8d2bcc/pyparsing-3.2.3-py3-none-any.whl", hash = "sha256:a749938e02d6fd0b59b356ca504a24982314bb090c383e3cf201c95ef7e2bfcf", size = 111120, upload-time = "2025-03-25T05:01:24.908Z" },
2185
  ]
2186
 
 
 
 
 
 
 
 
 
 
 
 
 
2187
  [[package]]
2188
  name = "pytest"
2189
  version = "8.4.0"
 
20
  { name = "langchain-core" },
21
  { name = "langchain-google-community" },
22
  { name = "langchain-openai" },
23
+ { name = "openpyxl" },
24
  { name = "pandas" },
25
+ { name = "pypdf" },
26
  { name = "python-dotenv" },
27
  { name = "requests" },
28
  ]
 
51
  { name = "langchain-core", specifier = ">=0.1.0" },
52
  { name = "langchain-google-community", specifier = ">=1.0.0" },
53
  { name = "langchain-openai", specifier = ">=0.0.5" },
54
+ { name = "openpyxl", specifier = ">=3.0.0" },
55
  { name = "pandas", specifier = ">=1.3.0" },
56
+ { name = "pypdf", specifier = ">=5.6.0" },
57
  { name = "pytest", marker = "extra == 'dev'", specifier = ">=7.0" },
58
  { name = "python-dotenv", specifier = ">=1.0.0" },
59
  { name = "requests", specifier = ">=2.25.0" },
 
537
  { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277, upload-time = "2023-12-24T09:54:30.421Z" },
538
  ]
539
 
540
+ [[package]]
541
+ name = "et-xmlfile"
542
+ version = "2.0.0"
543
+ source = { registry = "https://pypi.org/simple" }
544
+ sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234, upload-time = "2024-10-25T17:25:40.039Z" }
545
+ wheels = [
546
+ { url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059, upload-time = "2024-10-25T17:25:39.051Z" },
547
+ ]
548
+
549
  [[package]]
550
  name = "exceptiongroup"
551
  version = "1.3.0"
 
1663
  { url = "https://files.pythonhosted.org/packages/58/c1/dfb16b3432810fc9758564f9d1a4dbce6b93b7fb763ba57530c7fc48316d/openai-1.86.0-py3-none-any.whl", hash = "sha256:c8889c39410621fe955c230cc4c21bfe36ec887f4e60a957de05f507d7e1f349", size = 730296, upload-time = "2025-06-10T16:50:30.495Z" },
1664
  ]
1665
 
1666
+ [[package]]
1667
+ name = "openpyxl"
1668
+ version = "3.1.5"
1669
+ source = { registry = "https://pypi.org/simple" }
1670
+ dependencies = [
1671
+ { name = "et-xmlfile" },
1672
+ ]
1673
+ sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464, upload-time = "2024-06-28T14:03:44.161Z" }
1674
+ wheels = [
1675
+ { url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910, upload-time = "2024-06-28T14:03:41.161Z" },
1676
+ ]
1677
+
1678
  [[package]]
1679
  name = "orjson"
1680
  version = "3.10.18"
 
2209
  { url = "https://files.pythonhosted.org/packages/05/e7/df2285f3d08fee213f2d041540fa4fc9ca6c2d44cf36d3a035bf2a8d2bcc/pyparsing-3.2.3-py3-none-any.whl", hash = "sha256:a749938e02d6fd0b59b356ca504a24982314bb090c383e3cf201c95ef7e2bfcf", size = 111120, upload-time = "2025-03-25T05:01:24.908Z" },
2210
  ]
2211
 
2212
+ [[package]]
2213
+ name = "pypdf"
2214
+ version = "5.6.0"
2215
+ source = { registry = "https://pypi.org/simple" }
2216
+ dependencies = [
2217
+ { name = "typing-extensions", marker = "python_full_version < '3.11'" },
2218
+ ]
2219
+ sdist = { url = "https://files.pythonhosted.org/packages/40/46/67de1d7a65412aa1c896e6b280829b70b57d203fadae6859b690006b8e0a/pypdf-5.6.0.tar.gz", hash = "sha256:a4b6538b77fc796622000db7127e4e58039ec5e6afd292f8e9bf42e2e985a749", size = 5023749, upload-time = "2025-06-01T12:19:40.101Z" }
2220
+ wheels = [
2221
+ { url = "https://files.pythonhosted.org/packages/71/8b/dc3a72d98c22be7a4cbd664ad14c5a3e6295c2dbdf572865ed61e24b5e38/pypdf-5.6.0-py3-none-any.whl", hash = "sha256:ca6bf446bfb0a2d8d71d6d6bb860798d864c36a29b3d9ae8d7fc7958c59f88e7", size = 304208, upload-time = "2025-06-01T12:19:38.003Z" },
2222
+ ]
2223
+
2224
  [[package]]
2225
  name = "pytest"
2226
  version = "8.4.0"