LeVinh committed on
Commit
c09239f
·
1 Parent(s): e284096
Files changed (8) hide show
  1. Dockerfile +0 -1
  2. agent.json +4 -4
  3. agent.py +66 -38
  4. eval.py +91 -9
  5. logic.py +0 -104
  6. metadata.jsonl +0 -0
  7. multimodal_tools.py +214 -0
  8. requirements.txt +2 -4
Dockerfile CHANGED
@@ -10,7 +10,6 @@ RUN apt-get update && apt-get install -y \
10
  libxext6 \
11
  cmake \
12
  libgl1 \
13
- tesseract-ocr \
14
  curl \
15
  && rm -rf /var/lib/apt/lists/*
16
 
 
10
  libxext6 \
11
  cmake \
12
  libgl1 \
 
13
  curl \
14
  && rm -rf /var/lib/apt/lists/*
15
 
agent.json CHANGED
@@ -7,13 +7,13 @@
7
  "model": {
8
  "class": "ChatOpenAI",
9
  "data": {
10
- "max_tokens": 4096,
11
  "temperature": 0.01,
12
  "last_input_token_count": null,
13
  "last_output_token_count": null,
14
- "model_id": "Meta-Llama-3.1-8B-Instruct",
15
- "base_url": "https://api.sambanova.ai/v1",
16
- "api_key_env": "SAMBANOVA_API_KEY",
17
  "custom_role_conversions": null
18
  }
19
  },
 
7
  "model": {
8
  "class": "ChatOpenAI",
9
  "data": {
10
+ "max_tokens": 2048,
11
  "temperature": 0.01,
12
  "last_input_token_count": null,
13
  "last_output_token_count": null,
14
+ "model_id": "qwen/qwen3-32b",
15
+ "base_url": "https://openrouter.ai/api/v1",
16
+ "api_key_env": "OPENROUTER_API_KEY",
17
  "custom_role_conversions": null
18
  }
19
  },
agent.py CHANGED
@@ -9,9 +9,7 @@ import numpy as np
9
  import pandas as pd
10
  from typing import List, Dict, Any, Optional
11
  from urllib.parse import urlparse
12
- import pytesseract
13
- import cv2
14
- import yt_dlp
15
  from PIL import Image, ImageDraw, ImageFont, ImageEnhance, ImageFilter
16
  from dotenv import load_dotenv
17
 
@@ -32,6 +30,7 @@ from langchain_huggingface import (
32
  HuggingFaceEndpoint,
33
  HuggingFaceEmbeddings,
34
  )
 
35
  from langchain_core.messages import SystemMessage, HumanMessage
36
  from langchain_core.tools import tool, Tool
37
  from supabase.client import Client, create_client
@@ -40,9 +39,13 @@ from supabase.client import Client, create_client
40
  from code_interpreter import CodeInterpreter
41
  from img_processing import decode_image, encode_image, save_image
42
  from dotenv import load_dotenv
 
 
 
 
 
43
 
44
  load_dotenv()
45
- # Configure Gemini API key
46
 
47
 
48
  interpreter_instance = CodeInterpreter()
@@ -298,20 +301,15 @@ def download_file_from_url(url: str, filename: Optional[str] = None) -> str:
298
  @tool
299
  def extract_text_from_image(image_path: str) -> str:
300
  """
301
- Extract text from an image using OCR.
302
-
303
  Args:
304
  image_path (str): The path to the image file.
305
 
306
  Returns:
307
  str: Extracted text or error message.
308
  """
309
- try:
310
- image = Image.open(image_path)
311
- text = pytesseract.image_to_string(image)
312
- return f"Extracted text from image:\n\n{text}"
313
- except Exception as e:
314
- return f"Error extracting text from image: {str(e)}"
315
 
316
 
317
  @tool
@@ -693,6 +691,11 @@ tools = [
693
  draw_on_image,
694
  generate_simple_image,
695
  combine_images,
 
 
 
 
 
696
  ]
697
 
698
 
@@ -762,36 +765,61 @@ def build_graph(provider: str = None):
762
  # Default fallback
763
  provider = "openai" # Default to openai as fallback
764
 
765
- if provider == "google":
766
- # Check if model_id is in config, otherwise default
767
- model_id = model_data.get("model_id", "gemini-1.5-flash")
768
- # Ensure it starts with gemini
769
- if not model_id.startswith("gemini"):
770
- model_id = "gemini-1.5-flash"
771
-
772
- llm = ChatGoogleGenerativeAI(model=model_id)
773
- print(f"Using Google provider with model: {model_id}")
774
 
775
- elif provider == "huggingface":
776
- repo_id = model_data.get("model_id", "Qwen/Qwen2.5-7B-Instruct")
777
- llm = ChatHuggingFace(
778
- llm=HuggingFaceEndpoint(
779
- repo_id=repo_id,
780
- task="text-generation",
781
- max_new_tokens=model_data.get("max_tokens", 4096),
782
- do_sample=False,
783
- repetition_penalty=1.03,
 
 
 
 
 
 
 
 
 
 
784
  temperature=model_data.get("temperature", 0.01),
785
- ),
786
- verbose=True,
787
- )
788
- print(f"Using Hugging Face provider with model: {repo_id}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
789
  elif provider == "openai":
790
- model_id = model_data.get("model_id", "Meta-Llama-3.1-8B-Instruct")
791
- api_key_env = model_data.get("api_key_env", "OPENAI_API_KEY")
 
 
 
 
792
  api_key = os.getenv(api_key_env)
793
  if not api_key:
794
- print(f"Warning: {api_key_env} not found in environment variables. Calls might fail.")
795
 
796
  llm = ChatOpenAI(
797
  model=model_id,
@@ -804,7 +832,7 @@ def build_graph(provider: str = None):
804
 
805
  else:
806
  # Fallback or error if other providers are requested but not implemented
807
- raise ValueError(f"Invalid provider: {provider}. Supported: 'google', 'huggingface', 'openai'.")
808
 
809
  llm_with_tools = llm.bind_tools(tools)
810
 
 
9
  import pandas as pd
10
  from typing import List, Dict, Any, Optional
11
  from urllib.parse import urlparse
12
+
 
 
13
  from PIL import Image, ImageDraw, ImageFont, ImageEnhance, ImageFilter
14
  from dotenv import load_dotenv
15
 
 
30
  HuggingFaceEndpoint,
31
  HuggingFaceEmbeddings,
32
  )
33
+ from huggingface_hub import InferenceClient
34
  from langchain_core.messages import SystemMessage, HumanMessage
35
  from langchain_core.tools import tool, Tool
36
  from supabase.client import Client, create_client
 
39
  from code_interpreter import CodeInterpreter
40
  from img_processing import decode_image, encode_image, save_image
41
  from dotenv import load_dotenv
42
+ from multimodal_tools import (
43
+ vision_analyze_image,
44
+ vision_analyze_video,
45
+ vision_analyze_document
46
+ )
47
 
48
  load_dotenv()
 
49
 
50
 
51
  interpreter_instance = CodeInterpreter()
 
301
  @tool
302
  def extract_text_from_image(image_path: str) -> str:
303
  """
304
+ Extract text from an image using AI Vision (OCR).
305
+
306
  Args:
307
  image_path (str): The path to the image file.
308
 
309
  Returns:
310
  str: Extracted text or error message.
311
  """
312
+ return vision_analyze_image("Transcribe all text from this image verbatim.", image_path)
 
 
 
 
 
313
 
314
 
315
  @tool
 
691
  draw_on_image,
692
  generate_simple_image,
693
  combine_images,
694
+
695
+ # Multimodal vision tools (OpenRouter)
696
+ vision_analyze_image,
697
+ vision_analyze_video,
698
+ vision_analyze_document,
699
  ]
700
 
701
 
 
765
  # Default fallback
766
  provider = "openai" # Default to openai as fallback
767
 
768
+ if provider == "huggingface":
769
+ # All config must come from agent.json
770
+ if "model_id" not in model_data:
771
+ raise ValueError("model_id is required in agent.json for HuggingFace provider")
 
 
 
 
 
772
 
773
+ model_id = model_data["model_id"]
774
+
775
+ # Parse provider suffix from model_id (e.g., "Qwen/Qwen3-32B:cerebras")
776
+ # Format: "model_name" or "model_name:provider"
777
+ if ":" in model_id:
778
+ # Use router for third-party providers (cerebras, novita, etc.)
779
+ model_name, provider_suffix = model_id.rsplit(":", 1)
780
+ print(f"Using HuggingFace Router with model: {model_name}, provider: {provider_suffix}")
781
+
782
+ api_key = os.getenv("HF_TOKEN")
783
+ if not api_key:
784
+ raise ValueError("HF_TOKEN not found in environment variables")
785
+
786
+ # Use router for third-party providers
787
+ llm = ChatOpenAI(
788
+ model=model_id, # Full model_id with provider suffix
789
+ base_url="https://router.huggingface.co/v1",
790
+ api_key=api_key,
791
+ max_tokens=model_data.get("max_tokens", 4096),
792
  temperature=model_data.get("temperature", 0.01),
793
+ )
794
+ else:
795
+ # Use free serverless inference (no provider suffix)
796
+ print(f"Using HuggingFace Serverless Inference with model: {model_id}")
797
+
798
+ # Construct the serverless inference API URL to bypass router
799
+ # Format: https://api-inference.huggingface.co/models/{model_id}
800
+ serverless_url = f"https://api-inference.huggingface.co/models/{model_id}"
801
+
802
+ llm = ChatHuggingFace(
803
+ llm=HuggingFaceEndpoint(
804
+ endpoint_url=serverless_url, # Use direct serverless API
805
+ task="text-generation",
806
+ max_new_tokens=model_data.get("max_tokens", 4096),
807
+ do_sample=False,
808
+ repetition_penalty=1.03,
809
+ temperature=model_data.get("temperature", 0.01),
810
+ ),
811
+ verbose=True,
812
+ )
813
  elif provider == "openai":
814
+ # All config must come from agent.json
815
+ if "model_id" not in model_data:
816
+ raise ValueError("model_id is required in agent.json for OpenAI provider")
817
+
818
+ model_id = model_data["model_id"]
819
+ api_key_env = model_data.get("api_key_env", "OPENAI_API_KEY") # Keep this default for compatibility
820
  api_key = os.getenv(api_key_env)
821
  if not api_key:
822
+ raise ValueError(f"{api_key_env} not found in environment variables")
823
 
824
  llm = ChatOpenAI(
825
  model=model_id,
 
832
 
833
  else:
834
  # Fallback or error if other providers are requested but not implemented
835
+ raise ValueError(f"Invalid provider: {provider}. Supported: 'huggingface', 'openai'.")
836
 
837
  llm_with_tools = llm.bind_tools(tools)
838
 
eval.py CHANGED
@@ -17,10 +17,45 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
17
  # Debug Environment
18
  print("--- Environment Debug ---")
19
  print(f"SPACE_ID: {os.getenv('SPACE_ID')}")
 
20
  print(f"HF_TOKEN present: {bool(os.getenv('HF_TOKEN'))}")
21
  print(f"Gradio Version: {gr.__version__}")
22
  print("-------------------------")
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  # --- Basic Agent Definition ---
25
  class BasicAgent:
26
  def __init__(self):
@@ -45,21 +80,55 @@ class BasicAgent:
45
  # content is the response from the agent
46
  content = result["messages"][-1].content
47
 
48
- # Clean up response if it's a list or has prefixes
49
  if isinstance(content, list):
50
  content = " ".join([str(item) for item in content])
51
 
52
- # Remove "Final Answer:" prefix case-insensitively
 
 
 
53
  import re
54
- content = re.sub(r"^Final Answer:\s*", "", content, flags=re.IGNORECASE).strip()
55
- # Also remove "Answer:" if present
56
- content = re.sub(r"^Answer:\s*", "", content, flags=re.IGNORECASE).strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
- print(f"Agent returning answer: {content[:100]}...")
 
 
 
59
  return content
60
  except Exception as e:
61
- print(f"Error invoking agent: {e}")
62
- return f"Error: {e}"
 
 
 
 
 
 
63
 
64
  def run_and_submit_all( profile: gr.OAuthProfile | None):
65
  """
@@ -115,12 +184,19 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
115
  results_log = []
116
  answers_payload = []
117
  print(f"Running agent on {len(questions_data)} questions...")
118
- for item in questions_data:
 
 
 
 
 
119
  task_id = item.get("task_id")
120
  question_text = item.get("question")
121
  if not task_id or question_text is None:
122
  print(f"Skipping item with missing task_id or question: {item}")
123
  continue
 
 
124
  try:
125
  submitted_answer = agent(question_text)
126
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
@@ -128,6 +204,12 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
128
  except Exception as e:
129
  print(f"Error running agent on task {task_id}: {e}")
130
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
 
 
 
 
 
 
131
 
132
  if not answers_payload:
133
  print("Agent did not produce any answers to submit.")
 
17
  # Debug Environment
18
  print("--- Environment Debug ---")
19
  print(f"SPACE_ID: {os.getenv('SPACE_ID')}")
20
+ print(f"SPACE_HOST: {os.getenv('SPACE_HOST')}")
21
  print(f"HF_TOKEN present: {bool(os.getenv('HF_TOKEN'))}")
22
  print(f"Gradio Version: {gr.__version__}")
23
  print("-------------------------")
24
 
25
+ # CRITICAL FIX: Derive SPACE_ID from SPACE_HOST if not set
26
+ # HF Spaces sets SPACE_HOST (e.g., "vinhle-first-agent-template.hf.space")
27
+ # but not always SPACE_ID in Docker containers
28
+ if not os.getenv("SPACE_ID") and os.getenv("SPACE_HOST"):
29
+ space_host = os.getenv("SPACE_HOST")
30
+ # Parse: "username-spacename.hf.space" -> "username/spacename"
31
+ if space_host.endswith(".hf.space"):
32
+ space_slug = space_host.replace(".hf.space", "")
33
+ # Convert "vinhle-first-agent-template" to "vinhle/first_agent_template"
34
+ parts = space_slug.split("-", 1) # Split on first hyphen only
35
+ if len(parts) == 2:
36
+ username, space_name = parts
37
+ space_id = f"{username}/{space_name.replace('-', '_')}"
38
+ os.environ["SPACE_ID"] = space_id
39
+ print(f"✅ Derived SPACE_ID from SPACE_HOST: {space_id}")
40
+ else:
41
+ print(f"⚠️ Could not parse SPACE_HOST: {space_host}")
42
+
43
+
44
+ # Display configured model
45
+ try:
46
+ import json
47
+ with open("agent.json", "r") as f:
48
+ config = json.load(f)
49
+ model_config = config.get("model", {}).get("data", {})
50
+ model_id = model_config.get("model_id", "Unknown")
51
+ base_url = model_config.get("base_url", "Unknown")
52
+ print(f"\n🤖 Configured Model: {model_id}")
53
+ print(f" Provider: {base_url}")
54
+ print()
55
+ except Exception as e:
56
+ print(f"⚠️ Could not load model config: {e}\n")
57
+
58
+
59
  # --- Basic Agent Definition ---
60
  class BasicAgent:
61
  def __init__(self):
 
80
  # content is the response from the agent
81
  content = result["messages"][-1].content
82
 
83
+ # Clean up response if it's a list
84
  if isinstance(content, list):
85
  content = " ".join([str(item) for item in content])
86
 
87
+ # DEBUG: Show full raw response (first 500 chars)
88
+ print(f"Raw model response: {content[:500]}...")
89
+
90
+ # Extract ONLY the final answer
91
  import re
92
+ original_content = content
93
+
94
+ # Strategy 1: Look for "FINAL ANSWER:" (case-insensitive) and extract everything after it
95
+ final_answer_match = re.search(r'FINAL\s+ANSWER:\s*(.+?)(?:\s*</think>|$)', content, re.IGNORECASE | re.DOTALL)
96
+ if final_answer_match:
97
+ content = final_answer_match.group(1).strip()
98
+ print("✅ Extracted using FINAL ANSWER pattern")
99
+ else:
100
+ # Strategy 2: If no "FINAL ANSWER:", try to extract text after </think> tag
101
+ think_match = re.search(r'</think>\s*(.+)$', content, re.DOTALL)
102
+ if think_match:
103
+ content = think_match.group(1).strip()
104
+ print("✅ Extracted text after </think> tag")
105
+ else:
106
+ # Strategy 3: Remove all <think>...</think> blocks entirely
107
+ content = re.sub(r'<think>.*?</think>', '', content, flags=re.DOTALL).strip()
108
+ print("✅ Removed <think> blocks")
109
+
110
+ # If nothing remains, the model didn't follow format - return error
111
+ if not content:
112
+ print("⚠️ Model output only contained reasoning, no answer found!")
113
+ return "ERROR: Model did not provide a final answer"
114
+
115
+ # Remove any remaining XML-like tags
116
+ content = re.sub(r'<[^>]+>', '', content).strip()
117
 
118
+ # Remove any leading "Answer:" or "Final Answer:" that might remain
119
+ content = re.sub(r'^(Final\s+)?Answer:\s*', '', content, flags=re.IGNORECASE).strip()
120
+
121
+ print(f"📤 Submitting answer: '{content}'")
122
  return content
123
  except Exception as e:
124
+ error_msg = str(e)
125
+ # Check if it's a rate limit error
126
+ if "429" in error_msg or "rate limit" in error_msg.lower():
127
+ print(f"⚠️ Rate limit exceeded (429): {e}")
128
+ return "ERROR: Rate limit exceeded"
129
+ else:
130
+ print(f"Error invoking agent: {e}")
131
+ return f"Error: {e}"
132
 
133
  def run_and_submit_all( profile: gr.OAuthProfile | None):
134
  """
 
184
  results_log = []
185
  answers_payload = []
186
  print(f"Running agent on {len(questions_data)} questions...")
187
+
188
+ # Add delay between requests to avoid rate limiting
189
+ import time
190
+ DELAY_BETWEEN_REQUESTS = 3 # seconds - adjust as needed
191
+
192
+ for idx, item in enumerate(questions_data, 1):
193
  task_id = item.get("task_id")
194
  question_text = item.get("question")
195
  if not task_id or question_text is None:
196
  print(f"Skipping item with missing task_id or question: {item}")
197
  continue
198
+
199
+ print(f"\n📝 Processing question {idx}/{len(questions_data)}...")
200
  try:
201
  submitted_answer = agent(question_text)
202
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
 
204
  except Exception as e:
205
  print(f"Error running agent on task {task_id}: {e}")
206
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
207
+
208
+ # Add delay between requests (except after the last one)
209
+ if idx < len(questions_data):
210
+ print(f"⏳ Waiting {DELAY_BETWEEN_REQUESTS}s before next request to avoid rate limiting...")
211
+ time.sleep(DELAY_BETWEEN_REQUESTS)
212
+
213
 
214
  if not answers_payload:
215
  print("Agent did not produce any answers to submit.")
logic.py DELETED
@@ -1,104 +0,0 @@
1
- import os
2
- import logging
3
- from typing import List, Tuple, Optional
4
-
5
- logging.basicConfig(level=logging.INFO)
6
- logger = logging.getLogger(__name__)
7
-
8
- try:
9
- from agent import build_graph
10
- AGENT_AVAILABLE = True
11
- logger.info("Agent successfully imported!")
12
- except ImportError as e:
13
- AGENT_AVAILABLE = False
14
- logger.error(f"Could not import 'agent.build_graph': {str(e)}")
15
- import traceback
16
- traceback.print_exc()
17
-
18
- class GaiaApp:
19
- def __init__(self):
20
- self.agent = None
21
-
22
- def _ensure_agent(self):
23
- if self.agent is None:
24
- logger.info("Initializing Agent...")
25
- self.agent = build_graph() if AGENT_AVAILABLE else None
26
- logger.info("Agent initialization complete.")
27
-
28
- def process_input(self, user_message: str, history: List[dict], uploaded_files: Optional[List[str]]):
29
- """
30
- Main handler for chat input.
31
- Args:
32
- user_message: The text input from the user.
33
- history: The existing chat history (list of message dicts).
34
- uploaded_files: List of file paths.
35
- """
36
- if not user_message and not uploaded_files:
37
- return "", history, None
38
-
39
- self._ensure_agent()
40
-
41
- # 1. Process Files
42
- context_msg = ""
43
- if uploaded_files:
44
- file_names = [os.path.basename(f) for f in uploaded_files]
45
- context_msg = f"\n[User uploaded files: {', '.join(file_names)}]"
46
-
47
- full_query = user_message + context_msg
48
-
49
- # 2. Append User Message to History immediately for UI update
50
- current_history = history + [{"role": "user", "content": user_message}]
51
-
52
- # 3. Yield back immediately to show user message
53
- yield "", current_history, None
54
-
55
- # 4. Invoke Agent
56
- try:
57
- # Prepare messages for LangChain/Agent
58
- # (Simplification: just sending last message)
59
- from langchain_core.messages import HumanMessage
60
-
61
- inputs = {"messages": [HumanMessage(content=full_query)]}
62
- result = self.agent.invoke(inputs)
63
-
64
- # Extract response
65
- # Assuming standard LangGraph/LangChain output
66
- if isinstance(result, dict) and 'messages' in result:
67
- last_msg = result['messages'][-1]
68
- # Handle both Message objects and dicts
69
- if hasattr(last_msg, 'content'):
70
- bot_response = last_msg.content
71
- elif isinstance(last_msg, dict):
72
- bot_response = last_msg.get('content', str(last_msg))
73
- else:
74
- bot_response = str(last_msg)
75
- else:
76
- bot_response = str(result)
77
-
78
- # Clean up response prefixes if present
79
- if isinstance(bot_response, list):
80
- # If content is a list of blocks, join them or take the first text block
81
- bot_response = " ".join([str(item) for item in bot_response])
82
-
83
- if isinstance(bot_response, str) and bot_response.startswith("Assistant:"):
84
- bot_response = bot_response.replace("Assistant:", "").strip()
85
-
86
- # 5. Stream/Update Bot Response
87
- current_history.append({"role": "assistant", "content": bot_response})
88
- yield "", current_history, None
89
-
90
- except Exception as e:
91
- logger.error(f"Error invoking agent: {e}")
92
- error_msg = f"Error: {str(e)}"
93
- current_history.append({"role": "assistant", "content": error_msg})
94
- yield "", current_history, None
95
-
96
- def create_new_chat(self):
97
- """Resets the state."""
98
- return [], None, ""
99
-
100
- def load_example(self, prompt):
101
- return prompt
102
-
103
- # Singleton instance for the app
104
- gaia_logic = GaiaApp()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
metadata.jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
multimodal_tools.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Multimodal Analysis Tools using OpenRouter Vision Models
3
+ """
4
+ import os
5
+ import base64
6
+ import tempfile
7
+ import requests
8
+ import numpy as np
9
+ from langchain_core.tools import tool
10
+
11
+
12
def _sniff_image_mime(image_base64: str) -> str:
    """Guess an image MIME type from the start of its base64 encoding.

    The prefixes below are the base64 encodings of the formats' magic bytes.
    Unrecognised data falls back to "image/jpeg", which is what this module
    always sent before the sniffing was added.
    """
    signatures = (
        ("iVBORw0KGgo", "image/png"),   # \x89PNG\r\n\x1a\n
        ("/9j/", "image/jpeg"),         # \xff\xd8\xff
        ("R0lGOD", "image/gif"),        # GIF87a / GIF89a
        ("UklGR", "image/webp"),        # RIFF container (WebP)
    )
    return next(
        (mime for prefix, mime in signatures if image_base64.startswith(prefix)),
        "image/jpeg",
    )


def call_openrouter_vision(
    model: str,
    question: str,
    image_base64: str = None,
    fallback_model: str = None
) -> str:
    """
    Call OpenRouter vision model for image analysis.

    Args:
        model: Model ID (e.g., "qwen/qwen3-vl-30b-a3b-thinking")
        question: Question about the image
        image_base64: Base64 encoded image (optional; text-only when omitted)
        fallback_model: Fallback model if primary fails

    Returns:
        Model's response text

    Raises:
        ValueError: If OPENROUTER_API_KEY is not set in the environment.
        Exception: If the request fails and no (different) fallback model
            is available; the original error is attached as the cause.
    """
    api_key = os.getenv("OPENROUTER_API_KEY")
    if not api_key:
        raise ValueError("OPENROUTER_API_KEY not found in environment")

    content = [{"type": "text", "text": question}]

    # Add image if provided. The data URL must advertise the real format:
    # callers pass raw file bytes that are not necessarily JPEG.
    if image_base64:
        mime_type = _sniff_image_mime(image_base64)
        content.append({
            "type": "image_url",
            "image_url": {
                "url": f"data:{mime_type};base64,{image_base64}"
            }
        })

    messages = [{"role": "user", "content": content}]

    try:
        response = requests.post(
            "https://openrouter.ai/api/v1/chat/completions",
            headers={
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json"
            },
            json={
                "model": model,
                "messages": messages,
                "max_tokens": 2048
            },
            timeout=60
        )
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"]
    except Exception as e:
        # Retry once with the fallback model; the recursive call passes no
        # fallback of its own, so this cannot loop.
        if fallback_model and fallback_model != model:
            print(f"Primary model {model} failed, trying fallback {fallback_model}: {e}")
            return call_openrouter_vision(fallback_model, question, image_base64)
        # Chain the cause so the original traceback is not lost.
        raise Exception(f"OpenRouter vision call failed: {e}") from e
73
+
74
+
75
@tool
def vision_analyze_image(question: str, image_path: str) -> str:
    """
    Analyze an image using AI vision model to answer questions about it.
    Use this for semantic understanding of images (chess positions, charts, diagrams, screenshots, etc.)

    Args:
        question: Question about the image
        image_path: Path to image file

    Returns:
        Analysis result from vision model
    """
    # NOTE(review): the docstring above is kept verbatim; with @tool it is
    # presumably what the agent sees as the tool description -- confirm before
    # rewording it.
    try:
        # Read the raw file bytes, then base64-encode them for the API payload.
        with open(image_path, "rb") as image_file:
            raw_bytes = image_file.read()
        encoded_image = base64.b64encode(raw_bytes).decode("utf-8")

        # Primary vision model first; call_openrouter_vision handles the
        # switch to the fallback model on failure.
        return call_openrouter_vision(
            model="qwen/qwen3-vl-30b-a3b-thinking",
            question=question,
            image_base64=encoded_image,
            fallback_model="google/gemini-2.5-flash",
        )
    except Exception as err:
        # Failures are reported as text so the calling agent gets a string back.
        return f"Error analyzing image: {err}"
103
+
104
+
105
+ import subprocess
106
+ import glob
107
+
108
+ # Video analysis below reuses call_openrouter_vision, defined above.
109
+
110
@tool
def vision_analyze_video(question: str, video_path: str, num_frames: int = 5) -> str:
    """
    Analyze a video file by extracting key frames using FFmpeg.

    Args:
        question: Question about the video
        video_path: Path to video file
        num_frames: Number of frames to extract

    Returns:
        Analysis result combining insights from all frames
    """
    try:
        if not os.path.isfile(video_path):
            return f"Error analyzing video: file not found: {video_path}"

        # Guard against zero/negative requests, which would otherwise select
        # no frames at all.
        num_frames = max(1, int(num_frames))

        with tempfile.TemporaryDirectory() as tmpdir:
            # Sample one frame per second. The previous command passed two
            # -vf options, the first being an invalid expression
            # ("fps=N/(duration)"); only a single filter is given now.
            # -nostdin keeps ffmpeg from reading the parent's stdin.
            subprocess.run([
                "ffmpeg", "-nostdin", "-i", video_path,
                "-vf", "fps=1",
                os.path.join(tmpdir, "frame_%03d.jpg")
            ], check=False, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

            frames = sorted(glob.glob(os.path.join(tmpdir, "*.jpg")))

            if not frames:
                # Fallback: grab the first num_frames frames of the stream.
                # Zero-padded names keep the lexicographic sort in frame order.
                subprocess.run([
                    "ffmpeg", "-nostdin", "-i", video_path,
                    "-vframes", str(num_frames),
                    os.path.join(tmpdir, "thumb_%03d.jpg")
                ], check=False, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
                frames = sorted(glob.glob(os.path.join(tmpdir, "*.jpg")))

            if not frames:
                # Report the failure instead of returning an empty
                # "0 frames" analysis that looks like a valid result.
                return (
                    "Error analyzing video: FFmpeg did not produce any frames "
                    f"for {os.path.basename(video_path)}"
                )

            # Pick num_frames evenly spaced
            if len(frames) > num_frames:
                indices = np.linspace(0, len(frames) - 1, num_frames, dtype=int)
                selected_frames = [frames[i] for i in indices]
            else:
                selected_frames = frames

            frames_analysis = []
            for idx, frame_path in enumerate(selected_frames, start=1):
                with open(frame_path, "rb") as f:
                    frame_b64 = base64.b64encode(f.read()).decode("utf-8")

                # Each frame is analysed independently with the same question.
                analysis = call_openrouter_vision(
                    model="qwen/qwen3-vl-30b-a3b-thinking",
                    question=f"Frame {idx}: {question}",
                    image_base64=frame_b64,
                    fallback_model="google/gemini-2.5-flash"
                )
                frames_analysis.append(f"Frame {idx}: {analysis}")

            combined = "\n\n".join(frames_analysis)
            return f"Video analysis ({len(selected_frames)} frames extracted via FFmpeg):\n{combined}"

    except Exception as e:
        # Includes FileNotFoundError when the ffmpeg binary itself is missing.
        return f"Error analyzing video: {str(e)}"
177
+
178
+
179
+
180
+
181
@tool
def vision_analyze_document(question: str, file_path: str) -> str:
    """
    Analyze a document (TXT/MD) using AI.
    For PDF or other formats, please use Code Interpreter to extract text or convert to images first.

    Args:
        question: Question about the document
        file_path: Path to document file

    Returns:
        Analysis result from document content
    """
    # NOTE(review): the docstring above is kept verbatim; with @tool it is
    # presumably what the agent sees as the tool description -- confirm before
    # rewording it.
    # Plain-text extensions that can be read directly and sent as prompt text.
    readable_suffixes = ('.txt', '.md', '.py', '.js', '.json', '.html', '.css', '.csv')
    try:
        # Guard clause: anything else must be converted by another tool first.
        if not file_path.lower().endswith(readable_suffixes):
            return f"Direct analysis for this file type ({os.path.basename(file_path)}) is not supported directly. Please use the Code Interpreter tool to read/convert this file first."

        with open(file_path, 'r', encoding='utf-8') as handle:
            document_text = handle.read()

        # Only the first 15000 characters are sent, to bound the prompt size.
        prompt = f"{question}\n\nDocument content:\n{document_text[:15000]}"
        return call_openrouter_vision(
            model="google/gemini-3-flash-preview",
            question=prompt,
            fallback_model="google/gemini-2.5-flash",
        )
    except Exception as err:
        # Read/decoding/API failures are reported as text, not raised.
        return f"Error analyzing document: {err}"
requirements.txt CHANGED
@@ -12,20 +12,18 @@ langchain
12
  langchain-community
13
  langchain-core
14
  langchain-huggingface
 
15
  langchain-tavily
16
  langgraph
17
  huggingface_hub
18
  supabase>=2.0.0
19
  arxiv
20
- pymupdf
21
  wikipedia
22
  pgvector
23
  python-dotenv
24
- pytesseract
25
  matplotlib
26
  sentence_transformers
27
  numpy
28
  tavily-python
29
- opencv-python
30
- yt-dlp
31
  langchain-openai
 
12
  langchain-community
13
  langchain-core
14
  langchain-huggingface
15
+ huggingface-hub>=0.20.0
16
  langchain-tavily
17
  langgraph
18
  huggingface_hub
19
  supabase>=2.0.0
20
  arxiv
21
+
22
  wikipedia
23
  pgvector
24
  python-dotenv
 
25
  matplotlib
26
  sentence_transformers
27
  numpy
28
  tavily-python
 
 
29
  langchain-openai