Spaces:
Sleeping
Sleeping
LeVinh committed on
Commit ·
c09239f
1
Parent(s): e284096
fix...
Browse files- Dockerfile +0 -1
- agent.json +4 -4
- agent.py +66 -38
- eval.py +91 -9
- logic.py +0 -104
- metadata.jsonl +0 -0
- multimodal_tools.py +214 -0
- requirements.txt +2 -4
Dockerfile
CHANGED
|
@@ -10,7 +10,6 @@ RUN apt-get update && apt-get install -y \
|
|
| 10 |
libxext6 \
|
| 11 |
cmake \
|
| 12 |
libgl1 \
|
| 13 |
-
tesseract-ocr \
|
| 14 |
curl \
|
| 15 |
&& rm -rf /var/lib/apt/lists/*
|
| 16 |
|
|
|
|
| 10 |
libxext6 \
|
| 11 |
cmake \
|
| 12 |
libgl1 \
|
|
|
|
| 13 |
curl \
|
| 14 |
&& rm -rf /var/lib/apt/lists/*
|
| 15 |
|
agent.json
CHANGED
|
@@ -7,13 +7,13 @@
|
|
| 7 |
"model": {
|
| 8 |
"class": "ChatOpenAI",
|
| 9 |
"data": {
|
| 10 |
-
"max_tokens":
|
| 11 |
"temperature": 0.01,
|
| 12 |
"last_input_token_count": null,
|
| 13 |
"last_output_token_count": null,
|
| 14 |
-
"model_id": "
|
| 15 |
-
"base_url": "https://
|
| 16 |
-
"api_key_env": "
|
| 17 |
"custom_role_conversions": null
|
| 18 |
}
|
| 19 |
},
|
|
|
|
| 7 |
"model": {
|
| 8 |
"class": "ChatOpenAI",
|
| 9 |
"data": {
|
| 10 |
+
"max_tokens": 2048,
|
| 11 |
"temperature": 0.01,
|
| 12 |
"last_input_token_count": null,
|
| 13 |
"last_output_token_count": null,
|
| 14 |
+
"model_id": "qwen/qwen3-32b",
|
| 15 |
+
"base_url": "https://openrouter.ai/api/v1",
|
| 16 |
+
"api_key_env": "OPENROUTER_API_KEY",
|
| 17 |
"custom_role_conversions": null
|
| 18 |
}
|
| 19 |
},
|
agent.py
CHANGED
|
@@ -9,9 +9,7 @@ import numpy as np
|
|
| 9 |
import pandas as pd
|
| 10 |
from typing import List, Dict, Any, Optional
|
| 11 |
from urllib.parse import urlparse
|
| 12 |
-
|
| 13 |
-
import cv2
|
| 14 |
-
import yt_dlp
|
| 15 |
from PIL import Image, ImageDraw, ImageFont, ImageEnhance, ImageFilter
|
| 16 |
from dotenv import load_dotenv
|
| 17 |
|
|
@@ -32,6 +30,7 @@ from langchain_huggingface import (
|
|
| 32 |
HuggingFaceEndpoint,
|
| 33 |
HuggingFaceEmbeddings,
|
| 34 |
)
|
|
|
|
| 35 |
from langchain_core.messages import SystemMessage, HumanMessage
|
| 36 |
from langchain_core.tools import tool, Tool
|
| 37 |
from supabase.client import Client, create_client
|
|
@@ -40,9 +39,13 @@ from supabase.client import Client, create_client
|
|
| 40 |
from code_interpreter import CodeInterpreter
|
| 41 |
from img_processing import decode_image, encode_image, save_image
|
| 42 |
from dotenv import load_dotenv
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
load_dotenv()
|
| 45 |
-
# Configure Gemini API key
|
| 46 |
|
| 47 |
|
| 48 |
interpreter_instance = CodeInterpreter()
|
|
@@ -298,20 +301,15 @@ def download_file_from_url(url: str, filename: Optional[str] = None) -> str:
|
|
| 298 |
@tool
|
| 299 |
def extract_text_from_image(image_path: str) -> str:
|
| 300 |
"""
|
| 301 |
-
Extract text from an image using OCR.
|
| 302 |
-
|
| 303 |
Args:
|
| 304 |
image_path (str): The path to the image file.
|
| 305 |
|
| 306 |
Returns:
|
| 307 |
str: Extracted text or error message.
|
| 308 |
"""
|
| 309 |
-
|
| 310 |
-
image = Image.open(image_path)
|
| 311 |
-
text = pytesseract.image_to_string(image)
|
| 312 |
-
return f"Extracted text from image:\n\n{text}"
|
| 313 |
-
except Exception as e:
|
| 314 |
-
return f"Error extracting text from image: {str(e)}"
|
| 315 |
|
| 316 |
|
| 317 |
@tool
|
|
@@ -693,6 +691,11 @@ tools = [
|
|
| 693 |
draw_on_image,
|
| 694 |
generate_simple_image,
|
| 695 |
combine_images,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 696 |
]
|
| 697 |
|
| 698 |
|
|
@@ -762,36 +765,61 @@ def build_graph(provider: str = None):
|
|
| 762 |
# Default fallback
|
| 763 |
provider = "openai" # Default to openai as fallback
|
| 764 |
|
| 765 |
-
if provider == "
|
| 766 |
-
#
|
| 767 |
-
|
| 768 |
-
|
| 769 |
-
if not model_id.startswith("gemini"):
|
| 770 |
-
model_id = "gemini-1.5-flash"
|
| 771 |
-
|
| 772 |
-
llm = ChatGoogleGenerativeAI(model=model_id)
|
| 773 |
-
print(f"Using Google provider with model: {model_id}")
|
| 774 |
|
| 775 |
-
|
| 776 |
-
|
| 777 |
-
|
| 778 |
-
|
| 779 |
-
|
| 780 |
-
|
| 781 |
-
|
| 782 |
-
|
| 783 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 784 |
temperature=model_data.get("temperature", 0.01),
|
| 785 |
-
)
|
| 786 |
-
|
| 787 |
-
|
| 788 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 789 |
elif provider == "openai":
|
| 790 |
-
|
| 791 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 792 |
api_key = os.getenv(api_key_env)
|
| 793 |
if not api_key:
|
| 794 |
-
|
| 795 |
|
| 796 |
llm = ChatOpenAI(
|
| 797 |
model=model_id,
|
|
@@ -804,7 +832,7 @@ def build_graph(provider: str = None):
|
|
| 804 |
|
| 805 |
else:
|
| 806 |
# Fallback or error if other providers are requested but not implemented
|
| 807 |
-
raise ValueError(f"Invalid provider: {provider}. Supported: '
|
| 808 |
|
| 809 |
llm_with_tools = llm.bind_tools(tools)
|
| 810 |
|
|
|
|
| 9 |
import pandas as pd
|
| 10 |
from typing import List, Dict, Any, Optional
|
| 11 |
from urllib.parse import urlparse
|
| 12 |
+
|
|
|
|
|
|
|
| 13 |
from PIL import Image, ImageDraw, ImageFont, ImageEnhance, ImageFilter
|
| 14 |
from dotenv import load_dotenv
|
| 15 |
|
|
|
|
| 30 |
HuggingFaceEndpoint,
|
| 31 |
HuggingFaceEmbeddings,
|
| 32 |
)
|
| 33 |
+
from huggingface_hub import InferenceClient
|
| 34 |
from langchain_core.messages import SystemMessage, HumanMessage
|
| 35 |
from langchain_core.tools import tool, Tool
|
| 36 |
from supabase.client import Client, create_client
|
|
|
|
| 39 |
from code_interpreter import CodeInterpreter
|
| 40 |
from img_processing import decode_image, encode_image, save_image
|
| 41 |
from dotenv import load_dotenv
|
| 42 |
+
from multimodal_tools import (
|
| 43 |
+
vision_analyze_image,
|
| 44 |
+
vision_analyze_video,
|
| 45 |
+
vision_analyze_document
|
| 46 |
+
)
|
| 47 |
|
| 48 |
load_dotenv()
|
|
|
|
| 49 |
|
| 50 |
|
| 51 |
interpreter_instance = CodeInterpreter()
|
|
|
|
| 301 |
@tool
def extract_text_from_image(image_path: str) -> str:
    """
    Extract text from an image using AI Vision (OCR).

    Args:
        image_path (str): The path to the image file.

    Returns:
        str: Extracted text or error message.
    """
    # vision_analyze_image is wrapped by @tool, so it is a LangChain tool
    # object rather than a plain function. Calling it positionally would not
    # bind the second argument to image_path; tools with several arguments
    # must be run through invoke() with a dict keyed by parameter name.
    return vision_analyze_image.invoke(
        {
            "question": "Transcribe all text from this image verbatim.",
            "image_path": image_path,
        }
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 313 |
|
| 314 |
|
| 315 |
@tool
|
|
|
|
| 691 |
draw_on_image,
|
| 692 |
generate_simple_image,
|
| 693 |
combine_images,
|
| 694 |
+
|
| 695 |
+
# Multimodal vision tools (OpenRouter)
|
| 696 |
+
vision_analyze_image,
|
| 697 |
+
vision_analyze_video,
|
| 698 |
+
vision_analyze_document,
|
| 699 |
]
|
| 700 |
|
| 701 |
|
|
|
|
| 765 |
# Default fallback
|
| 766 |
provider = "openai" # Default to openai as fallback
|
| 767 |
|
| 768 |
+
if provider == "huggingface":
|
| 769 |
+
# All config must come from agent.json
|
| 770 |
+
if "model_id" not in model_data:
|
| 771 |
+
raise ValueError("model_id is required in agent.json for HuggingFace provider")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 772 |
|
| 773 |
+
model_id = model_data["model_id"]
|
| 774 |
+
|
| 775 |
+
# Parse provider suffix from model_id (e.g., "Qwen/Qwen3-32B:cerebras")
|
| 776 |
+
# Format: "model_name" or "model_name:provider"
|
| 777 |
+
if ":" in model_id:
|
| 778 |
+
# Use router for third-party providers (cerebras, novita, etc.)
|
| 779 |
+
model_name, provider_suffix = model_id.rsplit(":", 1)
|
| 780 |
+
print(f"Using HuggingFace Router with model: {model_name}, provider: {provider_suffix}")
|
| 781 |
+
|
| 782 |
+
api_key = os.getenv("HF_TOKEN")
|
| 783 |
+
if not api_key:
|
| 784 |
+
raise ValueError("HF_TOKEN not found in environment variables")
|
| 785 |
+
|
| 786 |
+
# Use router for third-party providers
|
| 787 |
+
llm = ChatOpenAI(
|
| 788 |
+
model=model_id, # Full model_id with provider suffix
|
| 789 |
+
base_url="https://router.huggingface.co/v1",
|
| 790 |
+
api_key=api_key,
|
| 791 |
+
max_tokens=model_data.get("max_tokens", 4096),
|
| 792 |
temperature=model_data.get("temperature", 0.01),
|
| 793 |
+
)
|
| 794 |
+
else:
|
| 795 |
+
# Use free serverless inference (no provider suffix)
|
| 796 |
+
print(f"Using HuggingFace Serverless Inference with model: {model_id}")
|
| 797 |
+
|
| 798 |
+
# Construct the serverless inference API URL to bypass router
|
| 799 |
+
# Format: https://api-inference.huggingface.co/models/{model_id}
|
| 800 |
+
serverless_url = f"https://api-inference.huggingface.co/models/{model_id}"
|
| 801 |
+
|
| 802 |
+
llm = ChatHuggingFace(
|
| 803 |
+
llm=HuggingFaceEndpoint(
|
| 804 |
+
endpoint_url=serverless_url, # Use direct serverless API
|
| 805 |
+
task="text-generation",
|
| 806 |
+
max_new_tokens=model_data.get("max_tokens", 4096),
|
| 807 |
+
do_sample=False,
|
| 808 |
+
repetition_penalty=1.03,
|
| 809 |
+
temperature=model_data.get("temperature", 0.01),
|
| 810 |
+
),
|
| 811 |
+
verbose=True,
|
| 812 |
+
)
|
| 813 |
elif provider == "openai":
|
| 814 |
+
# All config must come from agent.json
|
| 815 |
+
if "model_id" not in model_data:
|
| 816 |
+
raise ValueError("model_id is required in agent.json for OpenAI provider")
|
| 817 |
+
|
| 818 |
+
model_id = model_data["model_id"]
|
| 819 |
+
api_key_env = model_data.get("api_key_env", "OPENAI_API_KEY") # Keep this default for compatibility
|
| 820 |
api_key = os.getenv(api_key_env)
|
| 821 |
if not api_key:
|
| 822 |
+
raise ValueError(f"{api_key_env} not found in environment variables")
|
| 823 |
|
| 824 |
llm = ChatOpenAI(
|
| 825 |
model=model_id,
|
|
|
|
| 832 |
|
| 833 |
else:
|
| 834 |
# Fallback or error if other providers are requested but not implemented
|
| 835 |
+
raise ValueError(f"Invalid provider: {provider}. Supported: 'huggingface', 'openai'.")
|
| 836 |
|
| 837 |
llm_with_tools = llm.bind_tools(tools)
|
| 838 |
|
eval.py
CHANGED
|
@@ -17,10 +17,45 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
|
| 17 |
# Debug Environment
|
| 18 |
print("--- Environment Debug ---")
|
| 19 |
print(f"SPACE_ID: {os.getenv('SPACE_ID')}")
|
|
|
|
| 20 |
print(f"HF_TOKEN present: {bool(os.getenv('HF_TOKEN'))}")
|
| 21 |
print(f"Gradio Version: {gr.__version__}")
|
| 22 |
print("-------------------------")
|
| 23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
# --- Basic Agent Definition ---
|
| 25 |
class BasicAgent:
|
| 26 |
def __init__(self):
|
|
@@ -45,21 +80,55 @@ class BasicAgent:
|
|
| 45 |
# content is the response from the agent
|
| 46 |
content = result["messages"][-1].content
|
| 47 |
|
| 48 |
-
# Clean up response if it's a list
|
| 49 |
if isinstance(content, list):
|
| 50 |
content = " ".join([str(item) for item in content])
|
| 51 |
|
| 52 |
-
#
|
|
|
|
|
|
|
|
|
|
| 53 |
import re
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
-
|
|
|
|
|
|
|
|
|
|
| 59 |
return content
|
| 60 |
except Exception as e:
|
| 61 |
-
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
def run_and_submit_all( profile: gr.OAuthProfile | None):
|
| 65 |
"""
|
|
@@ -115,12 +184,19 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
| 115 |
results_log = []
|
| 116 |
answers_payload = []
|
| 117 |
print(f"Running agent on {len(questions_data)} questions...")
|
| 118 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
task_id = item.get("task_id")
|
| 120 |
question_text = item.get("question")
|
| 121 |
if not task_id or question_text is None:
|
| 122 |
print(f"Skipping item with missing task_id or question: {item}")
|
| 123 |
continue
|
|
|
|
|
|
|
| 124 |
try:
|
| 125 |
submitted_answer = agent(question_text)
|
| 126 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
|
@@ -128,6 +204,12 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
| 128 |
except Exception as e:
|
| 129 |
print(f"Error running agent on task {task_id}: {e}")
|
| 130 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
|
| 132 |
if not answers_payload:
|
| 133 |
print("Agent did not produce any answers to submit.")
|
|
|
|
| 17 |
# Debug Environment
|
| 18 |
print("--- Environment Debug ---")
|
| 19 |
print(f"SPACE_ID: {os.getenv('SPACE_ID')}")
|
| 20 |
+
print(f"SPACE_HOST: {os.getenv('SPACE_HOST')}")
|
| 21 |
print(f"HF_TOKEN present: {bool(os.getenv('HF_TOKEN'))}")
|
| 22 |
print(f"Gradio Version: {gr.__version__}")
|
| 23 |
print("-------------------------")
|
| 24 |
|
| 25 |
+
# CRITICAL FIX: Derive SPACE_ID from SPACE_HOST if not set
|
| 26 |
+
# HF Spaces sets SPACE_HOST (e.g., "vinhle-first-agent-template.hf.space")
|
| 27 |
+
# but not always SPACE_ID in Docker containers
|
| 28 |
+
if not os.getenv("SPACE_ID") and os.getenv("SPACE_HOST"):
|
| 29 |
+
space_host = os.getenv("SPACE_HOST")
|
| 30 |
+
# Parse: "username-spacename.hf.space" -> "username/spacename"
|
| 31 |
+
if space_host.endswith(".hf.space"):
|
| 32 |
+
space_slug = space_host.replace(".hf.space", "")
|
| 33 |
+
# Convert "vinhle-first-agent-template" to "vinhle/first_agent_template"
|
| 34 |
+
parts = space_slug.split("-", 1) # Split on first hyphen only
|
| 35 |
+
if len(parts) == 2:
|
| 36 |
+
username, space_name = parts
|
| 37 |
+
space_id = f"{username}/{space_name.replace('-', '_')}"
|
| 38 |
+
os.environ["SPACE_ID"] = space_id
|
| 39 |
+
print(f"✅ Derived SPACE_ID from SPACE_HOST: {space_id}")
|
| 40 |
+
else:
|
| 41 |
+
print(f"⚠️ Could not parse SPACE_HOST: {space_host}")
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# Display configured model
|
| 45 |
+
try:
|
| 46 |
+
import json
|
| 47 |
+
with open("agent.json", "r") as f:
|
| 48 |
+
config = json.load(f)
|
| 49 |
+
model_config = config.get("model", {}).get("data", {})
|
| 50 |
+
model_id = model_config.get("model_id", "Unknown")
|
| 51 |
+
base_url = model_config.get("base_url", "Unknown")
|
| 52 |
+
print(f"\n🤖 Configured Model: {model_id}")
|
| 53 |
+
print(f" Provider: {base_url}")
|
| 54 |
+
print()
|
| 55 |
+
except Exception as e:
|
| 56 |
+
print(f"⚠️ Could not load model config: {e}\n")
|
| 57 |
+
|
| 58 |
+
|
| 59 |
# --- Basic Agent Definition ---
|
| 60 |
class BasicAgent:
|
| 61 |
def __init__(self):
|
|
|
|
| 80 |
# content is the response from the agent
|
| 81 |
content = result["messages"][-1].content
|
| 82 |
|
| 83 |
+
# Clean up response if it's a list
|
| 84 |
if isinstance(content, list):
|
| 85 |
content = " ".join([str(item) for item in content])
|
| 86 |
|
| 87 |
+
# DEBUG: Show full raw response (first 500 chars)
|
| 88 |
+
print(f"Raw model response: {content[:500]}...")
|
| 89 |
+
|
| 90 |
+
# Extract ONLY the final answer
|
| 91 |
import re
|
| 92 |
+
original_content = content
|
| 93 |
+
|
| 94 |
+
# Strategy 1: Look for "FINAL ANSWER:" (case-insensitive) and extract everything after it
|
| 95 |
+
final_answer_match = re.search(r'FINAL\s+ANSWER:\s*(.+?)(?:\s*</think>|$)', content, re.IGNORECASE | re.DOTALL)
|
| 96 |
+
if final_answer_match:
|
| 97 |
+
content = final_answer_match.group(1).strip()
|
| 98 |
+
print("✅ Extracted using FINAL ANSWER pattern")
|
| 99 |
+
else:
|
| 100 |
+
# Strategy 2: If no "FINAL ANSWER:", try to extract text after </think> tag
|
| 101 |
+
think_match = re.search(r'</think>\s*(.+)$', content, re.DOTALL)
|
| 102 |
+
if think_match:
|
| 103 |
+
content = think_match.group(1).strip()
|
| 104 |
+
print("✅ Extracted text after </think> tag")
|
| 105 |
+
else:
|
| 106 |
+
# Strategy 3: Remove all <think>...</think> blocks entirely
|
| 107 |
+
content = re.sub(r'<think>.*?</think>', '', content, flags=re.DOTALL).strip()
|
| 108 |
+
print("✅ Removed <think> blocks")
|
| 109 |
+
|
| 110 |
+
# If nothing remains, the model didn't follow format - return error
|
| 111 |
+
if not content:
|
| 112 |
+
print("⚠️ Model output only contained reasoning, no answer found!")
|
| 113 |
+
return "ERROR: Model did not provide a final answer"
|
| 114 |
+
|
| 115 |
+
# Remove any remaining XML-like tags
|
| 116 |
+
content = re.sub(r'<[^>]+>', '', content).strip()
|
| 117 |
|
| 118 |
+
# Remove any leading "Answer:" or "Final Answer:" that might remain
|
| 119 |
+
content = re.sub(r'^(Final\s+)?Answer:\s*', '', content, flags=re.IGNORECASE).strip()
|
| 120 |
+
|
| 121 |
+
print(f"📤 Submitting answer: '{content}'")
|
| 122 |
return content
|
| 123 |
except Exception as e:
|
| 124 |
+
error_msg = str(e)
|
| 125 |
+
# Check if it's a rate limit error
|
| 126 |
+
if "429" in error_msg or "rate limit" in error_msg.lower():
|
| 127 |
+
print(f"⚠️ Rate limit exceeded (429): {e}")
|
| 128 |
+
return "ERROR: Rate limit exceeded"
|
| 129 |
+
else:
|
| 130 |
+
print(f"Error invoking agent: {e}")
|
| 131 |
+
return f"Error: {e}"
|
| 132 |
|
| 133 |
def run_and_submit_all( profile: gr.OAuthProfile | None):
|
| 134 |
"""
|
|
|
|
| 184 |
results_log = []
|
| 185 |
answers_payload = []
|
| 186 |
print(f"Running agent on {len(questions_data)} questions...")
|
| 187 |
+
|
| 188 |
+
# Add delay between requests to avoid rate limiting
|
| 189 |
+
import time
|
| 190 |
+
DELAY_BETWEEN_REQUESTS = 3 # seconds - adjust as needed
|
| 191 |
+
|
| 192 |
+
for idx, item in enumerate(questions_data, 1):
|
| 193 |
task_id = item.get("task_id")
|
| 194 |
question_text = item.get("question")
|
| 195 |
if not task_id or question_text is None:
|
| 196 |
print(f"Skipping item with missing task_id or question: {item}")
|
| 197 |
continue
|
| 198 |
+
|
| 199 |
+
print(f"\n📝 Processing question {idx}/{len(questions_data)}...")
|
| 200 |
try:
|
| 201 |
submitted_answer = agent(question_text)
|
| 202 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
|
|
|
| 204 |
except Exception as e:
|
| 205 |
print(f"Error running agent on task {task_id}: {e}")
|
| 206 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
|
| 207 |
+
|
| 208 |
+
# Add delay between requests (except after the last one)
|
| 209 |
+
if idx < len(questions_data):
|
| 210 |
+
print(f"⏳ Waiting {DELAY_BETWEEN_REQUESTS}s before next request to avoid rate limiting...")
|
| 211 |
+
time.sleep(DELAY_BETWEEN_REQUESTS)
|
| 212 |
+
|
| 213 |
|
| 214 |
if not answers_payload:
|
| 215 |
print("Agent did not produce any answers to submit.")
|
logic.py
DELETED
|
@@ -1,104 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import logging
|
| 3 |
-
from typing import List, Tuple, Optional
|
| 4 |
-
|
| 5 |
-
logging.basicConfig(level=logging.INFO)
|
| 6 |
-
logger = logging.getLogger(__name__)
|
| 7 |
-
|
| 8 |
-
try:
|
| 9 |
-
from agent import build_graph
|
| 10 |
-
AGENT_AVAILABLE = True
|
| 11 |
-
logger.info("Agent successfully imported!")
|
| 12 |
-
except ImportError as e:
|
| 13 |
-
AGENT_AVAILABLE = False
|
| 14 |
-
logger.error(f"Could not import 'agent.build_graph': {str(e)}")
|
| 15 |
-
import traceback
|
| 16 |
-
traceback.print_exc()
|
| 17 |
-
|
| 18 |
-
class GaiaApp:
|
| 19 |
-
def __init__(self):
|
| 20 |
-
self.agent = None
|
| 21 |
-
|
| 22 |
-
def _ensure_agent(self):
|
| 23 |
-
if self.agent is None:
|
| 24 |
-
logger.info("Initializing Agent...")
|
| 25 |
-
self.agent = build_graph() if AGENT_AVAILABLE else None
|
| 26 |
-
logger.info("Agent initialization complete.")
|
| 27 |
-
|
| 28 |
-
def process_input(self, user_message: str, history: List[dict], uploaded_files: Optional[List[str]]):
|
| 29 |
-
"""
|
| 30 |
-
Main handler for chat input.
|
| 31 |
-
Args:
|
| 32 |
-
user_message: The text input from the user.
|
| 33 |
-
history: The existing chat history (list of message dicts).
|
| 34 |
-
uploaded_files: List of file paths.
|
| 35 |
-
"""
|
| 36 |
-
if not user_message and not uploaded_files:
|
| 37 |
-
return "", history, None
|
| 38 |
-
|
| 39 |
-
self._ensure_agent()
|
| 40 |
-
|
| 41 |
-
# 1. Process Files
|
| 42 |
-
context_msg = ""
|
| 43 |
-
if uploaded_files:
|
| 44 |
-
file_names = [os.path.basename(f) for f in uploaded_files]
|
| 45 |
-
context_msg = f"\n[User uploaded files: {', '.join(file_names)}]"
|
| 46 |
-
|
| 47 |
-
full_query = user_message + context_msg
|
| 48 |
-
|
| 49 |
-
# 2. Append User Message to History immediately for UI update
|
| 50 |
-
current_history = history + [{"role": "user", "content": user_message}]
|
| 51 |
-
|
| 52 |
-
# 3. Yield back immediately to show user message
|
| 53 |
-
yield "", current_history, None
|
| 54 |
-
|
| 55 |
-
# 4. Invoke Agent
|
| 56 |
-
try:
|
| 57 |
-
# Prepare messages for LangChain/Agent
|
| 58 |
-
# (Simplification: just sending last message)
|
| 59 |
-
from langchain_core.messages import HumanMessage
|
| 60 |
-
|
| 61 |
-
inputs = {"messages": [HumanMessage(content=full_query)]}
|
| 62 |
-
result = self.agent.invoke(inputs)
|
| 63 |
-
|
| 64 |
-
# Extract response
|
| 65 |
-
# Assuming standard LangGraph/LangChain output
|
| 66 |
-
if isinstance(result, dict) and 'messages' in result:
|
| 67 |
-
last_msg = result['messages'][-1]
|
| 68 |
-
# Handle both Message objects and dicts
|
| 69 |
-
if hasattr(last_msg, 'content'):
|
| 70 |
-
bot_response = last_msg.content
|
| 71 |
-
elif isinstance(last_msg, dict):
|
| 72 |
-
bot_response = last_msg.get('content', str(last_msg))
|
| 73 |
-
else:
|
| 74 |
-
bot_response = str(last_msg)
|
| 75 |
-
else:
|
| 76 |
-
bot_response = str(result)
|
| 77 |
-
|
| 78 |
-
# Clean up response prefixes if present
|
| 79 |
-
if isinstance(bot_response, list):
|
| 80 |
-
# If content is a list of blocks, join them or take the first text block
|
| 81 |
-
bot_response = " ".join([str(item) for item in bot_response])
|
| 82 |
-
|
| 83 |
-
if isinstance(bot_response, str) and bot_response.startswith("Assistant:"):
|
| 84 |
-
bot_response = bot_response.replace("Assistant:", "").strip()
|
| 85 |
-
|
| 86 |
-
# 5. Stream/Update Bot Response
|
| 87 |
-
current_history.append({"role": "assistant", "content": bot_response})
|
| 88 |
-
yield "", current_history, None
|
| 89 |
-
|
| 90 |
-
except Exception as e:
|
| 91 |
-
logger.error(f"Error invoking agent: {e}")
|
| 92 |
-
error_msg = f"Error: {str(e)}"
|
| 93 |
-
current_history.append({"role": "assistant", "content": error_msg})
|
| 94 |
-
yield "", current_history, None
|
| 95 |
-
|
| 96 |
-
def create_new_chat(self):
|
| 97 |
-
"""Resets the state."""
|
| 98 |
-
return [], None, ""
|
| 99 |
-
|
| 100 |
-
def load_example(self, prompt):
|
| 101 |
-
return prompt
|
| 102 |
-
|
| 103 |
-
# Singleton instance for the app
|
| 104 |
-
gaia_logic = GaiaApp()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
metadata.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
multimodal_tools.py
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Multimodal Analysis Tools using OpenRouter Vision Models
|
| 3 |
+
"""
|
| 4 |
+
import os
|
| 5 |
+
import base64
|
| 6 |
+
import tempfile
|
| 7 |
+
import requests
|
| 8 |
+
import numpy as np
|
| 9 |
+
from langchain_core.tools import tool
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def _guess_image_mime(image_base64: str) -> str:
    """Return the MIME type implied by the image's magic bytes (JPEG when unknown)."""
    try:
        # 24 base64 characters decode to 18 bytes, enough for every signature below.
        head = base64.b64decode(image_base64[:24])
    except (ValueError, TypeError):
        return "image/jpeg"
    if head.startswith(b"\x89PNG\r\n\x1a\n"):
        return "image/png"
    if head.startswith((b"GIF87a", b"GIF89a")):
        return "image/gif"
    if head[:4] == b"RIFF" and head[8:12] == b"WEBP":
        return "image/webp"
    return "image/jpeg"


def call_openrouter_vision(
    model: str,
    question: str,
    image_base64: str = None,
    fallback_model: str = None
) -> str:
    """
    Call an OpenRouter chat-completions model, optionally attaching one image.

    Args:
        model: Model ID (e.g., "qwen/qwen3-vl-30b-a3b-thinking")
        question: Question about the image (sent as the text part of the message)
        image_base64: Base64 encoded image; omitted from the request when empty
        fallback_model: Model retried once if the primary request fails

    Returns:
        Model's response text

    Raises:
        ValueError: If OPENROUTER_API_KEY is not set in the environment.
        RuntimeError: If the request fails and no (different) fallback model
            is available, or the fallback fails as well.
    """
    api_key = os.getenv("OPENROUTER_API_KEY")
    if not api_key:
        raise ValueError("OPENROUTER_API_KEY not found in environment")

    content = [{"type": "text", "text": question}]

    # Add image if provided. The data URL must declare the real image type:
    # callers pass PNG/GIF/WebP bytes as well as JPEG, so sniff it instead of
    # always claiming image/jpeg.
    if image_base64:
        mime_type = _guess_image_mime(image_base64)
        content.append({
            "type": "image_url",
            "image_url": {
                "url": f"data:{mime_type};base64,{image_base64}"
            }
        })

    messages = [{"role": "user", "content": content}]

    try:
        response = requests.post(
            "https://openrouter.ai/api/v1/chat/completions",
            headers={
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json"
            },
            json={
                "model": model,
                "messages": messages,
                "max_tokens": 2048
            },
            timeout=60
        )
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"]
    except Exception as e:
        # Best-effort boundary: any transport, HTTP or payload-shape failure
        # triggers a single retry on the fallback model (which itself gets no
        # further fallback, so the recursion stops after one level).
        if fallback_model and fallback_model != model:
            print(f"Primary model {model} failed, trying fallback {fallback_model}: {e}")
            return call_openrouter_vision(fallback_model, question, image_base64)
        # Chain the cause so the original traceback is not lost.
        raise RuntimeError(f"OpenRouter vision call failed: {e}") from e
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
@tool
def vision_analyze_image(question: str, image_path: str) -> str:
    """
    Analyze an image using AI vision model to answer questions about it.
    Use this for semantic understanding of images (chess positions, charts, diagrams, screenshots, etc.)

    Args:
        question: Question about the image
        image_path: Path to image file

    Returns:
        Analysis result from vision model
    """
    # NOTE: the docstring above doubles as the tool description, so it is
    # kept verbatim.
    try:
        # Read the raw bytes and turn them into a base64 text payload.
        with open(image_path, "rb") as image_file:
            raw_bytes = image_file.read()
        encoded_image = base64.b64encode(raw_bytes).decode("utf-8")

        # Primary vision model first; the helper retries on the fallback.
        return call_openrouter_vision(
            model="qwen/qwen3-vl-30b-a3b-thinking",
            question=question,
            image_base64=encoded_image,
            fallback_model="google/gemini-2.5-flash",
        )
    except Exception as err:
        # Tools report failures as text instead of raising.
        return f"Error analyzing image: {err}"
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
import subprocess
|
| 106 |
+
import glob
|
| 107 |
+
|
| 108 |
+
# ... (call_openrouter_vision remains same)
|
| 109 |
+
|
| 110 |
+
def _run_ffmpeg_extract(video_path: str, ffmpeg_args: list, output_pattern: str) -> None:
    """Run FFmpeg on video_path, writing frames to output_pattern; a non-zero exit is ignored."""
    subprocess.run(
        ["ffmpeg", "-i", video_path, *ffmpeg_args, output_pattern],
        check=False,
        # Keep FFmpeg from reading the parent's stdin.
        stdin=subprocess.DEVNULL,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )


@tool
def vision_analyze_video(question: str, video_path: str, num_frames: int = 5) -> str:
    """
    Analyze a video file by extracting key frames using FFmpeg.

    Args:
        question: Question about the video
        video_path: Path to video file
        num_frames: Number of frames to extract

    Returns:
        Analysis result combining insights from all frames
    """
    if num_frames < 1:
        return "Error analyzing video: num_frames must be at least 1"

    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            # Sample one frame per second, then thin the result down to
            # num_frames below. (A single -vf option: when several are given
            # FFmpeg only honours the last one.)
            _run_ffmpeg_extract(
                video_path,
                ["-vf", "fps=1"],
                os.path.join(tmpdir, "frame_%03d.jpg"),
            )
            frames = sorted(glob.glob(os.path.join(tmpdir, "*.jpg")))

            if not frames:
                # Fallback: grab the first num_frames frames of the stream.
                # Zero-padded names keep the lexicographic sort in frame order.
                _run_ffmpeg_extract(
                    video_path,
                    ["-vframes", str(num_frames)],
                    os.path.join(tmpdir, "thumb_%03d.jpg"),
                )
                frames = sorted(glob.glob(os.path.join(tmpdir, "*.jpg")))

            if not frames:
                # Without this guard the tool would report a "successful"
                # analysis of zero frames.
                return f"Error analyzing video: no frames could be extracted from {video_path}"

            # Pick num_frames evenly spaced
            if len(frames) > num_frames:
                indices = np.linspace(0, len(frames) - 1, num_frames, dtype=int)
                selected_frames = [frames[i] for i in indices]
            else:
                selected_frames = frames

            frames_analysis = []
            for idx, frame_path in enumerate(selected_frames, start=1):
                with open(frame_path, "rb") as f:
                    frame_b64 = base64.b64encode(f.read()).decode("utf-8")

                # Each frame is analysed independently with the same question.
                analysis = call_openrouter_vision(
                    model="qwen/qwen3-vl-30b-a3b-thinking",
                    question=f"Frame {idx}: {question}",
                    image_base64=frame_b64,
                    fallback_model="google/gemini-2.5-flash"
                )
                frames_analysis.append(f"Frame {idx}: {analysis}")

            combined = "\n\n".join(frames_analysis)
            return f"Video analysis ({len(selected_frames)} frames extracted via FFmpeg):\n{combined}"

    except Exception as e:
        # Covers a missing ffmpeg binary, unreadable frames and API failures;
        # tools report failures as text instead of raising.
        return f"Error analyzing video: {str(e)}"
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
@tool
def vision_analyze_document(question: str, file_path: str) -> str:
    """
    Analyze a plain-text document (TXT, MD, CSV, JSON, HTML, CSS, Python or JavaScript source) using AI.
    For PDF or other formats, please use Code Interpreter to extract text or convert to images first.

    Args:
        question: Question about the document
        file_path: Path to document file

    Returns:
        Analysis result from document content
    """
    text_suffixes = ('.txt', '.md', '.py', '.js', '.json', '.html', '.css', '.csv')
    max_chars = 15000  # Upper bound on document text sent to the model

    try:
        if not file_path.lower().endswith(text_suffixes):
            return f"Direct analysis for this file type ({os.path.basename(file_path)}) is not supported directly. Please use the Code Interpreter tool to read/convert this file first."

        # Read one character past the limit so truncation can be detected
        # without loading an arbitrarily large file. Undecodable bytes are
        # replaced rather than aborting the whole analysis.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            text_content = f.read(max_chars + 1)

        prompt = f"{question}\n\nDocument content:\n{text_content[:max_chars]}"
        if len(text_content) > max_chars:
            # Tell the model the text is incomplete instead of cutting silently.
            prompt += f"\n\n[Document truncated to the first {max_chars} characters]"

        # Text-only request (no image attached); the helper retries on the fallback model.
        return call_openrouter_vision(
            model="google/gemini-3-flash-preview",
            question=prompt,
            fallback_model="google/gemini-2.5-flash"
        )

    except Exception as e:
        # Tools report failures as text instead of raising.
        return f"Error analyzing document: {str(e)}"
|
requirements.txt
CHANGED
|
@@ -12,20 +12,18 @@ langchain
|
|
| 12 |
langchain-community
|
| 13 |
langchain-core
|
| 14 |
langchain-huggingface
|
|
|
|
| 15 |
langchain-tavily
|
| 16 |
langgraph
|
| 17 |
huggingface_hub
|
| 18 |
supabase>=2.0.0
|
| 19 |
arxiv
|
| 20 |
-
|
| 21 |
wikipedia
|
| 22 |
pgvector
|
| 23 |
python-dotenv
|
| 24 |
-
pytesseract
|
| 25 |
matplotlib
|
| 26 |
sentence_transformers
|
| 27 |
numpy
|
| 28 |
tavily-python
|
| 29 |
-
opencv-python
|
| 30 |
-
yt-dlp
|
| 31 |
langchain-openai
|
|
|
|
| 12 |
langchain-community
|
| 13 |
langchain-core
|
| 14 |
langchain-huggingface
|
| 15 |
+
huggingface-hub>=0.20.0
|
| 16 |
langchain-tavily
|
| 17 |
langgraph
|
| 18 |
huggingface_hub
|
| 19 |
supabase>=2.0.0
|
| 20 |
arxiv
|
| 21 |
+
|
| 22 |
wikipedia
|
| 23 |
pgvector
|
| 24 |
python-dotenv
|
|
|
|
| 25 |
matplotlib
|
| 26 |
sentence_transformers
|
| 27 |
numpy
|
| 28 |
tavily-python
|
|
|
|
|
|
|
| 29 |
langchain-openai
|