First_agent_template

Sleeping

App Files Files Community

LeVinh committed on Jan 25

Commit

c09239f

1 Parent(s): e284096

fix...

Browse files

Files changed (8) hide show

Dockerfile +0 -1
agent.json +4 -4
agent.py +66 -38
eval.py +91 -9
logic.py +0 -104
metadata.jsonl +0 -0
multimodal_tools.py +214 -0
requirements.txt +2 -4

Dockerfile CHANGED Viewed

@@ -10,7 +10,6 @@ RUN apt-get update && apt-get install -y \
     libxext6 \
     cmake \
     libgl1 \
-    tesseract-ocr \
     curl \
     && rm -rf /var/lib/apt/lists/*

     libxext6 \
     cmake \
     libgl1 \
     curl \
     && rm -rf /var/lib/apt/lists/*

agent.json CHANGED Viewed

@@ -7,13 +7,13 @@
     "model": {
         "class": "ChatOpenAI",
         "data": {
-            "max_tokens": 4096,
             "temperature": 0.01,
             "last_input_token_count": null,
             "last_output_token_count": null,
-            "model_id": "Meta-Llama-3.1-8B-Instruct",
-            "base_url": "https://api.sambanova.ai/v1",
-            "api_key_env": "SAMBANOVA_API_KEY",
             "custom_role_conversions": null
         }
     },

     "model": {
         "class": "ChatOpenAI",
         "data": {
+            "max_tokens": 2048,
             "temperature": 0.01,
             "last_input_token_count": null,
             "last_output_token_count": null,
+            "model_id": "qwen/qwen3-32b",
+            "base_url": "https://openrouter.ai/api/v1",
+            "api_key_env": "OPENROUTER_API_KEY",
             "custom_role_conversions": null
         }
     },

agent.py CHANGED Viewed

@@ -9,9 +9,7 @@ import numpy as np
 import pandas as pd
 from typing import List, Dict, Any, Optional
 from urllib.parse import urlparse
-import pytesseract
-import cv2
-import yt_dlp
 from PIL import Image, ImageDraw, ImageFont, ImageEnhance, ImageFilter
 from dotenv import load_dotenv
@@ -32,6 +30,7 @@ from langchain_huggingface import (
     HuggingFaceEndpoint,
     HuggingFaceEmbeddings,
 )
 from langchain_core.messages import SystemMessage, HumanMessage
 from langchain_core.tools import tool, Tool
 from supabase.client import Client, create_client
@@ -40,9 +39,13 @@ from supabase.client import Client, create_client
 from code_interpreter import CodeInterpreter
 from img_processing import decode_image, encode_image, save_image
 from dotenv import load_dotenv
 load_dotenv()
-# Configure Gemini API key
 interpreter_instance = CodeInterpreter()
@@ -298,20 +301,15 @@ def download_file_from_url(url: str, filename: Optional[str] = None) -> str:
 @tool
 def extract_text_from_image(image_path: str) -> str:
     """
-    Extract text from an image using OCR.
     Args:
         image_path (str): The path to the image file.
     Returns:
         str: Extracted text or error message.
     """
-    try:
-        image = Image.open(image_path)
-        text = pytesseract.image_to_string(image)
-        return f"Extracted text from image:\n\n{text}"
-    except Exception as e:
-        return f"Error extracting text from image: {str(e)}"
 @tool
@@ -693,6 +691,11 @@ tools = [
     draw_on_image,
     generate_simple_image,
     combine_images,
 ]
@@ -762,36 +765,61 @@ def build_graph(provider: str = None):
              # Default fallback
             provider = "openai" # Default to openai as fallback
-    if provider == "google":
-        # Check if model_id is in config, otherwise default
-        model_id = model_data.get("model_id", "gemini-1.5-flash")
-        # Ensure it starts with gemini
-        if not model_id.startswith("gemini"):
-             model_id = "gemini-1.5-flash"
-        llm = ChatGoogleGenerativeAI(model=model_id)
-        print(f"Using Google provider with model: {model_id}")
-    elif provider == "huggingface":
-        repo_id = model_data.get("model_id", "Qwen/Qwen2.5-7B-Instruct")
-        llm = ChatHuggingFace(
-            llm=HuggingFaceEndpoint(
-                repo_id=repo_id,
-                task="text-generation",
-                max_new_tokens=model_data.get("max_tokens", 4096),
-                do_sample=False,
-                repetition_penalty=1.03,
                 temperature=model_data.get("temperature", 0.01),
-            ),
-            verbose=True,
-        )
-        print(f"Using Hugging Face provider with model: {repo_id}")
     elif provider == "openai":
-        model_id = model_data.get("model_id", "Meta-Llama-3.1-8B-Instruct")
-        api_key_env = model_data.get("api_key_env", "OPENAI_API_KEY")
         api_key = os.getenv(api_key_env)
         if not api_key:
-            print(f"Warning: {api_key_env} not found in environment variables. Calls might fail.")
         llm = ChatOpenAI(
             model=model_id,
@@ -804,7 +832,7 @@ def build_graph(provider: str = None):
     else:
         # Fallback or error if other providers are requested but not implemented
-        raise ValueError(f"Invalid provider: {provider}. Supported: 'google', 'huggingface', 'openai'.")
     llm_with_tools = llm.bind_tools(tools)

 import pandas as pd
 from typing import List, Dict, Any, Optional
 from urllib.parse import urlparse
 from PIL import Image, ImageDraw, ImageFont, ImageEnhance, ImageFilter
 from dotenv import load_dotenv
     HuggingFaceEndpoint,
     HuggingFaceEmbeddings,
 )
+from huggingface_hub import InferenceClient
 from langchain_core.messages import SystemMessage, HumanMessage
 from langchain_core.tools import tool, Tool
 from supabase.client import Client, create_client
 from code_interpreter import CodeInterpreter
 from img_processing import decode_image, encode_image, save_image
 from dotenv import load_dotenv
+from multimodal_tools import (
+    vision_analyze_image,
+    vision_analyze_video,
+    vision_analyze_document
+)
 load_dotenv()
 interpreter_instance = CodeInterpreter()
 @tool
 def extract_text_from_image(image_path: str) -> str:
     """
+    Extract text from an image using AI Vision (OCR).
     Args:
         image_path (str): The path to the image file.
     Returns:
         str: Extracted text or error message.
     """
+    # vision_analyze_image is decorated with @tool in multimodal_tools.py, so it
+    # is a tool object rather than a plain function. Calling it positionally
+    # would treat image_path as the callbacks argument; invoke() with a dict of
+    # named arguments is the supported way to run it.
+    return vision_analyze_image.invoke({
+        "question": "Transcribe all text from this image verbatim.",
+        "image_path": image_path,
+    })
 @tool
     draw_on_image,
     generate_simple_image,
     combine_images,
+    # Multimodal vision tools (OpenRouter)
+    vision_analyze_image,
+    vision_analyze_video,
+    vision_analyze_document,
 ]
              # Default fallback
             provider = "openai" # Default to openai as fallback
+    if provider == "huggingface":
+        # All config must come from agent.json
+        if "model_id" not in model_data:
+            raise ValueError("model_id is required in agent.json for HuggingFace provider")
+        model_id = model_data["model_id"]
+        # Parse provider suffix from model_id (e.g., "Qwen/Qwen3-32B:cerebras")
+        # Format: "model_name" or "model_name:provider"
+        if ":" in model_id:
+            # Use router for third-party providers (cerebras, novita, etc.)
+            model_name, provider_suffix = model_id.rsplit(":", 1)
+            print(f"Using HuggingFace Router with model: {model_name}, provider: {provider_suffix}")
+            api_key = os.getenv("HF_TOKEN")
+            if not api_key:
+                raise ValueError("HF_TOKEN not found in environment variables")
+            # Use router for third-party providers
+            llm = ChatOpenAI(
+                model=model_id,  # Full model_id with provider suffix
+                base_url="https://router.huggingface.co/v1",
+                api_key=api_key,
+                max_tokens=model_data.get("max_tokens", 4096),
                 temperature=model_data.get("temperature", 0.01),
+            )
+        else:
+            # Use free serverless inference (no provider suffix)
+            print(f"Using HuggingFace Serverless Inference with model: {model_id}")
+            # Construct the serverless inference API URL to bypass router
+            # Format: https://api-inference.huggingface.co/models/{model_id}
+            serverless_url = f"https://api-inference.huggingface.co/models/{model_id}"
+            llm = ChatHuggingFace(
+                llm=HuggingFaceEndpoint(
+                    endpoint_url=serverless_url,  # Use direct serverless API
+                    task="text-generation",
+                    max_new_tokens=model_data.get("max_tokens", 4096),
+                    do_sample=False,
+                    repetition_penalty=1.03,
+                    temperature=model_data.get("temperature", 0.01),
+                ),
+                verbose=True,
+            )
     elif provider == "openai":
+        # All config must come from agent.json
+        if "model_id" not in model_data:
+            raise ValueError("model_id is required in agent.json for OpenAI provider")
+        model_id = model_data["model_id"]
+        api_key_env = model_data.get("api_key_env", "OPENAI_API_KEY")  # Keep this default for compatibility
         api_key = os.getenv(api_key_env)
         if not api_key:
+            raise ValueError(f"{api_key_env} not found in environment variables")
         llm = ChatOpenAI(
             model=model_id,
     else:
         # Fallback or error if other providers are requested but not implemented
+        raise ValueError(f"Invalid provider: {provider}. Supported: 'huggingface', 'openai'.")
     llm_with_tools = llm.bind_tools(tools)

eval.py CHANGED Viewed

@@ -17,10 +17,45 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 # Debug Environment
 print("--- Environment Debug ---")
 print(f"SPACE_ID: {os.getenv('SPACE_ID')}")
 print(f"HF_TOKEN present: {bool(os.getenv('HF_TOKEN'))}")
 print(f"Gradio Version: {gr.__version__}")
 print("-------------------------")
 # --- Basic Agent Definition ---
 class BasicAgent:
     def __init__(self):
@@ -45,21 +80,55 @@ class BasicAgent:
             # content is the response from the agent
             content = result["messages"][-1].content
-            # Clean up response if it's a list or has prefixes
             if isinstance(content, list):
                 content = " ".join([str(item) for item in content])
-            # Remove "Final Answer:" prefix case-insensitively
             import re
-            content = re.sub(r"^Final Answer:\s*", "", content, flags=re.IGNORECASE).strip()
-            # Also remove "Answer:" if present
-            content = re.sub(r"^Answer:\s*", "", content, flags=re.IGNORECASE).strip()
-            print(f"Agent returning answer: {content[:100]}...")
             return content
         except Exception as e:
-            print(f"Error invoking agent: {e}")
-            return f"Error: {e}"
 def run_and_submit_all( profile: gr.OAuthProfile | None):
     """
@@ -115,12 +184,19 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
     results_log = []
     answers_payload = []
     print(f"Running agent on {len(questions_data)} questions...")
-    for item in questions_data:
         task_id = item.get("task_id")
         question_text = item.get("question")
         if not task_id or question_text is None:
             print(f"Skipping item with missing task_id or question: {item}")
             continue
         try:
             submitted_answer = agent(question_text)
             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
@@ -128,6 +204,12 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
         except Exception as e:
              print(f"Error running agent on task {task_id}: {e}")
              results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
     if not answers_payload:
         print("Agent did not produce any answers to submit.")

 # Debug Environment
 print("--- Environment Debug ---")
 print(f"SPACE_ID: {os.getenv('SPACE_ID')}")
+print(f"SPACE_HOST: {os.getenv('SPACE_HOST')}")
 print(f"HF_TOKEN present: {bool(os.getenv('HF_TOKEN'))}")
 print(f"Gradio Version: {gr.__version__}")
 print("-------------------------")
+# CRITICAL FIX: Derive SPACE_ID from SPACE_HOST when SPACE_ID is unset or empty
+# HF Spaces sets SPACE_HOST (e.g., "vinhle-first-agent-template.hf.space")
+# but not always SPACE_ID in Docker containers (per the commit author; not verifiable from this file)
+if not os.getenv("SPACE_ID") and os.getenv("SPACE_HOST"):
+    space_host = os.getenv("SPACE_HOST")
+    # Parse: "username-spacename.hf.space" -> "username/spacename"; hosts without the ".hf.space" suffix are ignored silently
+    if space_host.endswith(".hf.space"):
+        space_slug = space_host.replace(".hf.space", "")
+        # Convert "vinhle-first-agent-template" to "vinhle/first_agent_template". NOTE(review): heuristic - assumes the username contains no hyphen and that every hyphen in the Space name was originally an underscore; confirm against the real repo id
+        parts = space_slug.split("-", 1)  # Split on first hyphen only
+        if len(parts) == 2:
+            username, space_name = parts
+            space_id = f"{username}/{space_name.replace('-', '_')}"
+            os.environ["SPACE_ID"] = space_id
+            print(f"✅ Derived SPACE_ID from SPACE_HOST: {space_id}")
+        else:
+            print(f"⚠️ Could not parse SPACE_HOST: {space_host}")
+# Display configured model: reads agent.json relative to the current working directory and prints its model_id / base_url; any failure is printed and startup continues
+try:
+    import json
+    with open("agent.json", "r") as f:
+        config = json.load(f)
+    model_config = config.get("model", {}).get("data", {})
+    model_id = model_config.get("model_id", "Unknown")
+    base_url = model_config.get("base_url", "Unknown")
+    print(f"\n🤖 Configured Model: {model_id}")
+    print(f"   Provider: {base_url}")
+    print()
+except Exception as e:
+    print(f"⚠️ Could not load model config: {e}\n")
 # --- Basic Agent Definition ---
 class BasicAgent:
     def __init__(self):
             # content is the response from the agent
             content = result["messages"][-1].content
+            # Clean up response if it's a list
             if isinstance(content, list):
                 content = " ".join([str(item) for item in content])
+            # DEBUG: Show full raw response (first 500 chars)
+            print(f"Raw model response: {content[:500]}...")
+            # Extract ONLY the final answer
             import re
+            original_content = content
+            # Strategy 1: Look for "FINAL ANSWER:" (case-insensitive) and extract everything after it
+            final_answer_match = re.search(r'FINAL\s+ANSWER:\s*(.+?)(?:\s*</think>|$)', content, re.IGNORECASE | re.DOTALL)
+            if final_answer_match:
+                content = final_answer_match.group(1).strip()
+                print("✅ Extracted using FINAL ANSWER pattern")
+            else:
+                # Strategy 2: If no "FINAL ANSWER:", try to extract text after </think> tag
+                think_match = re.search(r'</think>\s*(.+)$', content, re.DOTALL)
+                if think_match:
+                    content = think_match.group(1).strip()
+                    print("✅ Extracted text after </think> tag")
+                else:
+                    # Strategy 3: Remove all <think>...</think> blocks entirely
+                    content = re.sub(r'<think>.*?</think>', '', content, flags=re.DOTALL).strip()
+                    print("✅ Removed <think> blocks")
+                    # If nothing remains, the model didn't follow format - return error
+                    if not content:
+                        print("⚠️ Model output only contained reasoning, no answer found!")
+                        return "ERROR: Model did not provide a final answer"
+            # Remove any remaining XML-like tags
+            content = re.sub(r'<[^>]+>', '', content).strip()
+            # Remove any leading "Answer:" or "Final Answer:" that might remain
+            content = re.sub(r'^(Final\s+)?Answer:\s*', '', content, flags=re.IGNORECASE).strip()
+            print(f"📤 Submitting answer: '{content}'")
             return content
         except Exception as e:
+            error_msg = str(e)
+            # Check if it's a rate limit error
+            if "429" in error_msg or "rate limit" in error_msg.lower():
+                print(f"⚠️ Rate limit exceeded (429): {e}")
+                return "ERROR: Rate limit exceeded"
+            else:
+                print(f"Error invoking agent: {e}")
+                return f"Error: {e}"
 def run_and_submit_all( profile: gr.OAuthProfile | None):
     """
     results_log = []
     answers_payload = []
     print(f"Running agent on {len(questions_data)} questions...")
+    # Add delay between requests to avoid rate limiting
+    import time
+    DELAY_BETWEEN_REQUESTS = 3  # seconds - adjust as needed
+    for idx, item in enumerate(questions_data, 1):
         task_id = item.get("task_id")
         question_text = item.get("question")
         if not task_id or question_text is None:
             print(f"Skipping item with missing task_id or question: {item}")
             continue
+        print(f"\n📝 Processing question {idx}/{len(questions_data)}...")
         try:
             submitted_answer = agent(question_text)
             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
         except Exception as e:
              print(f"Error running agent on task {task_id}: {e}")
              results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
+        # Add delay between requests (except after the last one)
+        if idx < len(questions_data):
+            print(f"⏳ Waiting {DELAY_BETWEEN_REQUESTS}s before next request to avoid rate limiting...")
+            time.sleep(DELAY_BETWEEN_REQUESTS)
     if not answers_payload:
         print("Agent did not produce any answers to submit.")

logic.py DELETED Viewed

@@ -1,104 +0,0 @@
-import os
-import logging
-from typing import List, Tuple, Optional
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-try:
-    from agent import build_graph
-    AGENT_AVAILABLE = True
-    logger.info("Agent successfully imported!")
-except ImportError as e:
-    AGENT_AVAILABLE = False
-    logger.error(f"Could not import 'agent.build_graph': {str(e)}")
-    import traceback
-    traceback.print_exc()
-class GaiaApp:
-    def __init__(self):
-        self.agent = None
-    def _ensure_agent(self):
-        if self.agent is None:
-            logger.info("Initializing Agent...")
-            self.agent = build_graph() if AGENT_AVAILABLE else None
-            logger.info("Agent initialization complete.")
-    def process_input(self, user_message: str, history: List[dict], uploaded_files: Optional[List[str]]):
-        """
-        Main handler for chat input.
-        Args:
-            user_message: The text input from the user.
-            history: The existing chat history (list of message dicts).
-            uploaded_files: List of file paths.
-        """
-        if not user_message and not uploaded_files:
-            return "", history, None
-        self._ensure_agent()
-        # 1. Process Files
-        context_msg = ""
-        if uploaded_files:
-            file_names = [os.path.basename(f) for f in uploaded_files]
-            context_msg = f"\n[User uploaded files: {', '.join(file_names)}]"
-        full_query = user_message + context_msg
-        # 2. Append User Message to History immediately for UI update
-        current_history = history + [{"role": "user", "content": user_message}]
-        # 3. Yield back immediately to show user message
-        yield "", current_history, None
-        # 4. Invoke Agent
-        try:
-            # Prepare messages for LangChain/Agent
-            # (Simplification: just sending last message)
-            from langchain_core.messages import HumanMessage
-            inputs = {"messages": [HumanMessage(content=full_query)]}
-            result = self.agent.invoke(inputs)
-            # Extract response
-            # Assuming standard LangGraph/LangChain output
-            if isinstance(result, dict) and 'messages' in result:
-                last_msg = result['messages'][-1]
-                # Handle both Message objects and dicts
-                if hasattr(last_msg, 'content'):
-                    bot_response = last_msg.content
-                elif isinstance(last_msg, dict):
-                    bot_response = last_msg.get('content', str(last_msg))
-                else:
-                    bot_response = str(last_msg)
-            else:
-                bot_response = str(result)
-            # Clean up response prefixes if present
-            if isinstance(bot_response, list):
-                # If content is a list of blocks, join them or take the first text block
-                bot_response = " ".join([str(item) for item in bot_response])
-            if isinstance(bot_response, str) and bot_response.startswith("Assistant:"):
-                bot_response = bot_response.replace("Assistant:", "").strip()
-            # 5. Stream/Update Bot Response
-            current_history.append({"role": "assistant", "content": bot_response})
-            yield "", current_history, None
-        except Exception as e:
-            logger.error(f"Error invoking agent: {e}")
-            error_msg = f"Error: {str(e)}"
-            current_history.append({"role": "assistant", "content": error_msg})
-            yield "", current_history, None
-    def create_new_chat(self):
-        """Resets the state."""
-        return [], None, ""
-    def load_example(self, prompt):
-        return prompt
-# Singleton instance for the app
-gaia_logic = GaiaApp()

metadata.jsonl CHANGED Viewed

The diff for this file is too large to render. See raw diff

multimodal_tools.py ADDED Viewed

	@@ -0,0 +1,214 @@

+"""
+Multimodal Analysis Tools using OpenRouter Vision Models
+"""
+import os
+import base64
+import tempfile
+import requests
+import numpy as np
+from langchain_core.tools import tool
+def _guess_image_mime(image_base64: str) -> str:
+    """Return the MIME type implied by the image's magic bytes (default: image/jpeg)."""
+    try:
+        # 16 base64 characters decode to exactly 12 bytes, which is enough for
+        # every signature checked below.
+        header = base64.b64decode(image_base64[:16])
+    except (ValueError, TypeError):
+        # Malformed or truncated input: keep the historical JPEG label.
+        return "image/jpeg"
+    if header.startswith(b"\x89PNG\r\n\x1a\n"):
+        return "image/png"
+    if header.startswith((b"GIF87a", b"GIF89a")):
+        return "image/gif"
+    if header[:4] == b"RIFF" and header[8:12] == b"WEBP":
+        return "image/webp"
+    return "image/jpeg"
+def call_openrouter_vision(
+    model: str,
+    question: str,
+    image_base64: str = None,
+    fallback_model: str = None
+) -> str:
+    """
+    Call OpenRouter vision model for image analysis.
+
+    Sends one user message containing ``question`` and, when provided, the
+    image as a base64 data URL. The data URL's MIME type is sniffed from the
+    image bytes (PNG, GIF, WebP) and falls back to JPEG.
+    Args:
+        model: Model ID (e.g.,  "qwen/qwen3-vl-30b-a3b-thinking")
+        question: Question about the image
+        image_base64: Base64 encoded image (optional; text-only request if omitted)
+        fallback_model: Fallback model if primary fails
+    Returns:
+        Model's response text
+    Raises:
+        ValueError: If OPENROUTER_API_KEY is not set in the environment.
+        Exception: If the request fails and no (different) fallback model is
+            available, or the fallback fails too. The underlying error is
+            attached as ``__cause__``.
+    """
+    api_key = os.getenv("OPENROUTER_API_KEY")
+    if not api_key:
+        raise ValueError("OPENROUTER_API_KEY not found in environment")
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": question}
+            ]
+        }
+    ]
+    # Add image if provided
+    if image_base64:
+        mime_type = _guess_image_mime(image_base64)
+        messages[0]["content"].append({
+            "type": "image_url",
+            "image_url": {
+                "url": f"data:{mime_type};base64,{image_base64}"
+            }
+        })
+    try:
+        response = requests.post(
+            "https://openrouter.ai/api/v1/chat/completions",
+            headers={
+                "Authorization": f"Bearer {api_key}",
+                "Content-Type": "application/json"
+            },
+            json={
+                "model": model,
+                "messages": messages,
+                "max_tokens": 2048
+            },
+            timeout=60
+        )
+        response.raise_for_status()
+        return response.json()["choices"][0]["message"]["content"]
+    except Exception as e:
+        # Any failure (HTTP error, timeout, unexpected response shape) triggers
+        # a single retry on the fallback model. The retry is made without a
+        # fallback of its own, so the recursion is at most one level deep.
+        if fallback_model and fallback_model != model:
+            print(f"Primary model {model} failed, trying fallback {fallback_model}: {e}")
+            return call_openrouter_vision(fallback_model, question, image_base64)
+        raise Exception(f"OpenRouter vision call failed: {e}") from e
+@tool
+def vision_analyze_image(question: str, image_path: str) -> str:
+    """
+    Analyze an image using AI vision model to answer questions about it.
+    Use this for semantic understanding of images (chess positions, charts, diagrams, screenshots, etc.)
+    Args:
+        question: Question about the image
+        image_path: Path to image file
+    Returns:
+        Analysis result from vision model
+    """
+    # The docstring above is left untouched on purpose: @tool exposes it as the
+    # tool description, so rewording it would change what the agent is told.
+    try:
+        # Read the raw bytes first, then base64-encode them for the API payload.
+        with open(image_path, "rb") as image_file:
+            raw_bytes = image_file.read()
+        encoded_image = base64.b64encode(raw_bytes).decode("utf-8")
+        # Primary vision model with a second model as fallback.
+        return call_openrouter_vision(
+            model="qwen/qwen3-vl-30b-a3b-thinking",
+            question=question,
+            image_base64=encoded_image,
+            fallback_model="google/gemini-2.5-flash"
+        )
+    except Exception as exc:
+        # Errors are returned as text rather than raised so the calling agent
+        # receives a readable message.
+        return f"Error analyzing image: {str(exc)}"
+import subprocess
+import glob
+# The video and document tools below reuse call_openrouter_vision, defined above.
+@tool
+def vision_analyze_video(question: str, video_path: str, num_frames: int = 5) -> str:
+    """
+    Analyze a video file by extracting key frames using FFmpeg.
+    Args:
+        question: Question about the video
+        video_path: Path to video file
+        num_frames: Number of frames to extract
+    Returns:
+        Analysis result combining insights from all frames
+    """
+    # Flow: dump one frame per second with FFmpeg, keep at most num_frames
+    # evenly spaced frames, and send each one to the vision model separately.
+    try:
+        # Guard against zero/negative requests, which would select no frames.
+        num_frames = max(1, int(num_frames))
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Zero-padded to six digits so the lexicographic sort below matches
+            # chronological order even for long videos.
+            subprocess.run([
+                "ffmpeg", "-i", video_path,
+                "-vf", "fps=1",
+                os.path.join(tmpdir, "frame_%06d.jpg")
+            ], check=False, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+            # List generated frames
+            frames = sorted(glob.glob(os.path.join(tmpdir, "*.jpg")))
+            if not frames:
+                # Fallback when the 1 fps pass produced nothing: grab the first
+                # num_frames decoded frames instead.
+                subprocess.run([
+                   "ffmpeg", "-i", video_path, "-vframes", str(num_frames),
+                   os.path.join(tmpdir, "thumb_%06d.jpg")
+                ], check=False, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+                frames = sorted(glob.glob(os.path.join(tmpdir, "*.jpg")))
+            if not frames:
+                # FFmpeg's exit status is not checked (check=False), so report
+                # the empty result explicitly instead of a "0 frames" summary.
+                return f"Error analyzing video: FFmpeg extracted no frames from {video_path}"
+            # Pick num_frames evenly spaced
+            if len(frames) > num_frames:
+                indices = np.linspace(0, len(frames)-1, num_frames, dtype=int)
+                selected_frames = [frames[i] for i in indices]
+            else:
+                selected_frames = frames
+            frames_analysis = []
+            for idx, frame_path in enumerate(selected_frames):
+                with open(frame_path, "rb") as f:
+                    frame_b64 = base64.b64encode(f.read()).decode("utf-8")
+                frame_question = f"Frame {idx+1}: {question}"
+                analysis = call_openrouter_vision(
+                    model="qwen/qwen3-vl-30b-a3b-thinking",
+                    question=frame_question,
+                    image_base64=frame_b64,
+                    fallback_model="google/gemini-2.5-flash"
+                )
+                frames_analysis.append(f"Frame {idx+1}: {analysis}")
+            combined = "\n\n".join(frames_analysis)
+            return f"Video analysis ({len(selected_frames)} frames extracted via FFmpeg):\n{combined}"
+    except Exception as e:
+        # Includes FileNotFoundError when the ffmpeg binary is not installed.
+        return f"Error analyzing video: {str(e)}"
+@tool
+def vision_analyze_document(question: str, file_path: str) -> str:
+    """
+    Analyze a document (TXT/MD) using AI.
+    For PDF or other formats, please use Code Interpreter to extract text or convert to images first.
+    Args:
+        question: Question about the document
+        file_path: Path to document file
+    Returns:
+        Analysis result from document content
+    """
+    # The docstring above is left untouched on purpose: @tool exposes it as the
+    # tool description. Besides TXT/MD, the suffix list below also admits
+    # several other plain-text formats.
+    text_suffixes = ('.txt', '.md', '.py', '.js', '.json', '.html', '.css', '.csv')
+    try:
+        # Guard clause: anything that is not a known plain-text type is
+        # redirected to the Code Interpreter tool.
+        if not file_path.lower().endswith(text_suffixes):
+            return f"Direct analysis for this file type ({os.path.basename(file_path)}) is not supported directly. Please use the Code Interpreter tool to read/convert this file first."
+        with open(file_path, 'r', encoding='utf-8') as handle:
+            document_text = handle.read()
+        # Only the first 15000 characters are sent, to bound the prompt size.
+        prompt = f"{question}\n\nDocument content:\n{document_text[:15000]}"
+        # Text-only request: no image is attached.
+        return call_openrouter_vision(
+            model="google/gemini-3-flash-preview",
+            question=prompt,
+            fallback_model="google/gemini-2.5-flash"
+        )
+    except Exception as exc:
+        return f"Error analyzing document: {str(exc)}"

requirements.txt CHANGED Viewed

@@ -12,20 +12,18 @@ langchain
 langchain-community
 langchain-core
 langchain-huggingface
 langchain-tavily
 langgraph
 huggingface_hub
 supabase>=2.0.0
 arxiv
-pymupdf
 wikipedia
 pgvector
 python-dotenv
-pytesseract
 matplotlib
 sentence_transformers
 numpy
 tavily-python
-opencv-python
-yt-dlp
 langchain-openai

 langchain-community
 langchain-core
 langchain-huggingface
+huggingface-hub>=0.20.0
 langchain-tavily
 langgraph
 huggingface_hub
 supabase>=2.0.0
 arxiv
 wikipedia
 pgvector
 python-dotenv
 matplotlib
 sentence_transformers
 numpy
 tavily-python
 langchain-openai