# First_agent_template / multimodal_tools.py
# (last commit: c09239f "fix..." by LeVinh)
"""
Multimodal Analysis Tools using OpenRouter Vision Models
"""
import base64
import os
import tempfile
from typing import Optional

import numpy as np
import requests

from langchain_core.tools import tool
def call_openrouter_vision(
    model: str,
    question: str,
    image_base64: Optional[str] = None,
    fallback_model: Optional[str] = None
) -> str:
    """
    Call an OpenRouter chat-completions vision model for image analysis.

    Args:
        model: Model ID (e.g., "qwen/qwen3-vl-30b-a3b-thinking")
        question: Question about the image (or a plain text prompt)
        image_base64: Optional base64-encoded JPEG to attach to the message
        fallback_model: Optional model to retry with once if the primary fails

    Returns:
        Model's response text

    Raises:
        ValueError: If OPENROUTER_API_KEY is not set in the environment.
        RuntimeError: If the API call fails (after the fallback attempt, if any),
            with the original exception chained as the cause.
    """
    api_key = os.getenv("OPENROUTER_API_KEY")
    if not api_key:
        raise ValueError("OPENROUTER_API_KEY not found in environment")
    # Build a single multimodal user message: text part first, image optional.
    content = [{"type": "text", "text": question}]
    if image_base64:
        content.append({
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{image_base64}"
            }
        })
    messages = [{"role": "user", "content": content}]
    try:
        response = requests.post(
            "https://openrouter.ai/api/v1/chat/completions",
            headers={
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json"
            },
            json={
                "model": model,
                "messages": messages,
                "max_tokens": 2048
            },
            timeout=60
        )
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"]
    except Exception as e:
        # Retry exactly once with the fallback; not passing fallback_model on
        # deliberately prevents infinite recursion between the two models.
        if fallback_model and fallback_model != model:
            print(f"Primary model {model} failed, trying fallback {fallback_model}: {e}")
            return call_openrouter_vision(fallback_model, question, image_base64)
        # Chain the cause so the original HTTP/parse error is not lost.
        raise RuntimeError(f"OpenRouter vision call failed: {e}") from e
@tool
def vision_analyze_image(question: str, image_path: str) -> str:
    """
    Analyze an image using AI vision model to answer questions about it.
    Use this for semantic understanding of images (chess positions, charts, diagrams, screenshots, etc.)
    Args:
        question: Question about the image
        image_path: Path to image file
    Returns:
        Analysis result from vision model
    """
    # NOTE: the docstring above doubles as the runtime tool description
    # surfaced to the agent by langchain; keep it in sync with behavior.
    try:
        # Read the file and base64-encode it for the data-URL payload.
        with open(image_path, "rb") as image_file:
            encoded = base64.b64encode(image_file.read()).decode("utf-8")
        # Primary vision model with an automatic fallback on failure.
        return call_openrouter_vision(
            model="qwen/qwen3-vl-30b-a3b-thinking",
            question=question,
            image_base64=encoded,
            fallback_model="google/gemini-2.5-flash"
        )
    except Exception as e:
        # Tools report errors as strings so the agent can react to them.
        return f"Error analyzing image: {str(e)}"
import subprocess
import glob
# Video/document analysis tools below reuse call_openrouter_vision (above).
@tool
def vision_analyze_video(question: str, video_path: str, num_frames: int = 5) -> str:
    """
    Analyze a video file by extracting key frames using FFmpeg.
    Args:
        question: Question about the video
        video_path: Path to video file
        num_frames: Number of frames to extract
    Returns:
        Analysis result combining insights from all frames
    """
    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            # Extract one frame per second; evenly-spaced frames are
            # sub-sampled below. check=False: a failed run simply produces
            # no frames, which the fallback path handles.
            subprocess.run([
                "ffmpeg", "-i", video_path,
                "-vf", "fps=1",
                os.path.join(tmpdir, "frame_%03d.jpg")
            ], check=False, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            frames = sorted(glob.glob(os.path.join(tmpdir, "*.jpg")))
            if not frames:
                # Fallback for very short clips (< 1s): take the first
                # num_frames decoded frames directly.
                subprocess.run([
                    "ffmpeg", "-i", video_path, "-vframes", str(num_frames),
                    os.path.join(tmpdir, "thumb%d.jpg")
                ], check=False, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
                frames = sorted(glob.glob(os.path.join(tmpdir, "*.jpg")))
            if not frames:
                # Nothing extracted: report instead of claiming "0 frames analyzed".
                return f"Error analyzing video: could not extract any frames from {video_path} (check that FFmpeg is installed and the file is a valid video)"
            # Pick num_frames evenly spaced across everything extracted.
            if len(frames) > num_frames:
                indices = np.linspace(0, len(frames) - 1, num_frames, dtype=int)
                selected_frames = [frames[i] for i in indices]
            else:
                selected_frames = frames
            # Analyze each selected frame independently, then combine.
            frames_analysis = []
            for idx, frame_path in enumerate(selected_frames):
                with open(frame_path, "rb") as f:
                    frame_b64 = base64.b64encode(f.read()).decode("utf-8")
                frame_question = f"Frame {idx+1}: {question}"
                analysis = call_openrouter_vision(
                    model="qwen/qwen3-vl-30b-a3b-thinking",
                    question=frame_question,
                    image_base64=frame_b64,
                    fallback_model="google/gemini-2.5-flash"
                )
                frames_analysis.append(f"Frame {idx+1}: {analysis}")
            combined = "\n\n".join(frames_analysis)
            return f"Video analysis ({len(selected_frames)} frames extracted via FFmpeg):\n{combined}"
    except Exception as e:
        # Tools report errors as strings so the agent can react to them.
        return f"Error analyzing video: {str(e)}"
@tool
def vision_analyze_document(question: str, file_path: str) -> str:
    """
    Analyze a document (TXT/MD) using AI.
    For PDF or other formats, please use Code Interpreter to extract text or convert to images first.
    Args:
        question: Question about the document
        file_path: Path to document file
    Returns:
        Analysis result from document content
    """
    # NOTE: the docstring above is the runtime tool description langchain
    # shows the agent; keep it in sync with behavior.
    try:
        text_suffixes = ('.txt', '.md', '.py', '.js', '.json', '.html', '.css', '.csv')
        # Guard clause: non-text formats must be converted by the agent first.
        if not file_path.lower().endswith(text_suffixes):
            return f"Direct analysis for this file type ({os.path.basename(file_path)}) is not supported directly. Please use the Code Interpreter tool to read/convert this file first."
        with open(file_path, 'r', encoding='utf-8') as doc:
            text_content = doc.read()
        # Truncate the document so the combined prompt stays within context.
        prompt = f"{question}\n\nDocument content:\n{text_content[:15000]}"
        # Text-only call (no image); Gemini handles long documents well.
        return call_openrouter_vision(
            model="google/gemini-3-flash-preview",
            question=prompt,
            fallback_model="google/gemini-2.5-flash"
        )
    except Exception as e:
        # Tools report errors as strings so the agent can react to them.
        return f"Error analyzing document: {str(e)}"