# First_agent_template / multimodal_tools.py
# (last commit: c09239f "fix..." by LeVinh)
"""
Multimodal Analysis Tools using OpenRouter Vision Models
"""
import base64
import os
import tempfile
from typing import Optional

import numpy as np
import requests

from langchain_core.tools import tool
def call_openrouter_vision(
    model: str,
    question: str,
    image_base64: Optional[str] = None,
    fallback_model: Optional[str] = None
) -> str:
    """
    Call an OpenRouter chat-completions vision model for image analysis.

    Args:
        model: Model ID (e.g., "qwen/qwen3-vl-30b-a3b-thinking")
        question: Question about the image (or a plain text prompt)
        image_base64: Optional base64-encoded JPEG to attach to the message
        fallback_model: Optional model to retry with once if the primary fails

    Returns:
        Model's response text

    Raises:
        ValueError: If OPENROUTER_API_KEY is not set in the environment.
        RuntimeError: If the API call fails (after the fallback attempt, if any),
            with the original exception chained as the cause.
    """
    api_key = os.getenv("OPENROUTER_API_KEY")
    if not api_key:
        raise ValueError("OPENROUTER_API_KEY not found in environment")
    # Build a single multimodal user message: text part first, image optional.
    content = [{"type": "text", "text": question}]
    if image_base64:
        content.append({
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{image_base64}"
            }
        })
    messages = [{"role": "user", "content": content}]
    try:
        response = requests.post(
            "https://openrouter.ai/api/v1/chat/completions",
            headers={
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json"
            },
            json={
                "model": model,
                "messages": messages,
                "max_tokens": 2048
            },
            timeout=60
        )
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"]
    except Exception as e:
        # Retry exactly once with the fallback; not passing fallback_model on
        # deliberately prevents infinite recursion between the two models.
        if fallback_model and fallback_model != model:
            print(f"Primary model {model} failed, trying fallback {fallback_model}: {e}")
            return call_openrouter_vision(fallback_model, question, image_base64)
        # Chain the cause so the original HTTP/parse error is not lost.
        raise RuntimeError(f"OpenRouter vision call failed: {e}") from e
@tool
def vision_analyze_image(question: str, image_path: str) -> str:
    """
    Analyze an image using AI vision model to answer questions about it.
    Use this for semantic understanding of images (chess positions, charts, diagrams, screenshots, etc.)
    Args:
        question: Question about the image
        image_path: Path to image file
    Returns:
        Analysis result from vision model
    """
    # NOTE: the docstring above doubles as the runtime tool description
    # surfaced to the agent by langchain; keep it in sync with behavior.
    try:
        # Read the file and base64-encode it for the data-URL payload.
        with open(image_path, "rb") as image_file:
            encoded = base64.b64encode(image_file.read()).decode("utf-8")
        # Primary vision model with an automatic fallback on failure.
        return call_openrouter_vision(
            model="qwen/qwen3-vl-30b-a3b-thinking",
            question=question,
            image_base64=encoded,
            fallback_model="google/gemini-2.5-flash"
        )
    except Exception as e:
        # Tools report errors as strings so the agent can react to them.
        return f"Error analyzing image: {str(e)}"
import subprocess
import glob
# Video/document analysis tools below reuse call_openrouter_vision (above).
@tool
def vision_analyze_video(question: str, video_path: str, num_frames: int = 5) -> str:
    """
    Analyze a video file by extracting key frames using FFmpeg.
    Args:
        question: Question about the video
        video_path: Path to video file
        num_frames: Number of frames to extract
    Returns:
        Analysis result combining insights from all frames
    """
    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            # Extract one frame per second; evenly-spaced frames are
            # sub-sampled below. check=False: a failed run simply produces
            # no frames, which the fallback path handles.
            subprocess.run([
                "ffmpeg", "-i", video_path,
                "-vf", "fps=1",
                os.path.join(tmpdir, "frame_%03d.jpg")
            ], check=False, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            frames = sorted(glob.glob(os.path.join(tmpdir, "*.jpg")))
            if not frames:
                # Fallback for very short clips (< 1s): take the first
                # num_frames decoded frames directly.
                subprocess.run([
                    "ffmpeg", "-i", video_path, "-vframes", str(num_frames),
                    os.path.join(tmpdir, "thumb%d.jpg")
                ], check=False, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
                frames = sorted(glob.glob(os.path.join(tmpdir, "*.jpg")))
            if not frames:
                # Nothing extracted: report instead of claiming "0 frames analyzed".
                return f"Error analyzing video: could not extract any frames from {video_path} (check that FFmpeg is installed and the file is a valid video)"
            # Pick num_frames evenly spaced across everything extracted.
            if len(frames) > num_frames:
                indices = np.linspace(0, len(frames) - 1, num_frames, dtype=int)
                selected_frames = [frames[i] for i in indices]
            else:
                selected_frames = frames
            # Analyze each selected frame independently, then combine.
            frames_analysis = []
            for idx, frame_path in enumerate(selected_frames):
                with open(frame_path, "rb") as f:
                    frame_b64 = base64.b64encode(f.read()).decode("utf-8")
                frame_question = f"Frame {idx+1}: {question}"
                analysis = call_openrouter_vision(
                    model="qwen/qwen3-vl-30b-a3b-thinking",
                    question=frame_question,
                    image_base64=frame_b64,
                    fallback_model="google/gemini-2.5-flash"
                )
                frames_analysis.append(f"Frame {idx+1}: {analysis}")
            combined = "\n\n".join(frames_analysis)
            return f"Video analysis ({len(selected_frames)} frames extracted via FFmpeg):\n{combined}"
    except Exception as e:
        # Tools report errors as strings so the agent can react to them.
        return f"Error analyzing video: {str(e)}"
@tool
def vision_analyze_document(question: str, file_path: str) -> str:
    """
    Analyze a document (TXT/MD) using AI.
    For PDF or other formats, please use Code Interpreter to extract text or convert to images first.
    Args:
        question: Question about the document
        file_path: Path to document file
    Returns:
        Analysis result from document content
    """
    # NOTE: the docstring above is the runtime tool description langchain
    # shows the agent; keep it in sync with behavior.
    try:
        text_suffixes = ('.txt', '.md', '.py', '.js', '.json', '.html', '.css', '.csv')
        # Guard clause: non-text formats must be converted by the agent first.
        if not file_path.lower().endswith(text_suffixes):
            return f"Direct analysis for this file type ({os.path.basename(file_path)}) is not supported directly. Please use the Code Interpreter tool to read/convert this file first."
        with open(file_path, 'r', encoding='utf-8') as doc:
            text_content = doc.read()
        # Truncate the document so the combined prompt stays within context.
        prompt = f"{question}\n\nDocument content:\n{text_content[:15000]}"
        # Text-only call (no image); Gemini handles long documents well.
        return call_openrouter_vision(
            model="google/gemini-3-flash-preview",
            question=prompt,
            fallback_model="google/gemini-2.5-flash"
        )
    except Exception as e:
        # Tools report errors as strings so the agent can react to them.
        return f"Error analyzing document: {str(e)}"