Spaces:
Sleeping
Sleeping
| """ | |
| Multimodal Analysis Tools using OpenRouter Vision Models | |
| """ | |
import base64
import os
import tempfile
from typing import Optional

import numpy as np
import requests
from langchain_core.tools import tool
def call_openrouter_vision(
    model: str,
    question: str,
    image_base64: Optional[str] = None,
    fallback_model: Optional[str] = None,
) -> str:
    """
    Call an OpenRouter chat-completions vision model for image analysis.

    Args:
        model: Model ID (e.g., "qwen/qwen3-vl-30b-a3b-thinking")
        question: Question about the image
        image_base64: Base64-encoded image data, or None for a text-only call
        fallback_model: Model retried once if the primary call fails

    Returns:
        The model's response text.

    Raises:
        ValueError: If OPENROUTER_API_KEY is not set in the environment.
        RuntimeError: If the API call fails (and any fallback also fails).
    """
    api_key = os.getenv("OPENROUTER_API_KEY")
    if not api_key:
        raise ValueError("OPENROUTER_API_KEY not found in environment")

    # Build a single user message; the image (if any) rides alongside the text.
    content = [{"type": "text", "text": question}]
    if image_base64:
        content.append({
            "type": "image_url",
            "image_url": {
                # NOTE(review): media type is hard-coded to JPEG; non-JPEG
                # inputs are mislabeled (most providers tolerate this).
                "url": f"data:image/jpeg;base64,{image_base64}"
            },
        })
    messages = [{"role": "user", "content": content}]

    try:
        response = requests.post(
            "https://openrouter.ai/api/v1/chat/completions",
            headers={
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json",
            },
            json={
                "model": model,
                "messages": messages,
                "max_tokens": 2048,
            },
            timeout=60,
        )
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"]
    except Exception as e:
        # One-shot fallback: the recursive call passes no fallback_model, so a
        # failing fallback raises rather than recursing further.
        if fallback_model and fallback_model != model:
            print(f"Primary model {model} failed, trying fallback {fallback_model}: {e}")
            return call_openrouter_vision(fallback_model, question, image_base64)
        # Chain the original cause so the real failure stays in the traceback.
        raise RuntimeError(f"OpenRouter vision call failed: {e}") from e
def vision_analyze_image(
    question: str,
    image_path: str,
    model: str = "qwen/qwen3-vl-30b-a3b-thinking",
    fallback_model: str = "google/gemini-2.5-flash",
) -> str:
    """
    Analyze an image using an AI vision model to answer questions about it.

    Use this for semantic understanding of images (chess positions, charts,
    diagrams, screenshots, etc.).

    Args:
        question: Question about the image
        image_path: Path to image file
        model: Primary OpenRouter vision model ID
        fallback_model: Model retried once if the primary call fails

    Returns:
        Analysis text from the vision model, or an
        "Error analyzing image: ..." string if reading or analysis fails.
    """
    try:
        # Read raw bytes and base64-encode them for the data-URL payload.
        with open(image_path, "rb") as f:
            image_data = base64.b64encode(f.read()).decode("utf-8")

        return call_openrouter_vision(
            model=model,
            question=question,
            image_base64=image_data,
            fallback_model=fallback_model,
        )
    except Exception as e:
        # Tool contract: never raise; surface failures as a readable string.
        return f"Error analyzing image: {str(e)}"
| import subprocess | |
| import glob | |
# Video-analysis helpers below reuse call_openrouter_vision defined above.
def vision_analyze_video(question: str, video_path: str, num_frames: int = 5) -> str:
    """
    Analyze a video by extracting key frames with FFmpeg and running each
    selected frame through the vision model.

    Args:
        question: Question about the video
        video_path: Path to video file
        num_frames: Maximum number of evenly spaced frames to analyze

    Returns:
        Combined per-frame analysis text, or an
        "Error analyzing video: ..." string if extraction or analysis fails.
    """
    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            # Sample one frame per second; we down-select to num_frames below.
            # (A previous revision passed a second, malformed -vf expression;
            # ffmpeg only honors the last -vf, so a single filter is equivalent.)
            subprocess.run(
                [
                    "ffmpeg", "-i", video_path,
                    "-vf", "fps=1",
                    os.path.join(tmpdir, "frame_%03d.jpg"),
                ],
                check=False, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
            )
            frames = sorted(glob.glob(os.path.join(tmpdir, "*.jpg")))

            if not frames:
                # Fallback for very short clips: grab the first few frames.
                subprocess.run(
                    [
                        "ffmpeg", "-i", video_path, "-vframes", "5",
                        os.path.join(tmpdir, "thumb%d.jpg"),
                    ],
                    check=False, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
                )
                frames = sorted(glob.glob(os.path.join(tmpdir, "*.jpg")))

            if not frames:
                # Previously this fell through and reported "0 frames" as success.
                raise ValueError(f"no frames could be extracted from {video_path}")

            # Pick num_frames evenly spaced across the extracted set.
            if len(frames) > num_frames:
                indices = np.linspace(0, len(frames) - 1, num_frames, dtype=int)
                selected_frames = [frames[i] for i in indices]
            else:
                selected_frames = frames

            frames_analysis = []
            for idx, frame_path in enumerate(selected_frames):
                with open(frame_path, "rb") as f:
                    frame_b64 = base64.b64encode(f.read()).decode("utf-8")
                analysis = call_openrouter_vision(
                    model="qwen/qwen3-vl-30b-a3b-thinking",
                    question=f"Frame {idx+1}: {question}",
                    image_base64=frame_b64,
                    fallback_model="google/gemini-2.5-flash",
                )
                frames_analysis.append(f"Frame {idx+1}: {analysis}")

            combined = "\n\n".join(frames_analysis)
            return f"Video analysis ({len(selected_frames)} frames extracted via FFmpeg):\n{combined}"
    except Exception as e:
        # Tool contract: never raise; surface failures as a readable string.
        return f"Error analyzing video: {str(e)}"
def vision_analyze_document(question: str, file_path: str, max_chars: int = 15000) -> str:
    """
    Analyze a plain-text document (TXT/MD/source/CSV, etc.) using AI.

    For PDF or other binary formats, use the Code Interpreter to extract text
    or convert to images first.

    Args:
        question: Question about the document
        file_path: Path to document file
        max_chars: Maximum number of document characters sent to the model
            (guards against oversized prompts; default matches prior behavior)

    Returns:
        Analysis text, a "not supported" notice for unknown file types, or an
        "Error analyzing document: ..." string on failure.
    """
    text_extensions = ('.txt', '.md', '.py', '.js', '.json', '.html', '.css', '.csv')
    try:
        if file_path.lower().endswith(text_extensions):
            with open(file_path, 'r', encoding='utf-8') as f:
                text_content = f.read()
            # Gemini handles long text documents well.
            return call_openrouter_vision(
                model="google/gemini-3-flash-preview",
                question=f"{question}\n\nDocument content:\n{text_content[:max_chars]}",
                fallback_model="google/gemini-2.5-flash",
            )
        return f"Direct analysis for this file type ({os.path.basename(file_path)}) is not supported directly. Please use the Code Interpreter tool to read/convert this file first."
    except Exception as e:
        # Tool contract: never raise; surface failures as a readable string.
        return f"Error analyzing document: {str(e)}"