ismdrobiul489 committed on
Commit
4c8e696
·
1 Parent(s): e224b41

Add complete Bar Race module with Brain, Scout, Surgeon, Artist, Director architecture

Browse files
modules/bar_race/__init__.py CHANGED
@@ -1,7 +1,15 @@
1
  """
2
  Bar Race Module
3
- Creates animated bar chart race videos.
4
- 100% self-contained - no dependency on other modules.
 
 
 
 
 
 
 
 
5
  """
6
  import logging
7
  from fastapi import FastAPI
@@ -15,6 +23,7 @@ MODULE_DESCRIPTION = "Bar Chart Race Video Generator"
15
 
16
  _app = None
17
 
 
18
  def register(app: FastAPI, config=None):
19
  """Register Bar Race module routes"""
20
  global _app
 
1
  """
2
  Bar Race Module
3
+ Intelligent Bar Chart Race Video Generator.
4
+
5
+ Architecture:
6
+ - Brain: LLM Planner (Gemini)
7
+ - Scout: Data Fetcher (APIs + Scraping)
8
+ - Surgeon: Data Cleaner
9
+ - Artist: Image Processor
10
+ - Director: Video Generator
11
+
12
+ 100% standalone - no dependency on other modules.
13
  """
14
  import logging
15
  from fastapi import FastAPI
 
23
 
24
  _app = None
25
 
26
+
27
  def register(app: FastAPI, config=None):
28
  """Register Bar Race module routes"""
29
  global _app
modules/bar_race/assets/fonts/.gitkeep ADDED
@@ -0,0 +1 @@
 
 
1
+ # Custom fonts for video rendering
modules/bar_race/assets/images/.gitkeep ADDED
@@ -0,0 +1 @@
 
 
1
+ # Entity images will be downloaded here during video generation
modules/bar_race/assets/music/.gitkeep ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Optional background music files
2
+ # Supported formats: .mp3, .wav, .m4a, .ogg
3
+ # Music will be automatically added if files exist here
modules/bar_race/data/__init__.py DELETED
@@ -1 +0,0 @@
1
- # Data directory init
 
 
modules/bar_race/data/topic_registry.py DELETED
@@ -1,154 +0,0 @@
1
- """
2
- Topic Registry
3
- 50+ pre-configured topics for bar chart race videos.
4
- Each topic has: title, unit, category, data source info.
5
- """
6
-
7
- TOPICS = {
8
- # =============================================
9
- # ECONOMICS & FINANCE (সবচেয়ে জনপ্রিয়)
10
- # =============================================
11
- "gdp_nominal": {
12
- "id": "gdp_nominal",
13
- "title": "Richest Countries by GDP (Nominal)",
14
- "unit": "Trillion USD",
15
- "category": "economics",
16
- "year_range": (1960, 2024),
17
- "top_n": 10,
18
- "description": "বিশ্বের সবচেয়ে ধনী দেশগুলোর তালিকা",
19
- "bar_color": "#4CAF50", # Green
20
- },
21
- "gdp_ppp": {
22
- "id": "gdp_ppp",
23
- "title": "Countries by GDP (PPP)",
24
- "unit": "Trillion USD",
25
- "category": "economics",
26
- "year_range": (1990, 2024),
27
- "top_n": 10,
28
- "description": "ক্রয়ক্ষমতার ভিত্তিতে ধনী দেশ",
29
- "bar_color": "#2196F3", # Blue
30
- },
31
- "gdp_per_capita": {
32
- "id": "gdp_per_capita",
33
- "title": "Richest Countries by GDP Per Capita",
34
- "unit": "USD",
35
- "category": "economics",
36
- "year_range": (1960, 2024),
37
- "top_n": 10,
38
- "description": "মাথাপিছু আয়ে সবচেয়ে ধনী দেশ",
39
- "bar_color": "#FF9800", # Orange
40
- },
41
-
42
- # =============================================
43
- # DEMOGRAPHICS & SOCIETY
44
- # =============================================
45
- "population": {
46
- "id": "population",
47
- "title": "Most Populated Countries",
48
- "unit": "Million",
49
- "category": "demographics",
50
- "year_range": (1960, 2024),
51
- "top_n": 10,
52
- "description": "বিশ্বের সবচেয়ে জনবহুল দেশ",
53
- "bar_color": "#9C27B0", # Purple
54
- },
55
- "life_expectancy": {
56
- "id": "life_expectancy",
57
- "title": "Countries by Life Expectancy",
58
- "unit": "Years",
59
- "category": "demographics",
60
- "year_range": (1960, 2024),
61
- "top_n": 10,
62
- "description": "কোন দেশের মানুষ গড়ে কতদিন বাঁচে",
63
- "bar_color": "#E91E63", # Pink
64
- },
65
-
66
- # =============================================
67
- # TECH & DIGITAL
68
- # =============================================
69
- "social_media_users": {
70
- "id": "social_media_users",
71
- "title": "Social Media Platforms by Users",
72
- "unit": "Billion Users",
73
- "category": "tech",
74
- "year_range": (2004, 2024),
75
- "top_n": 10,
76
- "description": "ফেসবুক, ইউটিউব, টিকটক ইউজার সংখ্যা",
77
- "bar_color": "#00BCD4", # Cyan
78
- },
79
- "browser_market_share": {
80
- "id": "browser_market_share",
81
- "title": "Browser Market Share",
82
- "unit": "% Share",
83
- "category": "tech",
84
- "year_range": (2008, 2024),
85
- "top_n": 8,
86
- "description": "ক্রোম, ফায়ারফক্স, এজ মার্কেট শেয়ার",
87
- "bar_color": "#3F51B5", # Indigo
88
- },
89
-
90
- # =============================================
91
- # ENTERTAINMENT
92
- # =============================================
93
- "youtube_subscribers": {
94
- "id": "youtube_subscribers",
95
- "title": "Most Subscribed YouTube Channels",
96
- "unit": "Million Subscribers",
97
- "category": "entertainment",
98
- "year_range": (2010, 2024),
99
- "top_n": 10,
100
- "description": "টি-সিরিজ বনাম মিস্টার বিস্ট বনাম পিউডিপাই",
101
- "bar_color": "#F44336", # Red (YouTube)
102
- },
103
-
104
- # =============================================
105
- # SPORTS
106
- # =============================================
107
- "olympic_medals": {
108
- "id": "olympic_medals",
109
- "title": "Countries by Olympic Gold Medals",
110
- "unit": "Gold Medals",
111
- "category": "sports",
112
- "year_range": (1896, 2024),
113
- "top_n": 10,
114
- "description": "অলিম্পিকে কোন দেশ কত স্বর্ণ জিতেছে",
115
- "bar_color": "#FFD700", # Gold
116
- },
117
-
118
- # =============================================
119
- # GEOPOLITICS
120
- # =============================================
121
- "military_spending": {
122
- "id": "military_spending",
123
- "title": "Countries by Military Expenditure",
124
- "unit": "Billion USD",
125
- "category": "geopolitics",
126
- "year_range": (1990, 2024),
127
- "top_n": 10,
128
- "description": "কোন দেশ সেনাবাহিনীতে কত খরচ করে",
129
- "bar_color": "#795548", # Brown
130
- },
131
- }
132
-
133
-
134
def get_topic(topic_id: str) -> "dict | None":
    """Look up a topic configuration by its registry ID.

    Args:
        topic_id: Registry key such as "gdp_nominal". Matching is
            case-insensitive (registry keys are lowercase).

    Returns:
        The topic's configuration dict, or None when the ID is unknown
        (the previous ``-> dict`` annotation hid this None case).
    """
    return TOPICS.get(topic_id.lower())
137
-
138
-
139
def list_topics() -> list:
    """Return a lightweight summary (id/title/category/description) per topic."""
    summaries = []
    for topic in TOPICS.values():
        summary = {}
        for key in ("id", "title", "category", "description"):
            summary[key] = topic[key]
        summaries.append(summary)
    return summaries
150
-
151
-
152
def get_topics_by_category(category: str) -> list:
    """Return every topic configuration whose "category" field equals *category*."""
    matching = []
    for topic in TOPICS.values():
        if topic["category"] == category:
            matching.append(topic)
    return matching
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
modules/bar_race/router.py CHANGED
@@ -5,16 +5,13 @@ API endpoints for bar chart race video generation.
5
  import logging
6
  import os
7
  import uuid
 
8
  import traceback
9
  from typing import Dict
10
  from fastapi import APIRouter, BackgroundTasks, HTTPException
11
- from fastapi.responses import FileResponse
12
 
13
- from .schemas import BarRaceRequest, JobResponse, JobStatus, TopicInfo
14
- from .data.topic_registry import TOPICS, get_topic, list_topics
15
- from .services.data_fetcher import DataFetcher
16
- from .services.bar_frame import BarFrameGenerator
17
- from .services.bar_composer import BarComposer
18
 
19
  logger = logging.getLogger(__name__)
20
 
@@ -24,15 +21,24 @@ router = APIRouter()
24
  jobs: Dict[str, dict] = {}
25
 
26
 
27
- def update_job(job_id: str, status: str, progress: int = 0, video_url: str = None, error: str = None):
 
 
 
 
 
 
 
28
  """Update job status"""
29
  if job_id in jobs:
30
  jobs[job_id].update({
31
  "status": status,
32
  "progress": progress,
 
33
  "video_url": video_url,
34
  "error": error
35
  })
 
36
 
37
 
38
  async def generate_bar_race_video(job_id: str, request: BarRaceRequest):
@@ -40,93 +46,119 @@ async def generate_bar_race_video(job_id: str, request: BarRaceRequest):
40
  temp_dir = f"temp/bar_race_{job_id}"
41
 
42
  try:
43
- update_job(job_id, "processing", 5)
44
  os.makedirs(temp_dir, exist_ok=True)
45
 
46
- # Get topic config
47
- topic_config = get_topic(request.topic)
48
- if not topic_config:
49
- topic_config = {
50
- "title": request.topic.replace("_", " ").title(),
51
- "unit": "Value",
52
- }
53
-
54
- title = topic_config.get("title", request.topic)
55
- unit = topic_config.get("unit", "")
56
-
57
- update_job(job_id, "processing", 10)
58
- logger.info(f"Fetching data for topic: {request.topic}")
59
-
60
- # Fetch data
61
- data_fetcher = DataFetcher()
62
- all_data = data_fetcher.fetch_data(
63
- topic_id=request.topic,
64
- year_start=request.year_start,
65
- year_end=request.year_end,
66
- top_n=request.top_n
67
- )
68
 
69
- update_job(job_id, "processing", 20)
70
- logger.info(f"Generating frames...")
71
 
72
- # Generate frames
73
- frame_generator = BarFrameGenerator()
74
- frames_dir = os.path.join(temp_dir, "frames")
75
 
76
- frame_paths = frame_generator.generate_frames(
77
- title=title,
78
- unit=unit,
79
- all_data=all_data,
80
- year_start=request.year_start,
81
- year_end=request.year_end,
82
- fps=request.fps,
83
- duration_seconds=request.duration_seconds,
84
- output_dir=frames_dir
85
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
- update_job(job_id, "processing", 70)
88
- logger.info(f"Composing video...")
89
 
90
- # Compose video
91
- composer = BarComposer()
92
- output_name = f"bar_race_{job_id}.mp4"
 
 
93
 
94
- video_path = composer.compose_video(
95
- frame_paths=frame_paths,
96
- output_name=output_name,
97
- fps=request.fps
 
 
 
 
 
 
 
 
 
98
  )
99
 
100
- update_job(job_id, "processing", 90)
 
101
 
102
- # Generate video URL
103
- video_url = f"/api/bar-race/video/{job_id}"
104
 
105
- # Cleanup temp frames
106
- composer.cleanup_frames(frames_dir)
107
 
108
- update_job(job_id, "ready", 100, video_url=video_url)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  logger.info(f"Bar race video ready: {video_url}")
110
 
 
 
 
 
 
 
 
 
111
  except Exception as e:
112
  logger.error(f"Bar race generation failed: {e}")
113
  logger.error(traceback.format_exc())
114
  update_job(job_id, "failed", error=str(e))
115
-
116
- finally:
117
- # Cleanup temp directory (keep video)
118
- if os.path.exists(temp_dir):
119
- import shutil
120
- shutil.rmtree(temp_dir, ignore_errors=True)
121
-
122
-
123
- @router.get("/topics")
124
- async def get_available_topics():
125
- """Get list of available topics"""
126
- return {
127
- "topics": list_topics(),
128
- "total": len(TOPICS)
129
- }
130
 
131
 
132
  @router.post("/generate", response_model=JobResponse)
@@ -134,19 +166,16 @@ async def generate_bar_race(request: BarRaceRequest, background_tasks: Backgroun
134
  """
135
  Generate a bar chart race video.
136
 
137
- Returns job_id to track progress.
138
  """
139
  job_id = str(uuid.uuid4())[:8]
140
 
141
- # Validate year range
142
- if request.year_start >= request.year_end:
143
- raise HTTPException(400, "year_start must be less than year_end")
144
-
145
  # Initialize job
146
  jobs[job_id] = {
147
  "job_id": job_id,
148
  "status": "queued",
149
  "progress": 0,
 
150
  "video_url": None,
151
  "error": None
152
  }
@@ -157,7 +186,7 @@ async def generate_bar_race(request: BarRaceRequest, background_tasks: Backgroun
157
  return JobResponse(
158
  job_id=job_id,
159
  status="queued",
160
- message=f"Bar race video generation started for topic: {request.topic}"
161
  )
162
 
163
 
 
5
  import logging
6
  import os
7
  import uuid
8
+ import shutil
9
  import traceback
10
  from typing import Dict
11
  from fastapi import APIRouter, BackgroundTasks, HTTPException
12
+ from fastapi.responses import FileResponse, RedirectResponse
13
 
14
+ from .schemas import BarRaceRequest, JobResponse, JobStatus
 
 
 
 
15
 
16
  logger = logging.getLogger(__name__)
17
 
 
21
  jobs: Dict[str, dict] = {}
22
 
23
 
24
def update_job(
    job_id: str,
    status: str,
    progress: int = 0,
    current_step: str = None,
    video_url: str = None,
    error: str = None
):
    """Merge the latest pipeline state into the in-memory job record.

    Unknown job_ids are silently ignored (nothing to update), but the
    debug line is emitted either way for traceability.
    """
    new_state = {
        "status": status,
        "progress": progress,
        "current_step": current_step,
        "video_url": video_url,
        "error": error,
    }
    if job_id in jobs:
        jobs[job_id].update(new_state)
    logger.debug(f"Job {job_id}: {status} ({progress}%) - {current_step}")
42
 
43
 
44
async def generate_bar_race_video(job_id: str, request: BarRaceRequest):
    """Background task: run the full Brain -> Scout -> Surgeon -> Artist ->
    Director pipeline for one job, then upload/serve the resulting video.

    Progress and errors are reported solely through update_job(); nothing is
    returned. Service modules are imported lazily inside each stage —
    presumably to keep router import cheap and isolate optional deps; confirm.
    On success the temp dir is removed; on failure it is kept for debugging.
    """
    # Per-job scratch space for downloads, cleaned data and frames.
    temp_dir = f"temp/bar_race_{job_id}"

    try:
        os.makedirs(temp_dir, exist_ok=True)

        # Get API key from environment (may be None; Brain decides how to cope).
        gemini_api_key = os.getenv("GEMINI_API_KEY")

        # ============ BRAIN ============
        # LLM planner: turns the free-form topic prompt into a structured plan.
        update_job(job_id, "processing", 5, "Brain: Analyzing topic...")

        from .services.brain import Brain
        brain = Brain(gemini_api_key=gemini_api_key)
        plan = brain.generate_plan(request.topic)

        if not plan:
            raise Exception("Brain failed to generate plan")

        logger.info(f"Brain: Generated plan for entity_type={plan.get('entity_type')}")

        # ============ SCOUT ============
        # Data fetcher: returns a raw DataFrame per the plan's search strategies.
        update_job(job_id, "processing", 15, "Scout: Fetching data...")

        from .services.scout import Scout
        scout = Scout(temp_dir=temp_dir)
        raw_df = scout.fetch_data(plan)

        if raw_df is None or raw_df.empty:
            raise Exception("Scout failed to fetch data")

        logger.info(f"Scout: Fetched {len(raw_df)} rows")

        # ============ SURGEON ============
        # Data cleaner: normalizes the raw rows; expected to yield a 'name' column.
        update_job(job_id, "processing", 35, "Surgeon: Cleaning data...")

        from .services.surgeon import Surgeon
        surgeon = Surgeon(temp_dir=temp_dir)
        clean_df = surgeon.clean_data(raw_df, plan)

        if clean_df is None or clean_df.empty:
            raise Exception("Surgeon failed to clean data")

        logger.info(f"Surgeon: Cleaned data, {len(clean_df)} rows, {clean_df['name'].nunique()} entities")

        # ============ ARTIST ============
        # Image processor: one avatar per unique entity (missing images tolerated).
        update_job(job_id, "processing", 50, "Artist: Processing images...")

        from .services.artist import Artist
        artist = Artist(temp_dir=temp_dir)
        entities = clean_df["name"].unique().tolist()
        entity_type = plan.get("entity_type", "general")
        image_paths = artist.process_entities(entities, entity_type)

        logger.info(f"Artist: Processed {len(image_paths)} images")

        # ============ DIRECTOR ============
        # Video generator: renders the bar race from cleaned data + avatars.
        update_job(job_id, "processing", 65, "Director: Generating video...")

        from .services.director import Director
        director = Director(temp_dir=temp_dir)
        video_path = director.generate_video(
            df=clean_df,
            plan=plan,
            image_paths=image_paths,
            duration_seconds=request.duration_seconds,
            job_id=job_id
        )

        if not video_path or not os.path.exists(video_path):
            raise Exception("Director failed to generate video")

        logger.info(f"Director: Generated video at {video_path}")

        # ============ UPLOAD TO HF ============
        # Best-effort upload to Hugging Face storage; failure falls back to
        # serving the local file, so errors here never fail the job.
        update_job(job_id, "processing", 85, "Uploading to cloud storage...")

        video_url = None
        try:
            from modules.shared.services.hf_storage import get_hf_storage
            hf_storage = get_hf_storage()

            if hf_storage and hf_storage.enabled:
                # Upload video
                uploaded_url = hf_storage.upload_file(
                    local_path=video_path,
                    remote_path=f"bar_race/{job_id}.mp4"
                )
                if uploaded_url:
                    video_url = uploaded_url
                    logger.info(f"Uploaded to HF: {video_url}")
        except Exception as e:
            logger.warning(f"HF upload failed, using local: {e}")

        # Fallback to local URL (served by this app's video endpoint)
        if not video_url:
            video_url = f"/api/bar-race/video/{job_id}"

        # ============ SUCCESS ============
        update_job(job_id, "ready", 100, "Complete", video_url=video_url)
        logger.info(f"Bar race video ready: {video_url}")

        # Cleanup temp files (only on success)
        try:
            if os.path.exists(temp_dir):
                shutil.rmtree(temp_dir)
                logger.info(f"Cleaned up temp directory: {temp_dir}")
        except Exception as e:
            logger.warning(f"Cleanup failed: {e}")

    except Exception as e:
        # Any stage failure lands here: record it on the job for the client.
        logger.error(f"Bar race generation failed: {e}")
        logger.error(traceback.format_exc())
        update_job(job_id, "failed", error=str(e))

        # Keep temp files for debugging on failure
        logger.info(f"Keeping temp directory for debugging: {temp_dir}")
 
 
 
 
 
 
 
 
 
 
 
 
162
 
163
 
164
  @router.post("/generate", response_model=JobResponse)
 
166
  """
167
  Generate a bar chart race video.
168
 
169
+ Takes a topic and duration, returns job_id to track progress.
170
  """
171
  job_id = str(uuid.uuid4())[:8]
172
 
 
 
 
 
173
  # Initialize job
174
  jobs[job_id] = {
175
  "job_id": job_id,
176
  "status": "queued",
177
  "progress": 0,
178
+ "current_step": "Initializing...",
179
  "video_url": None,
180
  "error": None
181
  }
 
186
  return JobResponse(
187
  job_id=job_id,
188
  status="queued",
189
+ message=f"Bar race generation started for topic: {request.topic}"
190
  )
191
 
192
 
modules/bar_race/schemas.py CHANGED
@@ -3,40 +3,43 @@ Bar Race Schemas
3
  Pydantic models for bar chart race video generation.
4
  """
5
  from pydantic import BaseModel, Field
6
- from typing import Optional, List
7
  from enum import Enum
8
 
9
 
10
class TopicCategory(str, Enum):
    """Category a pre-configured topic belongs to.

    String-valued so it compares equal to the plain "category" strings
    stored in the topic registry entries.
    """
    ECONOMICS = "economics"
    TECH = "tech"
    BUSINESS = "business"
    ENTERTAINMENT = "entertainment"
    DEMOGRAPHICS = "demographics"
    SPORTS = "sports"
    GEOPOLITICS = "geopolitics"
    ENVIRONMENT = "environment"
    UNIQUE = "unique"
20
 
21
 
22
class BarRaceRequest(BaseModel):
    """Request to generate a bar chart race video"""
    # Registry key; unknown topics fall back to a generic config in the router.
    topic: str = Field(..., description="Topic ID from registry (e.g., 'gdp_nominal')")
    year_start: int = Field(2000, ge=1900, le=2030, description="Start year")
    # NOTE: year_start < year_end is enforced in the /generate endpoint, not here.
    year_end: int = Field(2024, ge=1900, le=2030, description="End year")
    top_n: int = Field(10, ge=5, le=20, description="Number of bars to show")
    duration_seconds: int = Field(60, ge=10, le=180, description="Video duration in seconds")
    fps: int = Field(30, ge=24, le=60, description="Frames per second")
30
-
31
-
32
class TopicInfo(BaseModel):
    """Information about a topic (mirrors a topic_registry entry)."""
    id: str
    title: str
    category: TopicCategory
    unit: str  # value axis label, e.g. "Trillion USD"
    year_range: tuple  # (first_year, last_year) as stored in the registry
    description: str  # user-facing description (Bengali in the registry)
 
 
 
 
 
 
 
40
 
41
 
42
  class JobResponse(BaseModel):
@@ -49,7 +52,8 @@ class JobResponse(BaseModel):
49
  class JobStatus(BaseModel):
50
  """Job status response"""
51
  job_id: str
52
- status: str # queued, processing, ready, failed
53
  progress: int = 0
 
54
  video_url: Optional[str] = None
55
  error: Optional[str] = None
 
3
  Pydantic models for bar chart race video generation.
4
  """
5
  from pydantic import BaseModel, Field
6
+ from typing import Optional, List, Dict, Any
7
  from enum import Enum
8
 
9
 
10
class EntityType(str, Enum):
    """Type of entities in the bar chart.

    Drives the Artist's image search: flags for countries, portraits
    (plus optional background removal) for persons, logos otherwise.
    """
    PERSON = "person"
    COUNTRY = "country"
    COMPANY = "company"
    GENERAL = "general"  # fallback for anything else
 
 
 
 
16
 
17
 
18
class BarRaceRequest(BaseModel):
    """Request to generate a bar chart race video"""
    # Free-form prompt; the Brain (LLM planner) turns it into a structured plan.
    topic: str = Field(..., description="Topic/prompt for video (e.g., 'Top 10 richest cricketers')")
    duration_seconds: int = Field(60, ge=30, le=120, description="Video duration in seconds")

    class Config:
        # Example payload surfaced in the OpenAPI/Swagger docs.
        json_schema_extra = {
            "example": {
                "topic": "Top 10 richest countries by GDP 2000-2024",
                "duration_seconds": 60
            }
        }
30
+
31
+
32
+ class BrainPlan(BaseModel):
33
+ """JSON plan generated by Brain (LLM)"""
34
+ topic: str
35
+ entity_type: EntityType
36
+ time_config: Dict[str, Any]
37
+ value_intent: Dict[str, Any]
38
+ search_strategies: List[Dict[str, Any]]
39
+ source_priority: List[str]
40
+ data_expectation: Dict[str, Any]
41
+ visualization: Dict[str, Any]
42
+ video_meta: Dict[str, Any]
43
 
44
 
45
  class JobResponse(BaseModel):
 
52
class JobStatus(BaseModel):
    """Job status response (polled by clients while a video renders)."""
    job_id: str
    status: str  # queued, brain, scout, surgeon, artist, director, uploading, ready, failed
    # NOTE(review): the router currently only sets queued/processing/ready/failed.
    progress: int = 0  # 0-100 percent
    current_step: Optional[str] = None  # human-readable description of the active stage
    video_url: Optional[str] = None  # populated when status == "ready"
    error: Optional[str] = None  # populated when status == "failed"
modules/bar_race/services/__init__.py CHANGED
@@ -1 +1 @@
1
- # Services init
 
1
+ # Services package
modules/bar_race/services/artist.py ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Artist - Image Processor
3
+ Downloads and processes entity images for bar chart race.
4
+ """
5
+ import logging
6
+ import requests
7
+ import os
8
+ from PIL import Image, ImageDraw
9
+ from typing import Dict, Any, List, Optional
10
+ from io import BytesIO
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class Artist:
    """
    Image Processor for Bar Race video generation.

    Resolves one avatar image per entity by trying, in order:
      1. Wikipedia page thumbnail (the "Flag of X" page for countries)
      2. DuckDuckGo image search (if duckduckgo_search is installed)
      3. A generated colored-initial placeholder (always succeeds)
    Each image is center-cropped to a square, optionally background-removed
    (persons only, when rembg is installed) and masked to a circle.
    """

    # Browser-like UA: some image hosts reject requests without one.
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }

    # Side length in pixels of the processed (square/circular) image.
    IMAGE_SIZE = 80

    def __init__(self, temp_dir: str):
        """Create the Artist; processed images are written to <temp_dir>/images."""
        self.temp_dir = temp_dir
        self.images_dir = os.path.join(temp_dir, "images")
        os.makedirs(self.images_dir, exist_ok=True)

        # Background removal is an optional extra; probe for it once.
        self.rembg_available = False
        try:
            import rembg  # noqa: F401 -- availability probe only
            self.rembg_available = True
            logger.info("Artist: rembg available for background removal")
        except ImportError:
            logger.info("Artist: rembg not available, skipping background removal")

    def process_entities(self, entities: List[str], entity_type: str) -> Dict[str, str]:
        """
        Download and process images for all entities.

        Args:
            entities: List of entity names
            entity_type: Type of entity (person, country, company, general)

        Returns:
            Dict mapping entity name to processed image path. Entities whose
            image could not be produced are omitted from the dict.
        """
        logger.info(f"Artist: Processing images for {len(entities)} entities (type: {entity_type})")

        image_paths = {}

        for entity in entities:
            try:
                image_path = self._process_entity(entity, entity_type)
                if image_path:
                    image_paths[entity] = image_path
                    logger.debug(f"Artist: Processed image for {entity}")
                else:
                    logger.warning(f"Artist: No image found for {entity}")
            except Exception as e:
                # One bad entity must not abort the whole batch.
                logger.warning(f"Artist: Failed to process {entity}: {e}")

        logger.info(f"Artist: Processed {len(image_paths)}/{len(entities)} images")
        return image_paths

    def _process_entity(self, entity: str, entity_type: str) -> Optional[str]:
        """Fetch, normalize and save a single entity's image; return its path."""
        image = self._get_image(entity, entity_type)

        if image is None:
            return None

        try:
            # Normalize: RGBA -> square crop -> (optional bg removal) -> circle.
            image = image.convert("RGBA")
            image = self._resize_to_square(image)

            # Background removal only makes sense for portraits.
            if self.rembg_available and entity_type == "person":
                image = self._remove_background(image)

            image = self._apply_circular_mask(image)

            # Sanitize the entity name so it is a safe filename.
            safe_name = "".join(c if c.isalnum() else "_" for c in entity)
            output_path = os.path.join(self.images_dir, f"{safe_name}.png")
            image.save(output_path, "PNG")

            return output_path

        except Exception as e:
            logger.error(f"Artist: Error processing image for {entity}: {e}")
            return None

    def _get_image(self, entity: str, entity_type: str) -> Optional[Image.Image]:
        """Resolve an image via Wikipedia, then DuckDuckGo, then a placeholder."""
        # Priority 1: Wikipedia page thumbnail
        image = self._search_wikipedia_commons(entity, entity_type)
        if image:
            return image

        # Priority 2: DuckDuckGo image search
        image = self._search_duckduckgo(entity, entity_type)
        if image:
            return image

        # Priority 3: generated placeholder -- this chain never returns None.
        return self._generate_placeholder(entity)

    def _search_wikipedia_commons(self, entity: str, entity_type: str) -> Optional[Image.Image]:
        """Fetch the Wikipedia page thumbnail for the entity (flag page for countries)."""
        try:
            # For countries the flag article gives a cleaner avatar than the
            # country article's lead image.
            if entity_type == "country":
                search_query = f"Flag of {entity}"
            else:
                search_query = entity

            # MediaWiki Action API: pageimages gives the page's lead thumbnail.
            search_url = "https://en.wikipedia.org/w/api.php"
            params = {
                "action": "query",
                "titles": search_query,
                "prop": "pageimages",
                "format": "json",
                "pithumbsize": 200
            }

            response = requests.get(search_url, params=params, headers=self.HEADERS, timeout=10)
            if response.status_code == 200:
                data = response.json()
                pages = data.get("query", {}).get("pages", {})

                # Page ids are irrelevant here; only the thumbnail matters.
                for page_data in pages.values():
                    if "thumbnail" in page_data:
                        image_url = page_data["thumbnail"]["source"]
                        return self._download_image(image_url)

        except Exception as e:
            logger.debug(f"Artist: Wikipedia Commons search failed for {entity}: {e}")

        return None

    def _search_duckduckgo(self, entity: str, entity_type: str) -> Optional[Image.Image]:
        """Image-search fallback; returns None if duckduckgo_search is not installed."""
        try:
            from duckduckgo_search import DDGS

            # Tailor the query to the entity type for better hits.
            if entity_type == "country":
                query = f"{entity} flag icon"
            elif entity_type == "person":
                query = f"{entity} portrait photo"
            else:
                query = f"{entity} logo"

            with DDGS() as ddgs:
                results = list(ddgs.images(query, max_results=3))

            # Take the first result that actually downloads.
            for result in results:
                image_url = result.get("image")
                if image_url:
                    image = self._download_image(image_url)
                    if image:
                        return image

        except ImportError:
            logger.debug("Artist: duckduckgo-search not available")
        except Exception as e:
            logger.debug(f"Artist: DuckDuckGo search failed for {entity}: {e}")

        return None

    def _download_image(self, url: str) -> Optional[Image.Image]:
        """Download an image from a URL; return None on any failure."""
        try:
            response = requests.get(url, headers=self.HEADERS, timeout=10)
            if response.status_code == 200:
                return Image.open(BytesIO(response.content))
        except Exception as e:
            logger.debug(f"Artist: Failed to download image: {e}")

        return None

    def _resize_to_square(self, image: Image.Image) -> Image.Image:
        """Center-crop to a square and scale to IMAGE_SIZE x IMAGE_SIZE."""
        width, height = image.size

        # Determine the centered square crop box.
        if width > height:
            left = (width - height) // 2
            top = 0
            right = left + height
            bottom = height
        else:
            left = 0
            top = (height - width) // 2
            right = width
            bottom = top + width

        image = image.crop((left, top, right, bottom))
        image = image.resize((self.IMAGE_SIZE, self.IMAGE_SIZE), Image.Resampling.LANCZOS)

        return image

    def _remove_background(self, image: Image.Image) -> Image.Image:
        """Remove the background via rembg; on failure return the image unchanged."""
        try:
            import rembg

            # rembg works on encoded bytes, so round-trip through PNG.
            img_bytes = BytesIO()
            image.save(img_bytes, format="PNG")
            img_bytes.seek(0)

            output = rembg.remove(img_bytes.getvalue())

            return Image.open(BytesIO(output))

        except Exception as e:
            logger.warning(f"Artist: Background removal failed: {e}")
            return image

    def _apply_circular_mask(self, image: Image.Image) -> Image.Image:
        """Mask the (square) image to a circle with transparent corners."""
        if image.mode != "RGBA":
            image = image.convert("RGBA")

        size = image.size[0]  # image is square by the time this is called

        # Grayscale mask: white disc = visible, black corners = transparent.
        mask = Image.new("L", (size, size), 0)
        draw = ImageDraw.Draw(mask)
        draw.ellipse((0, 0, size, size), fill=255)

        output = Image.new("RGBA", (size, size), (0, 0, 0, 0))
        output.paste(image, (0, 0), mask)

        return output

    def _generate_placeholder(self, entity: str) -> Image.Image:
        """Generate a colored placeholder showing the entity's initial."""
        size = self.IMAGE_SIZE

        colors = [
            (74, 222, 128),   # Green
            (251, 191, 36),   # Yellow
            (239, 68, 68),    # Red
            (59, 130, 246),   # Blue
            (168, 85, 247),   # Purple
            (20, 184, 166),   # Teal
        ]

        # Deterministic color per entity. (Builtin hash() on strings is salted
        # per process, so it would pick a different color on every run.)
        color = colors[sum(entity.encode("utf-8")) % len(colors)]

        image = Image.new("RGBA", (size, size), color)
        draw = ImageDraw.Draw(image)

        initial = entity[0].upper() if entity else "?"

        from PIL import ImageFont  # local import mirrors the original's lazy use

        try:
            font = ImageFont.truetype("arial.ttf", size // 2)
        except OSError:
            # Arial isn't available (e.g. Linux containers): PIL's builtin font.
            font = ImageFont.load_default()

        # Center the glyph using its rendered bounding box.
        bbox = draw.textbbox((0, 0), initial, font=font)
        text_width = bbox[2] - bbox[0]
        text_height = bbox[3] - bbox[1]
        x = (size - text_width) // 2
        y = (size - text_height) // 2 - bbox[1]

        draw.text((x, y), initial, fill=(255, 255, 255), font=font)

        return image
modules/bar_race/services/bar_composer.py DELETED
@@ -1,91 +0,0 @@
1
- """
2
- Bar Composer Service
3
- Assembles bar race frames into final video.
4
- No TTS - pure visual animation with optional background music.
5
- """
6
- import logging
7
- import os
8
- import shutil
9
- from pathlib import Path
10
- from typing import List, Optional
11
- from moviepy.editor import ImageSequenceClip
12
-
13
- logger = logging.getLogger(__name__)
14
-
15
-
16
class BarComposer:
    """
    Composes the final bar race video from a sequence of rendered frames.

    - No TTS dependency: output is purely visual.
    - Optional background music: shorter tracks are looped to cover the
      whole video (matching Director._add_background_music's behavior),
      longer tracks are trimmed.
    """

    # Default frame rate; compose_video's fps argument takes precedence.
    FPS = 30

    def __init__(self, output_dir: str = "videos/bar_race"):
        """Create the composer and ensure the output directory exists."""
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    def compose_video(
        self,
        frame_paths: List[str],
        output_name: str,
        fps: int = 30,
        music_path: Optional[str] = None
    ) -> str:
        """
        Compose an MP4 video from a frame sequence.

        Args:
            frame_paths: Ordered list of frame image paths.
            output_name: Output video filename (joined onto output_dir).
            fps: Frames per second.
            music_path: Optional background music path; silently ignored if
                the file does not exist.

        Returns:
            Path to the written video file.

        Raises:
            Exception: re-raised from MoviePy if encoding fails (callers are
                expected to handle/report the failure).
        """
        output_path = os.path.join(self.output_dir, output_name)

        try:
            logger.info(f"Composing video from {len(frame_paths)} frames...")

            # Create video clip from frames
            video_clip = ImageSequenceClip(frame_paths, fps=fps)

            # Track whether audio was actually attached so the encoder is
            # only asked for an audio codec when there is audio to encode
            # (previously this was keyed on music_path alone).
            has_audio = False
            if music_path and os.path.exists(music_path):
                from moviepy.editor import AudioFileClip, concatenate_audioclips
                audio = AudioFileClip(music_path)
                # Loop the track if it is shorter than the video...
                if audio.duration < video_clip.duration:
                    loops_needed = int(video_clip.duration / audio.duration) + 1
                    audio = concatenate_audioclips([audio] * loops_needed)
                # ...then trim to the exact video length.
                audio = audio.subclip(0, video_clip.duration)
                video_clip = video_clip.set_audio(audio)
                has_audio = True

            # Write video
            logger.info(f"Writing video to {output_path}")
            video_clip.write_videofile(
                output_path,
                fps=fps,
                codec="libx264",
                audio_codec="aac" if has_audio else None,
                preset="medium",
                threads=4,
                logger=None
            )

            # Cleanup
            video_clip.close()

            logger.info(f"Bar race video complete: {output_path}")
            return output_path

        except Exception as e:
            logger.error(f"Failed to compose video: {e}")
            raise

    def cleanup_frames(self, frame_dir: str):
        """Remove a temporary frame directory (no-op if it does not exist)."""
        if os.path.exists(frame_dir):
            shutil.rmtree(frame_dir)
            logger.info(f"Cleaned up frames: {frame_dir}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
modules/bar_race/services/bar_frame.py DELETED
@@ -1,301 +0,0 @@
1
- """
2
- Bar Frame Generator
3
- Creates animated bar chart race frames using Pillow.
4
- """
5
- import logging
6
- import os
7
- from PIL import Image, ImageDraw, ImageFont
8
- from typing import Dict, List, Tuple
9
-
10
- logger = logging.getLogger(__name__)
11
-
12
-
13
class BarFrameGenerator:
    """
    Renders individual frames for a bar chart race animation with Pillow.

    - Horizontal bars sorted by value, coloured consistently per entity.
    - Linear interpolation between consecutive years for smooth motion.
    - Large year counter at the bottom of each 9:16 frame.

    PIL type names in annotations are written as strings so that importing
    this class does not require Pillow until a frame is actually rendered.
    """

    # Canvas dimensions (9:16 vertical)
    WIDTH = 1080
    HEIGHT = 1920

    # Colors (RGB)
    BG_COLOR = (18, 18, 28)            # Dark navy
    TITLE_COLOR = (255, 255, 255)      # White
    YEAR_COLOR = (100, 100, 120)       # Muted gray
    BAR_LABEL_COLOR = (255, 255, 255)  # White
    VALUE_COLOR = (200, 200, 200)      # Light gray

    # Bar colors (cycled through as new entities appear)
    BAR_COLORS = [
        (74, 222, 128),   # Green
        (251, 191, 36),   # Yellow
        (239, 68, 68),    # Red
        (59, 130, 246),   # Blue
        (168, 85, 247),   # Purple
        (20, 184, 166),   # Teal
        (249, 115, 22),   # Orange
        (236, 72, 153),   # Pink
        (34, 197, 94),    # Emerald
        (99, 102, 241),   # Indigo
    ]

    # Layout constants (pixels)
    TITLE_Y = 80
    YEAR_Y = 1700        # Large year counter near the bottom
    BAR_START_Y = 200
    BAR_HEIGHT = 80
    BAR_GAP = 30
    BAR_MAX_WIDTH = 900
    BAR_X_START = 160

    def __init__(self):
        self._load_fonts()
        # Entity name -> RGB colour, assigned on first sight and kept stable
        # for the lifetime of this generator.
        self.entity_colors = {}

    def _load_fonts(self):
        """Load TrueType fonts with per-platform fallbacks.

        Tries Windows then Linux font paths; falls back to PIL's builtin
        bitmap font if none can be loaded.
        """
        font_paths = [
            "C:/Windows/Fonts/arial.ttf",
            "C:/Windows/Fonts/ArialBD.ttf",
            "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
        ]

        self.font_title = None
        self.font_label = None
        self.font_value = None
        self.font_year = None

        for path in font_paths:
            if os.path.exists(path):
                try:
                    from PIL import ImageFont
                    self.font_title = ImageFont.truetype(path, 52)
                    self.font_label = ImageFont.truetype(path, 32)
                    self.font_value = ImageFont.truetype(path, 28)
                    self.font_year = ImageFont.truetype(path, 200)
                    logger.info(f"Loaded font: {path}")
                    break
                except Exception as e:
                    logger.warning(f"Failed to load font {path}: {e}")

        if not self.font_title:
            from PIL import ImageFont
            self.font_title = ImageFont.load_default()
            self.font_label = ImageFont.load_default()
            self.font_value = ImageFont.load_default()
            self.font_year = ImageFont.load_default()
            logger.warning("Using default font")

    def _get_entity_color(self, entity: str) -> Tuple[int, int, int]:
        """Return a stable colour for *entity*, assigning one on first use."""
        if entity not in self.entity_colors:
            color_index = len(self.entity_colors) % len(self.BAR_COLORS)
            self.entity_colors[entity] = self.BAR_COLORS[color_index]
        return self.entity_colors[entity]

    def _draw_rounded_rect(self, draw: "ImageDraw.ImageDraw", bbox: Tuple, fill: Tuple, radius: int = 15):
        """Draw a filled rounded rectangle on *draw*."""
        draw.rounded_rectangle(bbox, radius=radius, fill=fill)

    def create_frame(
        self,
        title: str,
        unit: str,
        year: float,  # can be fractional for smooth animation
        bars_data: List[Dict],  # [{name, value}, ...] sorted descending
        max_value: float
    ) -> "Image.Image":
        """
        Render a single frame of the bar chart race.

        Args:
            title: Chart title, centered at the top.
            unit: Value unit appended to each bar's label (e.g. "Trillion USD").
            year: Current (possibly fractional) year; displayed truncated.
            bars_data: List of {name, value} dicts sorted by value descending;
                only the first 10 are drawn.
            max_value: Scale maximum; a non-positive value is treated as 1.0
                to avoid division by zero.

        Returns:
            The rendered PIL image (WIDTH x HEIGHT).
        """
        from PIL import Image, ImageDraw
        img = Image.new('RGB', (self.WIDTH, self.HEIGHT), self.BG_COLOR)
        draw = ImageDraw.Draw(img)

        # Title, horizontally centered
        title_text = title
        bbox = draw.textbbox((0, 0), title_text, font=self.font_title)
        title_x = (self.WIDTH - (bbox[2] - bbox[0])) // 2
        draw.text((title_x, self.TITLE_Y), title_text, fill=self.TITLE_COLOR, font=self.font_title)

        # Year counter (large, near the bottom)
        year_text = str(int(year))
        bbox = draw.textbbox((0, 0), year_text, font=self.font_year)
        year_x = (self.WIDTH - (bbox[2] - bbox[0])) // 2
        draw.text((year_x, self.YEAR_Y), year_text, fill=self.YEAR_COLOR, font=self.font_year)

        # Defensive scale: all-zero data would otherwise divide by zero.
        scale = max_value if max_value > 0 else 1.0

        # Draw bars (max 10)
        for i, bar in enumerate(bars_data[:10]):
            y = self.BAR_START_Y + i * (self.BAR_HEIGHT + self.BAR_GAP)

            # Bar width proportional to value, with a minimum so tiny values
            # remain visible.
            bar_width = int((bar["value"] / scale) * self.BAR_MAX_WIDTH)
            bar_width = max(50, bar_width)

            color = self._get_entity_color(bar["name"])

            self._draw_rounded_rect(
                draw,
                (self.BAR_X_START, y, self.BAR_X_START + bar_width, y + self.BAR_HEIGHT),
                color,
                radius=10
            )

            # Entity name: inside the bar if it fits, otherwise to the left.
            name_text = bar["name"]
            name_bbox = draw.textbbox((0, 0), name_text, font=self.font_label)
            name_width = name_bbox[2] - name_bbox[0]

            if name_width < bar_width - 20:
                name_x = self.BAR_X_START + 15
            else:
                name_x = 10

            name_y = y + (self.BAR_HEIGHT - (name_bbox[3] - name_bbox[1])) // 2
            draw.text((name_x, name_y), name_text, fill=self.BAR_LABEL_COLOR, font=self.font_label)

            # Value label to the right of the bar.
            value_text = f"{bar['value']:.1f} {unit}"
            value_bbox = draw.textbbox((0, 0), value_text, font=self.font_value)
            value_x = self.BAR_X_START + bar_width + 15
            value_y = y + (self.BAR_HEIGHT - (value_bbox[3] - value_bbox[1])) // 2
            draw.text((value_x, value_y), value_text, fill=self.VALUE_COLOR, font=self.font_value)

        return img

    def interpolate_data(
        self,
        data_start: List[Dict],
        data_end: List[Dict],
        progress: float  # 0.0 to 1.0
    ) -> List[Dict]:
        """
        Linearly interpolate between two years of data for smooth animation.

        Entities present only in *data_end* (new entrants) are faded in from
        zero rather than popping in abruptly at the year boundary; entities
        missing from *data_end* hold their start value.

        Returns the interpolated list sorted by value descending.
        """
        end_values = {d["name"]: d["value"] for d in data_end}
        start_names = {d["name"] for d in data_start}

        interpolated = []
        for d in data_start:
            name = d["name"]
            start_val = d["value"]
            # Missing end value -> hold the start value steady.
            end_val = end_values.get(name, start_val)
            interpolated.append({
                "name": name,
                "value": start_val + (end_val - start_val) * progress
            })

        # New entrants: fade in from 0 toward their end-year value.
        for d in data_end:
            if d["name"] not in start_names:
                interpolated.append({
                    "name": d["name"],
                    "value": d["value"] * progress
                })

        interpolated.sort(key=lambda x: x["value"], reverse=True)

        return interpolated

    def generate_frames(
        self,
        title: str,
        unit: str,
        all_data: List[Dict],  # [{name, year, value}, ...]
        year_start: int,
        year_end: int,
        fps: int = 30,
        duration_seconds: int = 60,
        output_dir: str = "temp_frames"
    ) -> List[str]:
        """
        Generate all frames for the bar chart race.

        Frames are written to *output_dir* as frame_00000.png, frame_00001.png,
        ... plus one extra second holding the final year on screen.

        Returns:
            Ordered list of frame file paths.
        """
        os.makedirs(output_dir, exist_ok=True)

        total_frames = fps * duration_seconds
        # Guard against year_start == year_end (a single snapshot): treat it
        # as one transition so the division below cannot raise
        # ZeroDivisionError.
        years_count = max(1, year_end - year_start)
        frames_per_year = total_frames / years_count

        # Fixed scale across the whole animation (10% headroom); default=1.0
        # keeps an empty dataset from raising ValueError on max().
        max_value = max((d["value"] for d in all_data), default=1.0) * 1.1

        # Group data by year
        data_by_year = {}
        for d in all_data:
            data_by_year.setdefault(d["year"], []).append(
                {"name": d["name"], "value": d["value"]}
            )

        # Sort each year's data descending by value
        for year_data in data_by_year.values():
            year_data.sort(key=lambda x: x["value"], reverse=True)

        frame_paths = []
        frame_num = 0

        for year in range(year_start, year_end):
            # Data for the current year and the one we are animating toward
            current_data = data_by_year.get(year, [])
            next_data = data_by_year.get(year + 1, current_data)

            # At least one frame per year, even for very short durations
            frames_for_this_year = max(1, int(frames_per_year))

            for f in range(frames_for_this_year):
                progress = f / frames_for_this_year

                interpolated = self.interpolate_data(current_data, next_data, progress)

                # Fractional year for the on-screen counter
                display_year = year + progress

                frame = self.create_frame(
                    title=title,
                    unit=unit,
                    year=display_year,
                    bars_data=interpolated,
                    max_value=max_value
                )

                frame_path = os.path.join(output_dir, f"frame_{frame_num:05d}.png")
                frame.save(frame_path)
                frame_paths.append(frame_path)
                frame_num += 1

                if frame_num % 100 == 0:
                    logger.info(f"Generated {frame_num} frames...")

        # Hold the final year on screen for one second.
        final_data = data_by_year.get(year_end, [])
        for _ in range(fps):
            frame = self.create_frame(
                title=title,
                unit=unit,
                year=year_end,
                bars_data=final_data,
                max_value=max_value
            )
            frame_path = os.path.join(output_dir, f"frame_{frame_num:05d}.png")
            frame.save(frame_path)
            frame_paths.append(frame_path)
            frame_num += 1

        logger.info(f"Generated total {len(frame_paths)} frames")
        return frame_paths
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
modules/bar_race/services/brain.py ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Brain - LLM Planner
3
+ Uses Gemini API to understand user topic and generate structured JSON plan.
4
+ """
5
+ import logging
6
+ import json
7
+ import os
8
+ from typing import Dict, Any, Optional
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class Brain:
    """
    LLM Planner for Bar Race video generation.

    Responsibilities:
    - Topic understanding & decomposition
    - Entity type detection (person, country, company)
    - Search strategy generation
    - Data source priority determination
    - Visualization config generation

    When no Gemini client is available (missing key, missing package, or a
    client-construction failure), planning falls back to keyword heuristics
    so the pipeline still produces a usable plan.
    """

    GEMINI_MODEL = "gemma-3-27b-it"

    SYSTEM_PROMPT = """You are an expert data analyst and video planner. Your task is to analyze a user's topic and create a structured JSON plan for generating a bar chart race video.

Given a topic like "Top 10 richest cricketers history" or "GDP by country 2000-2024", you must output a JSON plan with:

1. entity_type: "person", "country", "company", or "general"
2. time_config: start_year, end_year, granularity (year/month)
3. value_intent: what values to track (net worth, GDP, population, etc.)
4. search_strategies: list of search queries to find data
5. source_priority: ["wikipedia_rest", "worldbank_api", "web_scraping"]
6. data_expectation: numeric, sparse, needs_interpolation
7. visualization: type, top_n, smooth
8. video_meta: title for the video

IMPORTANT: Output ONLY valid JSON, no other text."""

    # NOTE: Illustrative example of the expected plan shape. It is not used
    # at runtime — the literal braces make it unsafe for str.format, so do
    # not try to fill the "{topic}" placeholder with .format().
    PLAN_TEMPLATE = """{
  "topic": "{topic}",
  "entity_type": "country",
  "time_config": {
    "start_year": 2000,
    "end_year": 2024,
    "granularity": "year"
  },
  "value_intent": {
    "primary": "GDP",
    "unit": "Trillion USD",
    "alternatives": ["gross domestic product", "economic output"]
  },
  "search_strategies": [
    {
      "intent": "ranking_history",
      "queries": [
        "GDP by country by year wikipedia",
        "world GDP ranking history table"
      ]
    }
  ],
  "source_priority": ["wikipedia_rest", "worldbank_api", "web_scraping"],
  "data_expectation": {
    "numeric": true,
    "sparse": false,
    "needs_interpolation": false
  },
  "visualization": {
    "type": "bar_chart_race",
    "top_n": 10,
    "smooth": true
  },
  "video_meta": {
    "title": "Top 10 Countries by GDP (2000-2024)"
  }
}"""

    def __init__(self, gemini_api_key: Optional[str] = None):
        """Initialize, preferring an explicit key over the GEMINI_API_KEY env var.

        Any failure to build the client (missing package, bad key, network
        config) leaves ``gemini_client`` as None so planning degrades to the
        heuristic fallback instead of crashing construction.
        """
        self.gemini_api_key = gemini_api_key or os.getenv("GEMINI_API_KEY")
        self.gemini_client = None

        if self.gemini_api_key:
            try:
                from google import genai
                self.gemini_client = genai.Client(api_key=self.gemini_api_key)
                logger.info("Brain: Gemini client initialized")
            except ImportError:
                logger.warning("google-genai package not installed")
            except Exception as e:
                # Client construction can fail for reasons beyond a missing
                # package (e.g. malformed key); fall back rather than crash.
                logger.warning(f"Brain: Gemini client init failed: {e}")
        else:
            logger.warning("Brain: No Gemini API key, will use template-based planning")

    def generate_plan(self, topic: str) -> Dict[str, Any]:
        """
        Generate a structured plan from user topic.

        Args:
            topic: User's topic string

        Returns:
            Dict containing the structured plan (Gemini-generated when
            possible, heuristic fallback otherwise).
        """
        logger.info(f"Brain: Generating plan for topic: {topic}")

        if self.gemini_client:
            try:
                plan = self._generate_with_gemini(topic)
                if plan:
                    logger.info("Brain: Plan generated with Gemini")
                    return plan
            except Exception as e:
                logger.warning(f"Brain: Gemini failed, using fallback: {e}")

        # Fallback: Template-based planning
        return self._generate_fallback_plan(topic)

    def _generate_with_gemini(self, topic: str) -> Optional[Dict[str, Any]]:
        """Generate plan using the Gemini API; None if the reply is not JSON."""
        prompt = f"""Analyze this topic and create a JSON plan for a bar chart race video:

Topic: {topic}

{self.SYSTEM_PROMPT}

Output the JSON plan:"""

        response = self.gemini_client.models.generate_content(
            model=self.GEMINI_MODEL,
            contents=prompt
        )

        text = response.text.strip()

        # Models often wrap JSON in markdown fences; strip them before parsing.
        if "```json" in text:
            text = text.split("```json")[1].split("```")[0].strip()
        elif "```" in text:
            text = text.split("```")[1].split("```")[0].strip()

        try:
            return json.loads(text)
        except json.JSONDecodeError as e:
            logger.error(f"Brain: Failed to parse Gemini response as JSON: {e}")
            return None

    def _generate_fallback_plan(self, topic: str) -> Dict[str, Any]:
        """Generate a plan using simple keyword heuristics (no LLM needed)."""
        logger.info("Brain: Using fallback template-based planning")

        topic_lower = topic.lower()

        # Detect entity type from keyword hits (first match wins).
        if any(word in topic_lower for word in ["person", "cricketer", "player", "actor", "singer", "celebrity", "billionaire"]):
            entity_type = "person"
        elif any(word in topic_lower for word in ["country", "nation", "gdp", "population", "military"]):
            entity_type = "country"
        elif any(word in topic_lower for word in ["company", "brand", "corporation", "business"]):
            entity_type = "company"
        else:
            entity_type = "general"

        # Detect the tracked metric and its display unit.
        if "gdp" in topic_lower:
            value_primary = "GDP"
            value_unit = "Trillion USD"
        elif "population" in topic_lower:
            value_primary = "population"
            value_unit = "Million"
        elif "rich" in topic_lower or "wealth" in topic_lower or "net worth" in topic_lower:
            value_primary = "net worth"
            value_unit = "Billion USD"
        elif "subscriber" in topic_lower:
            value_primary = "subscribers"
            value_unit = "Million"
        else:
            value_primary = "value"
            value_unit = ""

        # Generic search queries for the Scout stage.
        search_queries = [
            f"{topic} wikipedia",
            f"{topic} by year table",
            f"{topic} history data"
        ]

        plan = {
            "topic": topic,
            "entity_type": entity_type,
            "time_config": {
                "start_year": 2000,
                "end_year": 2024,
                "granularity": "year"
            },
            "value_intent": {
                "primary": value_primary,
                "unit": value_unit,
                "alternatives": []
            },
            "search_strategies": [
                {
                    "intent": "ranking_history",
                    "queries": search_queries
                }
            ],
            "source_priority": ["wikipedia_rest", "worldbank_api", "web_scraping"],
            "data_expectation": {
                "numeric": True,
                # Heuristic plans assume sparse data that needs interpolation.
                "sparse": True,
                "needs_interpolation": True
            },
            "visualization": {
                "type": "bar_chart_race",
                "top_n": 10,
                "smooth": True
            },
            "video_meta": {
                "title": f"{topic} Evolution"
            }
        }

        logger.info(f"Brain: Generated fallback plan for entity_type={entity_type}")
        return plan
modules/bar_race/services/data_fetcher.py DELETED
@@ -1,134 +0,0 @@
1
- """
2
- Data Fetcher Service
3
- Fetches and normalizes data for bar chart race.
4
- Uses AI-generated realistic data for demo purposes.
5
- """
6
- import logging
7
- from typing import List, Dict, Optional
8
- import random
9
-
10
- logger = logging.getLogger(__name__)
11
-
12
-
13
class DataFetcher:
    """
    Supplies data series for bar chart race topics.

    A small catalog of pre-configured topics produces realistic-looking
    demo data (base value compounded by a per-entity growth rate with a
    little random jitter); unknown topics fall back to generic series.
    """

    # Pre-configured topic catalog: per-entity base values (at the
    # reference year 2000) and annual growth rates.
    COUNTRY_DATA = {
        "gdp_nominal": {
            "entities": ["USA", "China", "Japan", "Germany", "UK", "India", "France", "Italy", "Brazil", "Canada"],
            "base_values": [10.0, 1.2, 4.5, 2.0, 1.5, 0.5, 1.4, 1.2, 0.6, 0.7],
            "growth_rates": [0.03, 0.10, 0.02, 0.02, 0.02, 0.07, 0.02, 0.01, 0.03, 0.02],
        },
        "population": {
            "entities": ["China", "India", "USA", "Indonesia", "Pakistan", "Brazil", "Nigeria", "Bangladesh", "Russia", "Mexico"],
            "base_values": [1200, 1000, 280, 210, 140, 170, 120, 130, 145, 100],
            "growth_rates": [0.005, 0.015, 0.008, 0.012, 0.020, 0.008, 0.025, 0.010, -0.002, 0.012],
        },
        "social_media_users": {
            "entities": ["Facebook", "YouTube", "WhatsApp", "Instagram", "TikTok", "Snapchat", "Twitter", "LinkedIn", "Pinterest", "Reddit"],
            "base_values": [0.1, 0.05, 0.0, 0.0, 0.0, 0.0, 0.01, 0.01, 0.0, 0.01],
            "growth_rates": [0.35, 0.40, 0.50, 0.60, 0.80, 0.30, 0.20, 0.15, 0.25, 0.20],
        },
        "youtube_subscribers": {
            "entities": ["T-Series", "MrBeast", "Cocomelon", "SET India", "PewDiePie", "Kids Diana Show", "Like Nastya", "Vlad and Niki", "Zee Music", "WWE"],
            "base_values": [1, 0.1, 0.1, 0.5, 5, 0.1, 0.1, 0.1, 0.3, 10],
            "growth_rates": [0.40, 0.50, 0.60, 0.30, 0.15, 0.55, 0.55, 0.50, 0.25, 0.05],
        },
        "military_spending": {
            "entities": ["USA", "China", "Russia", "India", "UK", "Saudi Arabia", "Germany", "France", "Japan", "South Korea"],
            "base_values": [300, 20, 50, 15, 35, 20, 30, 35, 40, 15],
            "growth_rates": [0.03, 0.12, 0.05, 0.08, 0.02, 0.08, 0.02, 0.02, 0.01, 0.05],
        },
    }

    def __init__(self):
        pass

    def fetch_data(
        self,
        topic_id: str,
        year_start: int,
        year_end: int,
        top_n: int = 10
    ) -> List[Dict]:
        """
        Fetch data for a topic.

        Returns a flat list of {name, year, value} dicts covering every
        entity for every year in [year_start, year_end].
        """
        config = self.COUNTRY_DATA.get(topic_id)
        if config is None:
            # Unknown topic: fall back to synthetic generic entities.
            return self._generate_generic_data(topic_id, year_start, year_end, top_n)
        return self._generate_realistic_data(config, year_start, year_end, top_n)

    def _generate_realistic_data(
        self,
        topic_data: Dict,
        year_start: int,
        year_end: int,
        top_n: int
    ) -> List[Dict]:
        """Build series from the catalog config: compound growth plus jitter."""
        entities = topic_data["entities"][:top_n]
        bases = topic_data["base_values"][:top_n]
        rates = topic_data["growth_rates"][:top_n]

        reference_year = 2000  # base values are anchored to this year
        records = []

        for year in range(year_start, year_end + 1):
            for entity, base, rate in zip(entities, bases, rates):
                # Compound the base value from the reference year...
                grown = base * ((1 + rate) ** (year - reference_year))
                # ...and add +/-5% random variation for a natural look.
                jittered = grown * (1 + random.uniform(-0.05, 0.05))
                records.append({
                    "name": entity,
                    "year": year,
                    "value": round(jittered, 2)
                })

        return records

    def _generate_generic_data(
        self,
        topic_id: str,
        year_start: int,
        year_end: int,
        top_n: int
    ) -> List[Dict]:
        """Build placeholder series for topics missing from the catalog."""
        logger.warning(f"No pre-configured data for topic: {topic_id}, generating generic data")

        entities = [f"Entity_{i+1}" for i in range(top_n)]

        records = []
        for year in range(year_start, year_end + 1):
            for rank, entity in enumerate(entities):
                # Linear growth from a rank-dependent base, +/-10% jitter.
                base = 100 - rank * 5
                value = base * (1 + 0.05 * (year - year_start))
                value *= (1 + random.uniform(-0.1, 0.1))
                records.append({
                    "name": entity,
                    "year": year,
                    "value": round(value, 2)
                })

        return records

    def get_data_for_year(self, data: List[Dict], year: int, top_n: int = 10) -> List[Dict]:
        """Return the top-N records for *year*, sorted by value descending."""
        matching = [record for record in data if record["year"] == year]
        return sorted(matching, key=lambda record: record["value"], reverse=True)[:top_n]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
modules/bar_race/services/director.py ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Director - Video Generator
3
+ Creates bar chart race animation and final video.
4
+ """
5
+ import logging
6
+ import pandas as pd
7
+ import os
8
+ from typing import Dict, Any, Optional
9
+ import shutil
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ class Director:
15
+ """
16
+ Video Generator for Bar Race.
17
+
18
+ Creates animated bar chart race video using:
19
+ - bar_chart_race library for animation
20
+ - Entity images overlay
21
+ - Background music
22
+ - 9:16 vertical format (1080x1920)
23
+ """
24
+
25
+ # Video dimensions (9:16)
26
+ VIDEO_WIDTH = 1080
27
+ VIDEO_HEIGHT = 1920
28
+ FPS = 30
29
+
30
+ def __init__(self, temp_dir: str, output_dir: str = "videos/bar_race"):
31
+ self.temp_dir = temp_dir
32
+ self.output_dir = output_dir
33
+ os.makedirs(output_dir, exist_ok=True)
34
+
35
+ def generate_video(
36
+ self,
37
+ df: pd.DataFrame,
38
+ plan: Dict[str, Any],
39
+ image_paths: Dict[str, str],
40
+ duration_seconds: int = 60,
41
+ job_id: str = ""
42
+ ) -> Optional[str]:
43
+ """
44
+ Generate bar chart race video.
45
+
46
+ Args:
47
+ df: Cleaned data with columns: name, year, value
48
+ plan: Brain's plan with video_meta
49
+ image_paths: Dict mapping entity name to image path
50
+ duration_seconds: Video duration
51
+ job_id: Job ID for output filename
52
+
53
+ Returns:
54
+ Path to generated video, or None if failed
55
+ """
56
+ logger.info(f"Director: Starting video generation for {duration_seconds}s video")
57
+
58
+ try:
59
+ # Prepare data for bar_chart_race
60
+ df_pivot = self._prepare_data(df)
61
+
62
+ if df_pivot is None or df_pivot.empty:
63
+ logger.error("Director: Failed to prepare data")
64
+ return None
65
+
66
+ # Generate animation
67
+ video_path = self._generate_bar_race(
68
+ df_pivot=df_pivot,
69
+ plan=plan,
70
+ duration_seconds=duration_seconds,
71
+ job_id=job_id
72
+ )
73
+
74
+ if video_path and os.path.exists(video_path):
75
+ # Try to add background music (optional)
76
+ video_with_music = self._add_background_music(video_path, duration_seconds)
77
+ if video_with_music:
78
+ return video_with_music
79
+
80
+ return video_path
81
+
82
+ except Exception as e:
83
+ logger.error(f"Director: Video generation failed: {e}")
84
+ import traceback
85
+ logger.error(traceback.format_exc())
86
+ return None
87
+
88
+ def _add_background_music(self, video_path: str, duration_seconds: int) -> Optional[str]:
89
+ """Add background music if available in assets/music folder"""
90
+ music_dir = "modules/bar_race/assets/music"
91
+
92
+ # Check if music directory exists
93
+ if not os.path.exists(music_dir):
94
+ logger.info("Director: No music folder found, skipping background music")
95
+ return None
96
+
97
+ # Find music files
98
+ music_files = []
99
+ for ext in [".mp3", ".wav", ".m4a", ".ogg"]:
100
+ for f in os.listdir(music_dir):
101
+ if f.lower().endswith(ext):
102
+ music_files.append(os.path.join(music_dir, f))
103
+
104
+ if not music_files:
105
+ logger.info("Director: No music files found, skipping background music")
106
+ return None
107
+
108
+ try:
109
+ from moviepy.editor import VideoFileClip, AudioFileClip
110
+ import random
111
+
112
+ # Pick random music file
113
+ music_path = random.choice(music_files)
114
+ logger.info(f"Director: Adding background music: {music_path}")
115
+
116
+ # Load video and audio
117
+ video = VideoFileClip(video_path)
118
+ audio = AudioFileClip(music_path)
119
+
120
+ # Loop audio if shorter than video
121
+ if audio.duration < video.duration:
122
+ from moviepy.editor import concatenate_audioclips
123
+ loops_needed = int(video.duration / audio.duration) + 1
124
+ audio = concatenate_audioclips([audio] * loops_needed)
125
+
126
+ # Trim audio to video length and lower volume
127
+ audio = audio.subclip(0, video.duration).volumex(0.3)
128
+
129
+ # Add audio to video
130
+ video_with_audio = video.set_audio(audio)
131
+
132
+ # Save with music
133
+ output_path = video_path.replace(".mp4", "_music.mp4")
134
+ video_with_audio.write_videofile(
135
+ output_path,
136
+ codec="libx264",
137
+ audio_codec="aac",
138
+ fps=self.FPS,
139
+ logger=None
140
+ )
141
+
142
+ # Cleanup
143
+ video.close()
144
+ audio.close()
145
+
146
+ # Replace original with music version
147
+ os.remove(video_path)
148
+ os.rename(output_path, video_path)
149
+
150
+ logger.info(f"Director: Added background music to video")
151
+ return video_path
152
+
153
+ except Exception as e:
154
+ logger.warning(f"Director: Failed to add music: {e}")
155
+ return None
156
+
157
+ def _prepare_data(self, df: pd.DataFrame) -> Optional[pd.DataFrame]:
158
+ """Prepare data for bar_chart_race (pivoted format)"""
159
+ try:
160
+ # Pivot: rows=year, columns=entity, values=value
161
+ df_pivot = df.pivot(index="year", columns="name", values="value")
162
+
163
+ # Sort by year
164
+ df_pivot = df_pivot.sort_index()
165
+
166
+ # Fill NaN with 0
167
+ df_pivot = df_pivot.fillna(0)
168
+
169
+ logger.info(f"Director: Prepared pivot table with shape {df_pivot.shape}")
170
+ return df_pivot
171
+
172
+ except Exception as e:
173
+ logger.error(f"Director: Data preparation failed: {e}")
174
+ return None
175
+
176
+ def _generate_bar_race(
177
+ self,
178
+ df_pivot: pd.DataFrame,
179
+ plan: Dict[str, Any],
180
+ duration_seconds: int,
181
+ job_id: str
182
+ ) -> Optional[str]:
183
+ """Generate bar chart race animation"""
184
+
185
+ # Get video metadata
186
+ video_meta = plan.get("video_meta", {})
187
+ title = video_meta.get("title", "Bar Chart Race")
188
+ value_unit = plan.get("value_intent", {}).get("unit", "")
189
+ top_n = plan.get("visualization", {}).get("top_n", 10)
190
+
191
+ output_path = os.path.join(self.output_dir, f"bar_race_{job_id}.mp4")
192
+
193
+ try:
194
+ import bar_chart_race as bcr
195
+
196
+ # Calculate steps per period based on duration
197
+ num_years = len(df_pivot)
198
+ steps_per_period = max(10, (duration_seconds * self.FPS) // num_years)
199
+
200
+ logger.info(f"Director: Creating animation with steps_per_period={steps_per_period}")
201
+
202
+ # Generate bar chart race
203
+ bcr.bar_chart_race(
204
+ df=df_pivot,
205
+ filename=output_path,
206
+ orientation='h',
207
+ sort='desc',
208
+ n_bars=top_n,
209
+ fixed_order=False,
210
+ fixed_max=True,
211
+ steps_per_period=steps_per_period,
212
+ period_length=500,
213
+ interpolate_period=True,
214
+ period_label={'x': .95, 'y': .15, 'ha': 'right', 'size': 72},
215
+ period_fmt='{x:.0f}',
216
+ period_summary_func=None,
217
+ perpendicular_bar_func=None,
218
+ title=title,
219
+ title_size=36,
220
+ bar_size=.85,
221
+ bar_textposition='inside',
222
+ bar_texttemplate='{x:,.0f}',
223
+ bar_label_size=14,
224
+ tick_label_size=14,
225
+ scale='linear',
226
+ writer=None,
227
+ fig=None,
228
+ bar_kwargs={'alpha': .8},
229
+ filter_column_colors=False,
230
+ cmap='dark24',
231
+ dpi=144
232
+ )
233
+
234
+ logger.info(f"Director: Generated video at {output_path}")
235
+ return output_path
236
+
237
+ except ImportError:
238
+ logger.warning("Director: bar_chart_race not available, using fallback")
239
+ return self._generate_fallback_video(df_pivot, plan, duration_seconds, job_id)
240
+ except Exception as e:
241
+ logger.error(f"Director: bar_chart_race failed: {e}")
242
+ return self._generate_fallback_video(df_pivot, plan, duration_seconds, job_id)
243
+
244
+ def _generate_fallback_video(
245
+ self,
246
+ df_pivot: pd.DataFrame,
247
+ plan: Dict[str, Any],
248
+ duration_seconds: int,
249
+ job_id: str
250
+ ) -> Optional[str]:
251
+ """Fallback: Generate simple video using matplotlib and MoviePy"""
252
+ logger.info("Director: Using fallback matplotlib animation")
253
+
254
+ try:
255
+ import matplotlib
256
+ matplotlib.use('Agg')
257
+ import matplotlib.pyplot as plt
258
+ from matplotlib.animation import FuncAnimation
259
+ from moviepy.editor import VideoFileClip
260
+ import tempfile
261
+
262
+ video_meta = plan.get("video_meta", {})
263
+ title = video_meta.get("title", "Bar Chart Race")
264
+ top_n = plan.get("visualization", {}).get("top_n", 10)
265
+
266
+ # Create figure with dark theme
267
+ fig, ax = plt.subplots(figsize=(6, 10.67), facecolor='#121220')
268
+ ax.set_facecolor('#121220')
269
+
270
+ years = df_pivot.index.tolist()
271
+ num_frames = duration_seconds * self.FPS
272
+ frames_per_year = num_frames // len(years)
273
+
274
+ # Colors for bars
275
+ colors = plt.cm.viridis([i/top_n for i in range(top_n)])
276
+
277
+ def update(frame):
278
+ ax.clear()
279
+ ax.set_facecolor('#121220')
280
+
281
+ # Calculate current year and interpolation
282
+ year_idx = min(frame // frames_per_year, len(years) - 1)
283
+ year = years[year_idx]
284
+
285
+ # Get data for current year
286
+ data = df_pivot.loc[year].sort_values(ascending=True).tail(top_n)
287
+
288
+ # Draw horizontal bars
289
+ bars = ax.barh(range(len(data)), data.values, color=colors[:len(data)])
290
+
291
+ # Labels
292
+ ax.set_yticks(range(len(data)))
293
+ ax.set_yticklabels(data.index, fontsize=10, color='white')
294
+ ax.set_title(f"{title}\n{year}", fontsize=16, color='white', pad=20)
295
+
296
+ # Style
297
+ ax.spines['top'].set_visible(False)
298
+ ax.spines['right'].set_visible(False)
299
+ ax.spines['bottom'].set_color('#444')
300
+ ax.spines['left'].set_color('#444')
301
+ ax.tick_params(colors='#888')
302
+
303
+ plt.tight_layout()
304
+
305
+ # Create animation
306
+ anim = FuncAnimation(fig, update, frames=num_frames, interval=1000/self.FPS)
307
+
308
+ # Save to temp file
309
+ temp_path = os.path.join(self.temp_dir, f"temp_animation_{job_id}.mp4")
310
+ anim.save(temp_path, writer='ffmpeg', fps=self.FPS, dpi=100)
311
+ plt.close(fig)
312
+
313
+ # Move to output
314
+ output_path = os.path.join(self.output_dir, f"bar_race_{job_id}.mp4")
315
+ shutil.move(temp_path, output_path)
316
+
317
+ logger.info(f"Director: Generated fallback video at {output_path}")
318
+ return output_path
319
+
320
+ except Exception as e:
321
+ logger.error(f"Director: Fallback video generation failed: {e}")
322
+ import traceback
323
+ logger.error(traceback.format_exc())
324
+ return None
325
+
326
+ def cleanup(self):
327
+ """Clean up temporary files"""
328
+ try:
329
+ if os.path.exists(self.temp_dir):
330
+ shutil.rmtree(self.temp_dir)
331
+ logger.info(f"Director: Cleaned up temp directory: {self.temp_dir}")
332
+ except Exception as e:
333
+ logger.warning(f"Director: Cleanup failed: {e}")
modules/bar_race/services/scout.py ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Scout - Data Fetcher
3
+ Collects data from multiple sources based on Brain's plan.
4
+
5
+ Priority:
6
+ 1. Wikipedia REST API
7
+ 2. World Bank API
8
+ 3. DuckDuckGo + BeautifulSoup scraping
9
+ """
10
+ import logging
11
+ import requests
12
+ import pandas as pd
13
+ from typing import Dict, Any, List, Optional
14
+ from bs4 import BeautifulSoup
15
+ import re
16
+ import os
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class Scout:
    """
    Data Fetcher for Bar Race video generation.

    Tries structured APIs first (Wikipedia tables, World Bank), then
    falls back to DuckDuckGo search + HTML table scraping.
    """

    # API endpoints
    WIKIPEDIA_API = "https://en.wikipedia.org/api/rest_v1"
    WORLDBANK_API = "https://api.worldbank.org/v2"

    # Browser-like User-Agent to avoid trivial bot blocks.
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }

    def __init__(self, temp_dir: str):
        """Create the Scout and ensure its scratch directory exists."""
        self.temp_dir = temp_dir
        os.makedirs(temp_dir, exist_ok=True)

    def fetch_data(self, plan: Dict[str, Any]) -> Optional[pd.DataFrame]:
        """
        Fetch data based on Brain's plan.

        Tries sources in the plan's priority order:
        1. Wikipedia REST API
        2. World Bank API
        3. Web scraping

        The first source returning a non-empty frame wins; the raw result
        is also persisted to temp_dir/raw_data.csv for debugging.

        Returns:
            DataFrame with raw data, or None if all sources fail.
        """
        source_priority = plan.get("source_priority", ["wikipedia_rest", "worldbank_api", "web_scraping"])

        df = None

        for source in source_priority:
            logger.info(f"Scout: Trying source: {source}")

            try:
                if source == "wikipedia_rest":
                    df = self._fetch_wikipedia(plan)
                elif source == "worldbank_api":
                    df = self._fetch_worldbank(plan)
                elif source == "web_scraping":
                    df = self._fetch_scraping(plan)

                if df is not None and not df.empty:
                    logger.info(f"Scout: Success with {source}, got {len(df)} rows")
                    break

            except Exception as e:
                # A failing source must not stop the fallback chain.
                logger.warning(f"Scout: {source} failed: {e}")
                continue

        if df is not None and not df.empty:
            # Save raw data for inspection / reproducibility.
            raw_path = os.path.join(self.temp_dir, "raw_data.csv")
            df.to_csv(raw_path, index=False)
            logger.info(f"Scout: Saved raw data to {raw_path}")
            return df

        logger.error("Scout: All sources failed")
        return None

    def _fetch_wikipedia(self, plan: Dict[str, Any]) -> Optional[pd.DataFrame]:
        """Fetch the first promising HTML table from a Wikipedia article.

        Searches via the opensearch API, then extracts tables from the
        top hit with pandas.read_html; prefers a table containing a
        year-like column, else returns the largest table.
        """
        queries = []
        for strategy in plan.get("search_strategies", []):
            queries.extend(strategy.get("queries", []))

        if not queries:
            queries = [plan.get("topic", "")]

        search_url = "https://en.wikipedia.org/w/api.php"

        for query in queries:
            try:
                params = {
                    "action": "opensearch",
                    # Users often append "wikipedia" to the query; strip it
                    # so the search term matches article titles.
                    "search": query.replace(" wikipedia", ""),
                    "limit": 5,
                    "format": "json"
                }

                response = requests.get(search_url, params=params, headers=self.HEADERS, timeout=10)
                if response.status_code == 200:
                    results = response.json()
                    # opensearch result shape: [query, titles, descs, urls]
                    if len(results) >= 4 and results[3]:
                        page_url = results[3][0]

                        tables = pd.read_html(page_url)
                        if tables:
                            # Prefer a table with an identifiable year column.
                            for table in tables:
                                if self._has_year_column(table):
                                    logger.info(f"Scout: Found table with year data from Wikipedia")
                                    return table

                            # Otherwise the largest table is the best guess.
                            largest = max(tables, key=lambda t: len(t))
                            return largest

            except Exception as e:
                logger.debug(f"Scout: Wikipedia query '{query}' failed: {e}")
                continue

        return None

    def _fetch_worldbank(self, plan: Dict[str, Any]) -> Optional[pd.DataFrame]:
        """Fetch indicator data from the World Bank API.

        Maps the plan's primary value intent onto a known indicator code;
        returns None when no indicator matches or the API yields nothing.
        """
        value_intent = plan.get("value_intent", {})
        primary_value = value_intent.get("primary", "").lower()
        time_config = plan.get("time_config", {})

        # Map common intents to World Bank indicator codes.
        indicator_map = {
            "gdp": "NY.GDP.MKTP.CD",              # GDP (current USD)
            "population": "SP.POP.TOTL",          # Total population
            "life expectancy": "SP.DYN.LE00.IN",  # Life expectancy at birth
            "inflation": "FP.CPI.TOTL.ZG",        # Inflation (consumer prices)
            "military": "MS.MIL.XPND.CD",         # Military expenditure
        }

        indicator = None
        for key, value in indicator_map.items():
            if key in primary_value:
                indicator = value
                break

        if not indicator:
            logger.debug("Scout: No matching World Bank indicator found")
            return None

        try:
            start_year = time_config.get("start_year", 2000)
            end_year = time_config.get("end_year", 2024)

            url = f"{self.WORLDBANK_API}/country/all/indicator/{indicator}"
            params = {
                "format": "json",
                "per_page": 500,
                "date": f"{start_year}:{end_year}"
            }

            response = requests.get(url, params=params, headers=self.HEADERS, timeout=15)
            if response.status_code == 200:
                data = response.json()
                # World Bank responses are [metadata, records].
                if len(data) >= 2 and data[1]:
                    records = data[1]

                    rows = []
                    for record in records:
                        if record.get("value") is not None:
                            rows.append({
                                "name": record["country"]["value"],
                                "year": int(record["date"]),
                                "value": record["value"]
                            })

                    if rows:
                        df = pd.DataFrame(rows)
                        logger.info(f"Scout: Got {len(df)} rows from World Bank API")
                        return df

        except Exception as e:
            logger.warning(f"Scout: World Bank API failed: {e}")

        return None

    def _fetch_scraping(self, plan: Dict[str, Any]) -> Optional[pd.DataFrame]:
        """Last resort: DuckDuckGo search + scrape result pages for tables."""
        queries = []
        for strategy in plan.get("search_strategies", []):
            queries.extend(strategy.get("queries", []))

        if not queries:
            queries = [f"{plan.get('topic', '')} data table"]

        try:
            # Imported lazily: the package is optional for this fallback only.
            from duckduckgo_search import DDGS

            with DDGS() as ddgs:
                for query in queries[:3]:  # cap queries to limit latency
                    results = list(ddgs.text(query, max_results=5))

                    for result in results:
                        url = result.get("href", "")
                        if not url:
                            continue

                        try:
                            response = requests.get(url, headers=self.HEADERS, timeout=10)
                            if response.status_code == 200:
                                tables = pd.read_html(response.text)
                                if tables:
                                    for table in tables:
                                        if self._has_year_column(table):
                                            logger.info(f"Scout: Found table from {url}")
                                            return table

                                    # No year column: accept the largest table
                                    # if it has enough rows to be meaningful.
                                    largest = max(tables, key=lambda t: len(t))
                                    if len(largest) > 5:
                                        return largest

                        except Exception as e:
                            logger.debug(f"Scout: Failed to scrape {url}: {e}")
                            continue

        except ImportError:
            logger.warning("Scout: duckduckgo-search not installed")
        except Exception as e:
            logger.warning(f"Scout: DuckDuckGo search failed: {e}")

        return None

    def _has_year_column(self, df: pd.DataFrame) -> bool:
        """Return True if any column looks like it holds years.

        A column qualifies either by name ("year"/"date"/"time") or by
        containing values in the 1900–2100 range (numeric or 4-digit text).
        """
        for col in df.columns:
            col_str = str(col).lower()
            if any(word in col_str for word in ["year", "date", "time"]):
                return True
            # Check whether a small sample of values looks year-like.
            try:
                sample = df[col].dropna().head(5)
                for val in sample:
                    if isinstance(val, (int, float)):
                        if 1900 <= val <= 2100:
                            return True
                    elif isinstance(val, str):
                        if re.match(r'^(19|20)\d{2}$', str(val)):
                            return True
            except Exception:
                # Odd dtypes (e.g. MultiIndex cells) — just skip the column.
                pass
        return False
modules/bar_race/services/surgeon.py ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Surgeon - Data Cleaner & Formatter
3
+ Cleans raw data and prepares it for bar chart race animation.
4
+ """
5
+ import logging
6
+ import pandas as pd
7
+ import numpy as np
8
+ import re
9
+ import os
10
+ from typing import Dict, Any, Optional, List
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class Surgeon:
    """
    Data Cleaner for Bar Race video generation.

    Responsibilities:
    - Table selection (find year columns)
    - Wide → Long / Long → Wide conversion
    - Regex cleaning ($, €, commas, references)
    - Convert strings to numeric
    - Interpolate missing years
    """

    def __init__(self, temp_dir: str):
        # Scratch directory where the cleaned CSV is written.
        self.temp_dir = temp_dir

    def clean_data(self, df: pd.DataFrame, plan: Dict[str, Any]) -> Optional[pd.DataFrame]:
        """
        Clean and format raw data for bar chart race.

        Expected output format:
            name | year | value
            USA  | 2000 | 10.5
            USA  | 2001 | 11.2
            ...

        Args:
            df: Raw table as fetched by Scout (wide, long, or unknown shape).
            plan: Brain's plan (time_config and visualization.top_n are read).

        Returns:
            Cleaned DataFrame with columns: name, year, value — or None.
        """
        if df is None or df.empty:
            logger.error("Surgeon: No data to clean")
            return None

        logger.info(f"Surgeon: Cleaning data with shape {df.shape}")
        logger.debug(f"Surgeon: Columns: {list(df.columns)}")

        try:
            # Step 1: Identify data structure (wide / long / unknown).
            structure = self._identify_structure(df)
            logger.info(f"Surgeon: Data structure: {structure}")

            # Step 2: Convert to long format.
            if structure == "wide":
                df_long = self._wide_to_long(df, plan)
            elif structure == "long":
                df_long = self._normalize_long(df, plan)
            else:
                df_long = self._attempt_conversion(df, plan)

            if df_long is None or df_long.empty:
                logger.error("Surgeon: Failed to convert data to long format")
                return None

            # Step 3: Strip symbols/references and coerce to numeric.
            df_clean = self._clean_values(df_long)

            # Step 4: Fill gaps so every entity has every year.
            time_config = plan.get("time_config", {})
            df_interpolated = self._interpolate_years(
                df_clean,
                start_year=time_config.get("start_year", 2000),
                end_year=time_config.get("end_year", 2024)
            )

            # Step 5: Keep only the entities that ever reach the top N.
            top_n = plan.get("visualization", {}).get("top_n", 10)
            df_final = self._get_top_entities(df_interpolated, top_n)

            # Persist for the Director stage / debugging.
            output_path = os.path.join(self.temp_dir, "bar_chart_ready.csv")
            df_final.to_csv(output_path, index=False)
            logger.info(f"Surgeon: Saved cleaned data to {output_path}, shape: {df_final.shape}")

            return df_final

        except Exception as e:
            logger.error(f"Surgeon: Cleaning failed: {e}")
            import traceback
            logger.error(traceback.format_exc())
            return None

    def _identify_structure(self, df: pd.DataFrame) -> str:
        """Classify the table as "wide", "long", or "unknown".

        Wide = years appear as column headers; long = a dedicated
        year/date column exists.
        """
        year_columns = []
        for col in df.columns:
            try:
                year = int(str(col))
                if 1900 <= year <= 2100:
                    year_columns.append(col)
            except (ValueError, TypeError):
                # Non-numeric header — not a year column.
                pass

        # More than 3 year headers is strong evidence of wide format.
        if len(year_columns) > 3:
            return "wide"

        for col in df.columns:
            col_lower = str(col).lower()
            if "year" in col_lower or "date" in col_lower:
                return "long"

        return "unknown"

    def _wide_to_long(self, df: pd.DataFrame, plan: Dict[str, Any]) -> Optional[pd.DataFrame]:
        """Melt a wide table (years as columns) into name/year/value rows."""
        year_columns = []
        non_year_columns = []

        for col in df.columns:
            try:
                year = int(str(col))
                if 1900 <= year <= 2100:
                    year_columns.append(col)
                else:
                    non_year_columns.append(col)
            except (ValueError, TypeError):
                non_year_columns.append(col)

        if not year_columns:
            return None

        # The entity-name column is the first non-year column holding text.
        name_col = None
        for col in non_year_columns:
            if df[col].dtype == object:
                name_col = col
                break

        if name_col is None and non_year_columns:
            name_col = non_year_columns[0]

        if name_col is None:
            return None

        df_long = df.melt(
            id_vars=[name_col],
            value_vars=year_columns,
            var_name="year",
            value_name="value"
        )

        df_long.columns = ["name", "year", "value"]

        return df_long

    def _normalize_long(self, df: pd.DataFrame, plan: Dict[str, Any]) -> Optional[pd.DataFrame]:
        """Map an already-long table onto the canonical name/year/value columns."""
        name_col = None
        year_col = None
        value_col = None

        # First pass: match columns by name keywords.
        for col in df.columns:
            col_lower = str(col).lower()

            if name_col is None and any(word in col_lower for word in ["name", "country", "entity", "player"]):
                name_col = col
            elif year_col is None and any(word in col_lower for word in ["year", "date", "time"]):
                year_col = col
            elif value_col is None and any(word in col_lower for word in ["value", "amount", "gdp", "population", "worth"]):
                value_col = col

        # Fallbacks: first string column as name, last numeric as value.
        if name_col is None:
            for col in df.columns:
                if df[col].dtype == object:
                    name_col = col
                    break

        if value_col is None:
            for col in reversed(list(df.columns)):
                if col != year_col and pd.api.types.is_numeric_dtype(df[col]):
                    value_col = col
                    break

        # Explicit None checks: a column literally labelled 0 or "" is a
        # valid match but falsy, which the old all([...]) test rejected.
        if name_col is None or year_col is None or value_col is None:
            logger.warning(f"Surgeon: Could not identify columns. name={name_col}, year={year_col}, value={value_col}")
            return None

        df_long = df[[name_col, year_col, value_col]].copy()
        df_long.columns = ["name", "year", "value"]

        return df_long

    def _attempt_conversion(self, df: pd.DataFrame, plan: Dict[str, Any]) -> Optional[pd.DataFrame]:
        """Best-effort conversion when the structure is not recognized."""
        if len(df.columns) >= 2:
            # Do any of the trailing headers parse as years?
            potential_years = []
            for col in df.columns[1:]:
                try:
                    year = int(str(col))
                    if 1900 <= year <= 2100:
                        potential_years.append(col)
                except (ValueError, TypeError):
                    pass

            if potential_years:
                return self._wide_to_long(df, plan)

        return None

    def _clean_values(self, df: pd.DataFrame) -> pd.DataFrame:
        """Strip symbols/references, apply magnitude words, coerce numerics.

        Drops rows where name, year or value could not be recovered.
        """
        df = df.copy()

        # Clean name column: trim and drop citation markers like [1].
        df["name"] = df["name"].astype(str).str.strip()
        df["name"] = df["name"].str.replace(r'\[.*?\]', '', regex=True)

        df["year"] = pd.to_numeric(df["year"], errors="coerce")

        def clean_value(val):
            """Turn a messy cell ('$1.5 billion[2]') into a float, or NaN."""
            if pd.isna(val):
                return np.nan
            if isinstance(val, (int, float)):
                return float(val)

            val_str = str(val)

            # Remove currency symbols and thousands separators.
            val_str = re.sub(r'[$€£¥₹,]', '', val_str)

            # Remove citation markers like [1], [a].
            val_str = re.sub(r'\[.*?\]', '', val_str)

            # Apply magnitude words as multipliers.
            multiplier = 1
            val_lower = val_str.lower()
            if "trillion" in val_lower:
                multiplier = 1e12
                val_str = re.sub(r'trillion', '', val_str, flags=re.IGNORECASE)
            elif "billion" in val_lower:
                multiplier = 1e9
                val_str = re.sub(r'billion', '', val_str, flags=re.IGNORECASE)
            elif "million" in val_lower:
                multiplier = 1e6
                val_str = re.sub(r'million', '', val_str, flags=re.IGNORECASE)

            # Extract the first numeric token that remains.
            match = re.search(r'[-+]?\d*\.?\d+', val_str)
            if match:
                return float(match.group()) * multiplier

            return np.nan

        df["value"] = df["value"].apply(clean_value)

        df = df.dropna(subset=["name", "year", "value"])

        df["year"] = df["year"].astype(int)

        return df

    def _interpolate_years(self, df: pd.DataFrame, start_year: int, end_year: int) -> pd.DataFrame:
        """Give every entity one row per year in [start_year, end_year].

        Gaps are linearly interpolated; leading/trailing gaps are filled
        with the nearest known value.
        """
        entities = df["name"].unique()
        all_years = list(range(start_year, end_year + 1))

        result_dfs = []

        for entity in entities:
            entity_df = df[df["name"] == entity].copy()

            if entity_df.empty:
                continue

            # Build the complete year range for this entity.
            full_df = pd.DataFrame({"year": all_years})
            full_df["name"] = entity

            merged = full_df.merge(entity_df[["year", "value"]], on="year", how="left")

            merged["value"] = merged["value"].interpolate(method="linear")

            # .ffill()/.bfill() replace fillna(method=...), which is
            # deprecated and removed in recent pandas releases.
            merged["value"] = merged["value"].ffill().bfill()

            result_dfs.append(merged)

        if result_dfs:
            return pd.concat(result_dfs, ignore_index=True)
        return df

    def _get_top_entities(self, df: pd.DataFrame, top_n: int = 10) -> pd.DataFrame:
        """Keep only the top_n entities ranked by their all-time peak value."""
        max_values = df.groupby("name")["value"].max().sort_values(ascending=False)

        top_entities = max_values.head(top_n).index.tolist()

        df_top = df[df["name"].isin(top_entities)]

        logger.info(f"Surgeon: Selected top {len(top_entities)} entities: {top_entities}")

        return df_top
requirements.txt CHANGED
@@ -29,3 +29,10 @@ imageio-ffmpeg>=0.4.9
29
  # Trends Analysis
30
  pytrends
31
  pandas
 
 
 
 
 
 
 
 
29
  # Trends Analysis
30
  pytrends
31
  pandas
32
+
33
+ # Bar Race Module
34
+ bar_chart_race
35
+ beautifulsoup4
36
+ lxml
37
+ duckduckgo-search
38
+ rembg
static/index.html CHANGED
@@ -656,50 +656,22 @@
656
 
657
  <form id="barRaceForm">
658
  <div class="form-group">
659
- <label>Topic *</label>
660
- <select id="barRaceTopic" required>
661
- <option value="gdp_nominal">GDP (Nominal) - Richest Countries</option>
662
- <option value="population">Population - Most Populated Countries</option>
663
- <option value="gdp_per_capita">GDP Per Capita</option>
664
- <option value="social_media_users">Social Media Users</option>
665
- <option value="youtube_subscribers">YouTube Subscribers</option>
666
- <option value="military_spending">Military Expenditure</option>
667
- <option value="olympic_medals">Olympic Gold Medals</option>
668
- <option value="life_expectancy">Life Expectancy</option>
669
- <option value="browser_market_share">Browser Market Share</option>
670
- </select>
671
- </div>
672
-
673
- <div class="form-row" style="display: grid; grid-template-columns: 1fr 1fr; gap: 1rem;">
674
- <div class="form-group">
675
- <label>Start Year</label>
676
- <input type="number" id="barRaceYearStart" value="2000" min="1960" max="2024">
677
- </div>
678
- <div class="form-group">
679
- <label>End Year</label>
680
- <input type="number" id="barRaceYearEnd" value="2024" min="1960" max="2024">
681
- </div>
682
  </div>
683
 
684
- <div class="form-row" style="display: grid; grid-template-columns: 1fr 1fr; gap: 1rem;">
685
- <div class="form-group">
686
- <label>Top N (Bars)</label>
687
- <select id="barRaceTopN">
688
- <option value="5">5</option>
689
- <option value="8">8</option>
690
- <option value="10" selected>10</option>
691
- <option value="15">15</option>
692
- </select>
693
- </div>
694
- <div class="form-group">
695
- <label>Duration (seconds)</label>
696
- <select id="barRaceDuration">
697
- <option value="30">30s</option>
698
- <option value="60" selected>60s</option>
699
- <option value="90">90s</option>
700
- <option value="120">120s</option>
701
- </select>
702
- </div>
703
  </div>
704
 
705
  <button type="submit" class="btn btn-primary" style="width: 100%;">📊 Generate Bar Race Video</button>
@@ -1241,9 +1213,6 @@
1241
  status.innerHTML = '⏳ Starting bar race generation...';
1242
 
1243
  const topic = document.getElementById('barRaceTopic').value;
1244
- const yearStart = parseInt(document.getElementById('barRaceYearStart').value);
1245
- const yearEnd = parseInt(document.getElementById('barRaceYearEnd').value);
1246
- const topN = parseInt(document.getElementById('barRaceTopN').value);
1247
  const duration = parseInt(document.getElementById('barRaceDuration').value);
1248
 
1249
  try {
@@ -1252,18 +1221,14 @@
1252
  headers: { 'Content-Type': 'application/json' },
1253
  body: JSON.stringify({
1254
  topic: topic,
1255
- year_start: yearStart,
1256
- year_end: yearEnd,
1257
- top_n: topN,
1258
- duration_seconds: duration,
1259
- fps: 30
1260
  })
1261
  });
1262
 
1263
  const data = await response.json();
1264
  if (!response.ok) throw new Error(data.detail || 'Failed to start');
1265
 
1266
- status.innerHTML = `⏳ Job started: ${data.job_id}. Generating frames...`;
1267
  pollBarRaceStatus(data.job_id);
1268
 
1269
  } catch (err) {
@@ -1286,7 +1251,8 @@
1286
  status.className = 'status error';
1287
  status.innerHTML = '❌ Failed: ' + (data.error || 'Unknown error');
1288
  } else {
1289
- status.innerHTML = `⏳ ${data.status}... ${data.progress}%`;
 
1290
  setTimeout(poll, 2000);
1291
  }
1292
  } catch (err) {
 
656
 
657
  <form id="barRaceForm">
658
  <div class="form-group">
659
+ <label>Topic / Prompt *</label>
660
+ <input type="text" id="barRaceTopic" placeholder="e.g., Top 10 richest countries by GDP 2000-2024"
661
+ required>
662
+ <small style="color: var(--text-secondary); display: block; margin-top: 0.5rem;">
663
+ Enter any topic - the AI will find data and create the video
664
+ </small>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
665
  </div>
666
 
667
+ <div class="form-group">
668
+ <label>Duration</label>
669
+ <select id="barRaceDuration">
670
+ <option value="30">30 seconds</option>
671
+ <option value="60" selected>60 seconds</option>
672
+ <option value="90">90 seconds</option>
673
+ <option value="120">120 seconds</option>
674
+ </select>
 
 
 
 
 
 
 
 
 
 
 
675
  </div>
676
 
677
  <button type="submit" class="btn btn-primary" style="width: 100%;">📊 Generate Bar Race Video</button>
 
1213
  status.innerHTML = '⏳ Starting bar race generation...';
1214
 
1215
  const topic = document.getElementById('barRaceTopic').value;
 
 
 
1216
  const duration = parseInt(document.getElementById('barRaceDuration').value);
1217
 
1218
  try {
 
1221
  headers: { 'Content-Type': 'application/json' },
1222
  body: JSON.stringify({
1223
  topic: topic,
1224
+ duration_seconds: duration
 
 
 
 
1225
  })
1226
  });
1227
 
1228
  const data = await response.json();
1229
  if (!response.ok) throw new Error(data.detail || 'Failed to start');
1230
 
1231
+ status.innerHTML = `⏳ Job started: ${data.job_id}. Analyzing topic...`;
1232
  pollBarRaceStatus(data.job_id);
1233
 
1234
  } catch (err) {
 
1251
  status.className = 'status error';
1252
  status.innerHTML = '❌ Failed: ' + (data.error || 'Unknown error');
1253
  } else {
1254
+ const step = data.current_step || data.status;
1255
+ status.innerHTML = `⏳ ${step} (${data.progress}%)`;
1256
  setTimeout(poll, 2000);
1257
  }
1258
  } catch (err) {