ismdrobiul489 committed on
Commit
4c8e696
·
1 Parent(s): e224b41

Add complete Bar Race module with Brain, Scout, Surgeon, Artist, Director architecture

Browse files
modules/bar_race/__init__.py CHANGED
@@ -1,7 +1,15 @@
1
  """
2
  Bar Race Module
3
- Creates animated bar chart race videos.
4
- 100% self-contained - no dependency on other modules.
 
 
 
 
 
 
 
 
5
  """
6
  import logging
7
  from fastapi import FastAPI
@@ -15,6 +23,7 @@ MODULE_DESCRIPTION = "Bar Chart Race Video Generator"
15
 
16
  _app = None
17
 
 
18
  def register(app: FastAPI, config=None):
19
  """Register Bar Race module routes"""
20
  global _app
 
1
  """
2
  Bar Race Module
3
+ Intelligent Bar Chart Race Video Generator.
4
+
5
+ Architecture:
6
+ - Brain: LLM Planner (Gemini)
7
+ - Scout: Data Fetcher (APIs + Scraping)
8
+ - Surgeon: Data Cleaner
9
+ - Artist: Image Processor
10
+ - Director: Video Generator
11
+
12
+ 100% standalone - no dependency on other modules.
13
  """
14
  import logging
15
  from fastapi import FastAPI
 
23
 
24
  _app = None
25
 
26
+
27
  def register(app: FastAPI, config=None):
28
  """Register Bar Race module routes"""
29
  global _app
modules/bar_race/assets/fonts/.gitkeep ADDED
@@ -0,0 +1 @@
 
 
1
+ # Custom fonts for video rendering
modules/bar_race/assets/images/.gitkeep ADDED
@@ -0,0 +1 @@
 
 
1
+ # Entity images will be downloaded here during video generation
modules/bar_race/assets/music/.gitkeep ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Optional background music files
2
+ # Supported formats: .mp3, .wav, .m4a, .ogg
3
+ # Music will be automatically added if files exist here
modules/bar_race/data/__init__.py DELETED
@@ -1 +0,0 @@
1
- # Data directory init
 
 
modules/bar_race/data/topic_registry.py DELETED
@@ -1,154 +0,0 @@
1
- """
2
- Topic Registry
3
- 50+ pre-configured topics for bar chart race videos.
4
- Each topic has: title, unit, category, data source info.
5
- """
6
-
7
- TOPICS = {
8
- # =============================================
9
- # ECONOMICS & FINANCE (সবচেয়ে জনপ্রিয়)
10
- # =============================================
11
- "gdp_nominal": {
12
- "id": "gdp_nominal",
13
- "title": "Richest Countries by GDP (Nominal)",
14
- "unit": "Trillion USD",
15
- "category": "economics",
16
- "year_range": (1960, 2024),
17
- "top_n": 10,
18
- "description": "বিশ্বের সবচেয়ে ধনী দেশগুলোর তালিকা",
19
- "bar_color": "#4CAF50", # Green
20
- },
21
- "gdp_ppp": {
22
- "id": "gdp_ppp",
23
- "title": "Countries by GDP (PPP)",
24
- "unit": "Trillion USD",
25
- "category": "economics",
26
- "year_range": (1990, 2024),
27
- "top_n": 10,
28
- "description": "ক্রয়ক্ষমতার ভিত্তিতে ধনী দেশ",
29
- "bar_color": "#2196F3", # Blue
30
- },
31
- "gdp_per_capita": {
32
- "id": "gdp_per_capita",
33
- "title": "Richest Countries by GDP Per Capita",
34
- "unit": "USD",
35
- "category": "economics",
36
- "year_range": (1960, 2024),
37
- "top_n": 10,
38
- "description": "মাথাপিছু আয়ে সবচেয়ে ধনী দেশ",
39
- "bar_color": "#FF9800", # Orange
40
- },
41
-
42
- # =============================================
43
- # DEMOGRAPHICS & SOCIETY
44
- # =============================================
45
- "population": {
46
- "id": "population",
47
- "title": "Most Populated Countries",
48
- "unit": "Million",
49
- "category": "demographics",
50
- "year_range": (1960, 2024),
51
- "top_n": 10,
52
- "description": "বিশ্বের সবচেয়ে জনবহুল দেশ",
53
- "bar_color": "#9C27B0", # Purple
54
- },
55
- "life_expectancy": {
56
- "id": "life_expectancy",
57
- "title": "Countries by Life Expectancy",
58
- "unit": "Years",
59
- "category": "demographics",
60
- "year_range": (1960, 2024),
61
- "top_n": 10,
62
- "description": "কোন দেশের মানুষ গড়ে কতদিন বাঁচে",
63
- "bar_color": "#E91E63", # Pink
64
- },
65
-
66
- # =============================================
67
- # TECH & DIGITAL
68
- # =============================================
69
- "social_media_users": {
70
- "id": "social_media_users",
71
- "title": "Social Media Platforms by Users",
72
- "unit": "Billion Users",
73
- "category": "tech",
74
- "year_range": (2004, 2024),
75
- "top_n": 10,
76
- "description": "ফেসবুক, ইউটিউব, টিকটক ইউজার সংখ্যা",
77
- "bar_color": "#00BCD4", # Cyan
78
- },
79
- "browser_market_share": {
80
- "id": "browser_market_share",
81
- "title": "Browser Market Share",
82
- "unit": "% Share",
83
- "category": "tech",
84
- "year_range": (2008, 2024),
85
- "top_n": 8,
86
- "description": "ক্রোম, ফায়ারফক্স, এজ মার্কেট শেয়ার",
87
- "bar_color": "#3F51B5", # Indigo
88
- },
89
-
90
- # =============================================
91
- # ENTERTAINMENT
92
- # =============================================
93
- "youtube_subscribers": {
94
- "id": "youtube_subscribers",
95
- "title": "Most Subscribed YouTube Channels",
96
- "unit": "Million Subscribers",
97
- "category": "entertainment",
98
- "year_range": (2010, 2024),
99
- "top_n": 10,
100
- "description": "টি-সিরিজ বনাম মিস্টার বিস্ট বনাম পিউডিপাই",
101
- "bar_color": "#F44336", # Red (YouTube)
102
- },
103
-
104
- # =============================================
105
- # SPORTS
106
- # =============================================
107
- "olympic_medals": {
108
- "id": "olympic_medals",
109
- "title": "Countries by Olympic Gold Medals",
110
- "unit": "Gold Medals",
111
- "category": "sports",
112
- "year_range": (1896, 2024),
113
- "top_n": 10,
114
- "description": "অলিম্পিকে কোন দেশ কত স্বর্ণ জিতেছে",
115
- "bar_color": "#FFD700", # Gold
116
- },
117
-
118
- # =============================================
119
- # GEOPOLITICS
120
- # =============================================
121
- "military_spending": {
122
- "id": "military_spending",
123
- "title": "Countries by Military Expenditure",
124
- "unit": "Billion USD",
125
- "category": "geopolitics",
126
- "year_range": (1990, 2024),
127
- "top_n": 10,
128
- "description": "কোন দেশ সেনাবাহিনীতে কত খরচ করে",
129
- "bar_color": "#795548", # Brown
130
- },
131
- }
132
-
133
-
134
def get_topic(topic_id: str) -> "dict | None":
    """Look up a topic configuration by its registry ID.

    Args:
        topic_id: Registry key such as "gdp_nominal". Matching is
            case-insensitive (registry keys are lowercase).

    Returns:
        The topic's configuration dict, or None when the ID is unknown
        (the previous ``-> dict`` annotation hid this None case).
    """
    return TOPICS.get(topic_id.lower())
137
-
138
-
139
def list_topics() -> list:
    """Return a lightweight summary (id/title/category/description) per topic."""
    summaries = []
    for topic in TOPICS.values():
        summary = {}
        for key in ("id", "title", "category", "description"):
            summary[key] = topic[key]
        summaries.append(summary)
    return summaries
150
-
151
-
152
def get_topics_by_category(category: str) -> list:
    """Return every topic configuration whose "category" field equals *category*."""
    matching = []
    for topic in TOPICS.values():
        if topic["category"] == category:
            matching.append(topic)
    return matching
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
modules/bar_race/router.py CHANGED
@@ -5,16 +5,13 @@ API endpoints for bar chart race video generation.
5
  import logging
6
  import os
7
  import uuid
 
8
  import traceback
9
  from typing import Dict
10
  from fastapi import APIRouter, BackgroundTasks, HTTPException
11
- from fastapi.responses import FileResponse
12
 
13
- from .schemas import BarRaceRequest, JobResponse, JobStatus, TopicInfo
14
- from .data.topic_registry import TOPICS, get_topic, list_topics
15
- from .services.data_fetcher import DataFetcher
16
- from .services.bar_frame import BarFrameGenerator
17
- from .services.bar_composer import BarComposer
18
 
19
  logger = logging.getLogger(__name__)
20
 
@@ -24,15 +21,24 @@ router = APIRouter()
24
  jobs: Dict[str, dict] = {}
25
 
26
 
27
- def update_job(job_id: str, status: str, progress: int = 0, video_url: str = None, error: str = None):
 
 
 
 
 
 
 
28
  """Update job status"""
29
  if job_id in jobs:
30
  jobs[job_id].update({
31
  "status": status,
32
  "progress": progress,
 
33
  "video_url": video_url,
34
  "error": error
35
  })
 
36
 
37
 
38
  async def generate_bar_race_video(job_id: str, request: BarRaceRequest):
@@ -40,93 +46,119 @@ async def generate_bar_race_video(job_id: str, request: BarRaceRequest):
40
  temp_dir = f"temp/bar_race_{job_id}"
41
 
42
  try:
43
- update_job(job_id, "processing", 5)
44
  os.makedirs(temp_dir, exist_ok=True)
45
 
46
- # Get topic config
47
- topic_config = get_topic(request.topic)
48
- if not topic_config:
49
- topic_config = {
50
- "title": request.topic.replace("_", " ").title(),
51
- "unit": "Value",
52
- }
53
-
54
- title = topic_config.get("title", request.topic)
55
- unit = topic_config.get("unit", "")
56
-
57
- update_job(job_id, "processing", 10)
58
- logger.info(f"Fetching data for topic: {request.topic}")
59
-
60
- # Fetch data
61
- data_fetcher = DataFetcher()
62
- all_data = data_fetcher.fetch_data(
63
- topic_id=request.topic,
64
- year_start=request.year_start,
65
- year_end=request.year_end,
66
- top_n=request.top_n
67
- )
68
 
69
- update_job(job_id, "processing", 20)
70
- logger.info(f"Generating frames...")
71
 
72
- # Generate frames
73
- frame_generator = BarFrameGenerator()
74
- frames_dir = os.path.join(temp_dir, "frames")
75
 
76
- frame_paths = frame_generator.generate_frames(
77
- title=title,
78
- unit=unit,
79
- all_data=all_data,
80
- year_start=request.year_start,
81
- year_end=request.year_end,
82
- fps=request.fps,
83
- duration_seconds=request.duration_seconds,
84
- output_dir=frames_dir
85
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
- update_job(job_id, "processing", 70)
88
- logger.info(f"Composing video...")
89
 
90
- # Compose video
91
- composer = BarComposer()
92
- output_name = f"bar_race_{job_id}.mp4"
 
 
93
 
94
- video_path = composer.compose_video(
95
- frame_paths=frame_paths,
96
- output_name=output_name,
97
- fps=request.fps
 
 
 
 
 
 
 
 
 
98
  )
99
 
100
- update_job(job_id, "processing", 90)
 
101
 
102
- # Generate video URL
103
- video_url = f"/api/bar-race/video/{job_id}"
104
 
105
- # Cleanup temp frames
106
- composer.cleanup_frames(frames_dir)
107
 
108
- update_job(job_id, "ready", 100, video_url=video_url)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  logger.info(f"Bar race video ready: {video_url}")
110
 
 
 
 
 
 
 
 
 
111
  except Exception as e:
112
  logger.error(f"Bar race generation failed: {e}")
113
  logger.error(traceback.format_exc())
114
  update_job(job_id, "failed", error=str(e))
115
-
116
- finally:
117
- # Cleanup temp directory (keep video)
118
- if os.path.exists(temp_dir):
119
- import shutil
120
- shutil.rmtree(temp_dir, ignore_errors=True)
121
-
122
-
123
- @router.get("/topics")
124
- async def get_available_topics():
125
- """Get list of available topics"""
126
- return {
127
- "topics": list_topics(),
128
- "total": len(TOPICS)
129
- }
130
 
131
 
132
  @router.post("/generate", response_model=JobResponse)
@@ -134,19 +166,16 @@ async def generate_bar_race(request: BarRaceRequest, background_tasks: Backgroun
134
  """
135
  Generate a bar chart race video.
136
 
137
- Returns job_id to track progress.
138
  """
139
  job_id = str(uuid.uuid4())[:8]
140
 
141
- # Validate year range
142
- if request.year_start >= request.year_end:
143
- raise HTTPException(400, "year_start must be less than year_end")
144
-
145
  # Initialize job
146
  jobs[job_id] = {
147
  "job_id": job_id,
148
  "status": "queued",
149
  "progress": 0,
 
150
  "video_url": None,
151
  "error": None
152
  }
@@ -157,7 +186,7 @@ async def generate_bar_race(request: BarRaceRequest, background_tasks: Backgroun
157
  return JobResponse(
158
  job_id=job_id,
159
  status="queued",
160
- message=f"Bar race video generation started for topic: {request.topic}"
161
  )
162
 
163
 
 
5
  import logging
6
  import os
7
  import uuid
8
+ import shutil
9
  import traceback
10
  from typing import Dict
11
  from fastapi import APIRouter, BackgroundTasks, HTTPException
12
+ from fastapi.responses import FileResponse, RedirectResponse
13
 
14
+ from .schemas import BarRaceRequest, JobResponse, JobStatus
 
 
 
 
15
 
16
  logger = logging.getLogger(__name__)
17
 
 
21
  jobs: Dict[str, dict] = {}
22
 
23
 
24
def update_job(
    job_id: str,
    status: str,
    progress: int = 0,
    current_step: str = None,
    video_url: str = None,
    error: str = None
):
    """Merge the latest pipeline state into the in-memory job record.

    Unknown job_ids are silently ignored (nothing to update), but the
    debug line is emitted either way for traceability.
    """
    new_state = {
        "status": status,
        "progress": progress,
        "current_step": current_step,
        "video_url": video_url,
        "error": error,
    }
    if job_id in jobs:
        jobs[job_id].update(new_state)
    logger.debug(f"Job {job_id}: {status} ({progress}%) - {current_step}")
42
 
43
 
44
async def generate_bar_race_video(job_id: str, request: BarRaceRequest):
    """Background task: run the full Brain -> Scout -> Surgeon -> Artist ->
    Director pipeline for one job, then upload/serve the resulting video.

    Progress and errors are reported solely through update_job(); nothing is
    returned. Service modules are imported lazily inside each stage —
    presumably to keep router import cheap and isolate optional deps; confirm.
    On success the temp dir is removed; on failure it is kept for debugging.
    """
    # Per-job scratch space for downloads, cleaned data and frames.
    temp_dir = f"temp/bar_race_{job_id}"

    try:
        os.makedirs(temp_dir, exist_ok=True)

        # Get API key from environment (may be None; Brain decides how to cope).
        gemini_api_key = os.getenv("GEMINI_API_KEY")

        # ============ BRAIN ============
        # LLM planner: turns the free-form topic prompt into a structured plan.
        update_job(job_id, "processing", 5, "Brain: Analyzing topic...")

        from .services.brain import Brain
        brain = Brain(gemini_api_key=gemini_api_key)
        plan = brain.generate_plan(request.topic)

        if not plan:
            raise Exception("Brain failed to generate plan")

        logger.info(f"Brain: Generated plan for entity_type={plan.get('entity_type')}")

        # ============ SCOUT ============
        # Data fetcher: returns a raw DataFrame per the plan's search strategies.
        update_job(job_id, "processing", 15, "Scout: Fetching data...")

        from .services.scout import Scout
        scout = Scout(temp_dir=temp_dir)
        raw_df = scout.fetch_data(plan)

        if raw_df is None or raw_df.empty:
            raise Exception("Scout failed to fetch data")

        logger.info(f"Scout: Fetched {len(raw_df)} rows")

        # ============ SURGEON ============
        # Data cleaner: normalizes the raw rows; expected to yield a 'name' column.
        update_job(job_id, "processing", 35, "Surgeon: Cleaning data...")

        from .services.surgeon import Surgeon
        surgeon = Surgeon(temp_dir=temp_dir)
        clean_df = surgeon.clean_data(raw_df, plan)

        if clean_df is None or clean_df.empty:
            raise Exception("Surgeon failed to clean data")

        logger.info(f"Surgeon: Cleaned data, {len(clean_df)} rows, {clean_df['name'].nunique()} entities")

        # ============ ARTIST ============
        # Image processor: one avatar per unique entity (missing images tolerated).
        update_job(job_id, "processing", 50, "Artist: Processing images...")

        from .services.artist import Artist
        artist = Artist(temp_dir=temp_dir)
        entities = clean_df["name"].unique().tolist()
        entity_type = plan.get("entity_type", "general")
        image_paths = artist.process_entities(entities, entity_type)

        logger.info(f"Artist: Processed {len(image_paths)} images")

        # ============ DIRECTOR ============
        # Video generator: renders the bar race from cleaned data + avatars.
        update_job(job_id, "processing", 65, "Director: Generating video...")

        from .services.director import Director
        director = Director(temp_dir=temp_dir)
        video_path = director.generate_video(
            df=clean_df,
            plan=plan,
            image_paths=image_paths,
            duration_seconds=request.duration_seconds,
            job_id=job_id
        )

        if not video_path or not os.path.exists(video_path):
            raise Exception("Director failed to generate video")

        logger.info(f"Director: Generated video at {video_path}")

        # ============ UPLOAD TO HF ============
        # Best-effort upload to Hugging Face storage; failure falls back to
        # serving the local file, so errors here never fail the job.
        update_job(job_id, "processing", 85, "Uploading to cloud storage...")

        video_url = None
        try:
            from modules.shared.services.hf_storage import get_hf_storage
            hf_storage = get_hf_storage()

            if hf_storage and hf_storage.enabled:
                # Upload video
                uploaded_url = hf_storage.upload_file(
                    local_path=video_path,
                    remote_path=f"bar_race/{job_id}.mp4"
                )
                if uploaded_url:
                    video_url = uploaded_url
                    logger.info(f"Uploaded to HF: {video_url}")
        except Exception as e:
            logger.warning(f"HF upload failed, using local: {e}")

        # Fallback to local URL (served by this app's video endpoint)
        if not video_url:
            video_url = f"/api/bar-race/video/{job_id}"

        # ============ SUCCESS ============
        update_job(job_id, "ready", 100, "Complete", video_url=video_url)
        logger.info(f"Bar race video ready: {video_url}")

        # Cleanup temp files (only on success)
        try:
            if os.path.exists(temp_dir):
                shutil.rmtree(temp_dir)
                logger.info(f"Cleaned up temp directory: {temp_dir}")
        except Exception as e:
            logger.warning(f"Cleanup failed: {e}")

    except Exception as e:
        # Any stage failure lands here: record it on the job for the client.
        logger.error(f"Bar race generation failed: {e}")
        logger.error(traceback.format_exc())
        update_job(job_id, "failed", error=str(e))

        # Keep temp files for debugging on failure
        logger.info(f"Keeping temp directory for debugging: {temp_dir}")
 
 
 
 
 
 
 
 
 
 
 
 
162
 
163
 
164
  @router.post("/generate", response_model=JobResponse)
 
166
  """
167
  Generate a bar chart race video.
168
 
169
+ Takes a topic and duration, returns job_id to track progress.
170
  """
171
  job_id = str(uuid.uuid4())[:8]
172
 
 
 
 
 
173
  # Initialize job
174
  jobs[job_id] = {
175
  "job_id": job_id,
176
  "status": "queued",
177
  "progress": 0,
178
+ "current_step": "Initializing...",
179
  "video_url": None,
180
  "error": None
181
  }
 
186
  return JobResponse(
187
  job_id=job_id,
188
  status="queued",
189
+ message=f"Bar race generation started for topic: {request.topic}"
190
  )
191
 
192
 
modules/bar_race/schemas.py CHANGED
@@ -3,40 +3,43 @@ Bar Race Schemas
3
  Pydantic models for bar chart race video generation.
4
  """
5
  from pydantic import BaseModel, Field
6
- from typing import Optional, List
7
  from enum import Enum
8
 
9
 
10
class TopicCategory(str, Enum):
    """Category a pre-configured topic belongs to.

    String-valued so it compares equal to the plain "category" strings
    stored in the topic registry entries.
    """
    ECONOMICS = "economics"
    TECH = "tech"
    BUSINESS = "business"
    ENTERTAINMENT = "entertainment"
    DEMOGRAPHICS = "demographics"
    SPORTS = "sports"
    GEOPOLITICS = "geopolitics"
    ENVIRONMENT = "environment"
    UNIQUE = "unique"
20
 
21
 
22
class BarRaceRequest(BaseModel):
    """Request to generate a bar chart race video"""
    # Registry key; unknown topics fall back to a generic config in the router.
    topic: str = Field(..., description="Topic ID from registry (e.g., 'gdp_nominal')")
    year_start: int = Field(2000, ge=1900, le=2030, description="Start year")
    # NOTE: year_start < year_end is enforced in the /generate endpoint, not here.
    year_end: int = Field(2024, ge=1900, le=2030, description="End year")
    top_n: int = Field(10, ge=5, le=20, description="Number of bars to show")
    duration_seconds: int = Field(60, ge=10, le=180, description="Video duration in seconds")
    fps: int = Field(30, ge=24, le=60, description="Frames per second")
30
-
31
-
32
class TopicInfo(BaseModel):
    """Information about a topic (mirrors a topic_registry entry)."""
    id: str
    title: str
    category: TopicCategory
    unit: str  # value axis label, e.g. "Trillion USD"
    year_range: tuple  # (first_year, last_year) as stored in the registry
    description: str  # user-facing description (Bengali in the registry)
 
 
 
 
 
 
 
40
 
41
 
42
  class JobResponse(BaseModel):
@@ -49,7 +52,8 @@ class JobResponse(BaseModel):
49
  class JobStatus(BaseModel):
50
  """Job status response"""
51
  job_id: str
52
- status: str # queued, processing, ready, failed
53
  progress: int = 0
 
54
  video_url: Optional[str] = None
55
  error: Optional[str] = None
 
3
  Pydantic models for bar chart race video generation.
4
  """
5
  from pydantic import BaseModel, Field
6
+ from typing import Optional, List, Dict, Any
7
  from enum import Enum
8
 
9
 
10
class EntityType(str, Enum):
    """Type of entities in the bar chart.

    Drives the Artist's image search: flags for countries, portraits
    (plus optional background removal) for persons, logos otherwise.
    """
    PERSON = "person"
    COUNTRY = "country"
    COMPANY = "company"
    GENERAL = "general"  # fallback for anything else
 
 
 
 
16
 
17
 
18
class BarRaceRequest(BaseModel):
    """Request to generate a bar chart race video"""
    # Free-form prompt; the Brain (LLM planner) turns it into a structured plan.
    topic: str = Field(..., description="Topic/prompt for video (e.g., 'Top 10 richest cricketers')")
    duration_seconds: int = Field(60, ge=30, le=120, description="Video duration in seconds")

    class Config:
        # Example payload surfaced in the OpenAPI/Swagger docs.
        json_schema_extra = {
            "example": {
                "topic": "Top 10 richest countries by GDP 2000-2024",
                "duration_seconds": 60
            }
        }
30
+
31
+
32
+ class BrainPlan(BaseModel):
33
+ """JSON plan generated by Brain (LLM)"""
34
+ topic: str
35
+ entity_type: EntityType
36
+ time_config: Dict[str, Any]
37
+ value_intent: Dict[str, Any]
38
+ search_strategies: List[Dict[str, Any]]
39
+ source_priority: List[str]
40
+ data_expectation: Dict[str, Any]
41
+ visualization: Dict[str, Any]
42
+ video_meta: Dict[str, Any]
43
 
44
 
45
  class JobResponse(BaseModel):
 
52
class JobStatus(BaseModel):
    """Job status response (polled by clients while a video renders)."""
    job_id: str
    status: str  # queued, brain, scout, surgeon, artist, director, uploading, ready, failed
    # NOTE(review): the router currently only sets queued/processing/ready/failed.
    progress: int = 0  # 0-100 percent
    current_step: Optional[str] = None  # human-readable description of the active stage
    video_url: Optional[str] = None  # populated when status == "ready"
    error: Optional[str] = None  # populated when status == "failed"
modules/bar_race/services/__init__.py CHANGED
@@ -1 +1 @@
1
- # Services init
 
1
+ # Services package
modules/bar_race/services/artist.py ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Artist - Image Processor
3
+ Downloads and processes entity images for bar chart race.
4
+ """
5
+ import logging
6
+ import requests
7
+ import os
8
+ from PIL import Image, ImageDraw
9
+ from typing import Dict, Any, List, Optional
10
+ from io import BytesIO
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class Artist:
    """
    Image Processor for Bar Race video generation.

    Resolves one avatar image per entity by trying, in order:
      1. Wikipedia page thumbnail (the "Flag of X" page for countries)
      2. DuckDuckGo image search (if duckduckgo_search is installed)
      3. A generated colored-initial placeholder (always succeeds)
    Each image is center-cropped to a square, optionally background-removed
    (persons only, when rembg is installed) and masked to a circle.
    """

    # Browser-like UA: some image hosts reject requests without one.
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }

    # Side length in pixels of the processed (square/circular) image.
    IMAGE_SIZE = 80

    def __init__(self, temp_dir: str):
        """Create the Artist; processed images are written to <temp_dir>/images."""
        self.temp_dir = temp_dir
        self.images_dir = os.path.join(temp_dir, "images")
        os.makedirs(self.images_dir, exist_ok=True)

        # Background removal is an optional extra; probe for it once.
        self.rembg_available = False
        try:
            import rembg  # noqa: F401 -- availability probe only
            self.rembg_available = True
            logger.info("Artist: rembg available for background removal")
        except ImportError:
            logger.info("Artist: rembg not available, skipping background removal")

    def process_entities(self, entities: List[str], entity_type: str) -> Dict[str, str]:
        """
        Download and process images for all entities.

        Args:
            entities: List of entity names
            entity_type: Type of entity (person, country, company, general)

        Returns:
            Dict mapping entity name to processed image path. Entities whose
            image could not be produced are omitted from the dict.
        """
        logger.info(f"Artist: Processing images for {len(entities)} entities (type: {entity_type})")

        image_paths = {}

        for entity in entities:
            try:
                image_path = self._process_entity(entity, entity_type)
                if image_path:
                    image_paths[entity] = image_path
                    logger.debug(f"Artist: Processed image for {entity}")
                else:
                    logger.warning(f"Artist: No image found for {entity}")
            except Exception as e:
                # One bad entity must not abort the whole batch.
                logger.warning(f"Artist: Failed to process {entity}: {e}")

        logger.info(f"Artist: Processed {len(image_paths)}/{len(entities)} images")
        return image_paths

    def _process_entity(self, entity: str, entity_type: str) -> Optional[str]:
        """Fetch, normalize and save a single entity's image; return its path."""
        image = self._get_image(entity, entity_type)

        if image is None:
            return None

        try:
            # Normalize: RGBA -> square crop -> (optional bg removal) -> circle.
            image = image.convert("RGBA")
            image = self._resize_to_square(image)

            # Background removal only makes sense for portraits.
            if self.rembg_available and entity_type == "person":
                image = self._remove_background(image)

            image = self._apply_circular_mask(image)

            # Sanitize the entity name so it is a safe filename.
            safe_name = "".join(c if c.isalnum() else "_" for c in entity)
            output_path = os.path.join(self.images_dir, f"{safe_name}.png")
            image.save(output_path, "PNG")

            return output_path

        except Exception as e:
            logger.error(f"Artist: Error processing image for {entity}: {e}")
            return None

    def _get_image(self, entity: str, entity_type: str) -> Optional[Image.Image]:
        """Resolve an image via Wikipedia, then DuckDuckGo, then a placeholder."""
        # Priority 1: Wikipedia page thumbnail
        image = self._search_wikipedia_commons(entity, entity_type)
        if image:
            return image

        # Priority 2: DuckDuckGo image search
        image = self._search_duckduckgo(entity, entity_type)
        if image:
            return image

        # Priority 3: generated placeholder -- this chain never returns None.
        return self._generate_placeholder(entity)

    def _search_wikipedia_commons(self, entity: str, entity_type: str) -> Optional[Image.Image]:
        """Fetch the Wikipedia page thumbnail for the entity (flag page for countries)."""
        try:
            # For countries the flag article gives a cleaner avatar than the
            # country article's lead image.
            if entity_type == "country":
                search_query = f"Flag of {entity}"
            else:
                search_query = entity

            # MediaWiki Action API: pageimages gives the page's lead thumbnail.
            search_url = "https://en.wikipedia.org/w/api.php"
            params = {
                "action": "query",
                "titles": search_query,
                "prop": "pageimages",
                "format": "json",
                "pithumbsize": 200
            }

            response = requests.get(search_url, params=params, headers=self.HEADERS, timeout=10)
            if response.status_code == 200:
                data = response.json()
                pages = data.get("query", {}).get("pages", {})

                # Page ids are irrelevant here; only the thumbnail matters.
                for page_data in pages.values():
                    if "thumbnail" in page_data:
                        image_url = page_data["thumbnail"]["source"]
                        return self._download_image(image_url)

        except Exception as e:
            logger.debug(f"Artist: Wikipedia Commons search failed for {entity}: {e}")

        return None

    def _search_duckduckgo(self, entity: str, entity_type: str) -> Optional[Image.Image]:
        """Image-search fallback; returns None if duckduckgo_search is not installed."""
        try:
            from duckduckgo_search import DDGS

            # Tailor the query to the entity type for better hits.
            if entity_type == "country":
                query = f"{entity} flag icon"
            elif entity_type == "person":
                query = f"{entity} portrait photo"
            else:
                query = f"{entity} logo"

            with DDGS() as ddgs:
                results = list(ddgs.images(query, max_results=3))

            # Take the first result that actually downloads.
            for result in results:
                image_url = result.get("image")
                if image_url:
                    image = self._download_image(image_url)
                    if image:
                        return image

        except ImportError:
            logger.debug("Artist: duckduckgo-search not available")
        except Exception as e:
            logger.debug(f"Artist: DuckDuckGo search failed for {entity}: {e}")

        return None

    def _download_image(self, url: str) -> Optional[Image.Image]:
        """Download an image from a URL; return None on any failure."""
        try:
            response = requests.get(url, headers=self.HEADERS, timeout=10)
            if response.status_code == 200:
                return Image.open(BytesIO(response.content))
        except Exception as e:
            logger.debug(f"Artist: Failed to download image: {e}")

        return None

    def _resize_to_square(self, image: Image.Image) -> Image.Image:
        """Center-crop to a square and scale to IMAGE_SIZE x IMAGE_SIZE."""
        width, height = image.size

        # Determine the centered square crop box.
        if width > height:
            left = (width - height) // 2
            top = 0
            right = left + height
            bottom = height
        else:
            left = 0
            top = (height - width) // 2
            right = width
            bottom = top + width

        image = image.crop((left, top, right, bottom))
        image = image.resize((self.IMAGE_SIZE, self.IMAGE_SIZE), Image.Resampling.LANCZOS)

        return image

    def _remove_background(self, image: Image.Image) -> Image.Image:
        """Remove the background via rembg; on failure return the image unchanged."""
        try:
            import rembg

            # rembg works on encoded bytes, so round-trip through PNG.
            img_bytes = BytesIO()
            image.save(img_bytes, format="PNG")
            img_bytes.seek(0)

            output = rembg.remove(img_bytes.getvalue())

            return Image.open(BytesIO(output))

        except Exception as e:
            logger.warning(f"Artist: Background removal failed: {e}")
            return image

    def _apply_circular_mask(self, image: Image.Image) -> Image.Image:
        """Mask the (square) image to a circle with transparent corners."""
        if image.mode != "RGBA":
            image = image.convert("RGBA")

        size = image.size[0]  # image is square by the time this is called

        # Grayscale mask: white disc = visible, black corners = transparent.
        mask = Image.new("L", (size, size), 0)
        draw = ImageDraw.Draw(mask)
        draw.ellipse((0, 0, size, size), fill=255)

        output = Image.new("RGBA", (size, size), (0, 0, 0, 0))
        output.paste(image, (0, 0), mask)

        return output

    def _generate_placeholder(self, entity: str) -> Image.Image:
        """Generate a colored placeholder showing the entity's initial."""
        size = self.IMAGE_SIZE

        colors = [
            (74, 222, 128),   # Green
            (251, 191, 36),   # Yellow
            (239, 68, 68),    # Red
            (59, 130, 246),   # Blue
            (168, 85, 247),   # Purple
            (20, 184, 166),   # Teal
        ]

        # Deterministic color per entity. (Builtin hash() on strings is salted
        # per process, so it would pick a different color on every run.)
        color = colors[sum(entity.encode("utf-8")) % len(colors)]

        image = Image.new("RGBA", (size, size), color)
        draw = ImageDraw.Draw(image)

        initial = entity[0].upper() if entity else "?"

        from PIL import ImageFont  # local import mirrors the original's lazy use

        try:
            font = ImageFont.truetype("arial.ttf", size // 2)
        except OSError:
            # Arial isn't available (e.g. Linux containers): PIL's builtin font.
            font = ImageFont.load_default()

        # Center the glyph using its rendered bounding box.
        bbox = draw.textbbox((0, 0), initial, font=font)
        text_width = bbox[2] - bbox[0]
        text_height = bbox[3] - bbox[1]
        x = (size - text_width) // 2
        y = (size - text_height) // 2 - bbox[1]

        draw.text((x, y), initial, fill=(255, 255, 255), font=font)

        return image
modules/bar_race/services/bar_composer.py DELETED
@@ -1,91 +0,0 @@
1
- """
2
- Bar Composer Service
3
- Assembles bar race frames into final video.
4
- No TTS - pure visual animation with optional background music.
5
- """
6
- import logging
7
- import os
8
- import shutil
9
- from pathlib import Path
10
- from typing import List, Optional
11
- from moviepy.editor import ImageSequenceClip
12
-
13
- logger = logging.getLogger(__name__)
14
-
15
-
16
class BarComposer:
    """
    Composes the final bar race video from a sequence of rendered frames.

    - No TTS dependency: output is purely visual.
    - Optional background music: shorter tracks are looped to cover the
      whole video (matching Director._add_background_music's behavior),
      longer tracks are trimmed.
    """

    # Default frame rate; compose_video's fps argument takes precedence.
    FPS = 30

    def __init__(self, output_dir: str = "videos/bar_race"):
        """Create the composer and ensure the output directory exists."""
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    def compose_video(
        self,
        frame_paths: List[str],
        output_name: str,
        fps: int = 30,
        music_path: Optional[str] = None
    ) -> str:
        """
        Compose an MP4 video from a frame sequence.

        Args:
            frame_paths: Ordered list of frame image paths.
            output_name: Output video filename (joined onto output_dir).
            fps: Frames per second.
            music_path: Optional background music path; silently ignored if
                the file does not exist.

        Returns:
            Path to the written video file.

        Raises:
            Exception: re-raised from MoviePy if encoding fails (callers are
                expected to handle/report the failure).
        """
        output_path = os.path.join(self.output_dir, output_name)

        try:
            logger.info(f"Composing video from {len(frame_paths)} frames...")

            # Create video clip from frames
            video_clip = ImageSequenceClip(frame_paths, fps=fps)

            # Track whether audio was actually attached so the encoder is
            # only asked for an audio codec when there is audio to encode
            # (previously this was keyed on music_path alone).
            has_audio = False
            if music_path and os.path.exists(music_path):
                from moviepy.editor import AudioFileClip, concatenate_audioclips
                audio = AudioFileClip(music_path)
                # Loop the track if it is shorter than the video...
                if audio.duration < video_clip.duration:
                    loops_needed = int(video_clip.duration / audio.duration) + 1
                    audio = concatenate_audioclips([audio] * loops_needed)
                # ...then trim to the exact video length.
                audio = audio.subclip(0, video_clip.duration)
                video_clip = video_clip.set_audio(audio)
                has_audio = True

            # Write video
            logger.info(f"Writing video to {output_path}")
            video_clip.write_videofile(
                output_path,
                fps=fps,
                codec="libx264",
                audio_codec="aac" if has_audio else None,
                preset="medium",
                threads=4,
                logger=None
            )

            # Cleanup
            video_clip.close()

            logger.info(f"Bar race video complete: {output_path}")
            return output_path

        except Exception as e:
            logger.error(f"Failed to compose video: {e}")
            raise

    def cleanup_frames(self, frame_dir: str):
        """Remove a temporary frame directory (no-op if it does not exist)."""
        if os.path.exists(frame_dir):
            shutil.rmtree(frame_dir)
            logger.info(f"Cleaned up frames: {frame_dir}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
modules/bar_race/services/bar_frame.py DELETED
@@ -1,301 +0,0 @@
1
- """
2
- Bar Frame Generator
3
- Creates animated bar chart race frames using Pillow.
4
- """
5
- import logging
6
- import os
7
- from PIL import Image, ImageDraw, ImageFont
8
- from typing import Dict, List, Tuple
9
-
10
- logger = logging.getLogger(__name__)
11
-
12
-
13
class BarFrameGenerator:
    """
    Renders individual frames for a bar chart race animation with Pillow.

    - Horizontal bars sorted by value, coloured consistently per entity.
    - Linear interpolation between consecutive years for smooth motion.
    - Large year counter at the bottom of each 9:16 frame.

    PIL type names in annotations are written as strings so that importing
    this class does not require Pillow until a frame is actually rendered.
    """

    # Canvas dimensions (9:16 vertical)
    WIDTH = 1080
    HEIGHT = 1920

    # Colors (RGB)
    BG_COLOR = (18, 18, 28)            # Dark navy
    TITLE_COLOR = (255, 255, 255)      # White
    YEAR_COLOR = (100, 100, 120)       # Muted gray
    BAR_LABEL_COLOR = (255, 255, 255)  # White
    VALUE_COLOR = (200, 200, 200)      # Light gray

    # Bar colors (cycled through as new entities appear)
    BAR_COLORS = [
        (74, 222, 128),   # Green
        (251, 191, 36),   # Yellow
        (239, 68, 68),    # Red
        (59, 130, 246),   # Blue
        (168, 85, 247),   # Purple
        (20, 184, 166),   # Teal
        (249, 115, 22),   # Orange
        (236, 72, 153),   # Pink
        (34, 197, 94),    # Emerald
        (99, 102, 241),   # Indigo
    ]

    # Layout constants (pixels)
    TITLE_Y = 80
    YEAR_Y = 1700        # Large year counter near the bottom
    BAR_START_Y = 200
    BAR_HEIGHT = 80
    BAR_GAP = 30
    BAR_MAX_WIDTH = 900
    BAR_X_START = 160

    def __init__(self):
        self._load_fonts()
        # Entity name -> RGB colour, assigned on first sight and kept stable
        # for the lifetime of this generator.
        self.entity_colors = {}

    def _load_fonts(self):
        """Load TrueType fonts with per-platform fallbacks.

        Tries Windows then Linux font paths; falls back to PIL's builtin
        bitmap font if none can be loaded.
        """
        font_paths = [
            "C:/Windows/Fonts/arial.ttf",
            "C:/Windows/Fonts/ArialBD.ttf",
            "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
        ]

        self.font_title = None
        self.font_label = None
        self.font_value = None
        self.font_year = None

        for path in font_paths:
            if os.path.exists(path):
                try:
                    from PIL import ImageFont
                    self.font_title = ImageFont.truetype(path, 52)
                    self.font_label = ImageFont.truetype(path, 32)
                    self.font_value = ImageFont.truetype(path, 28)
                    self.font_year = ImageFont.truetype(path, 200)
                    logger.info(f"Loaded font: {path}")
                    break
                except Exception as e:
                    logger.warning(f"Failed to load font {path}: {e}")

        if not self.font_title:
            from PIL import ImageFont
            self.font_title = ImageFont.load_default()
            self.font_label = ImageFont.load_default()
            self.font_value = ImageFont.load_default()
            self.font_year = ImageFont.load_default()
            logger.warning("Using default font")

    def _get_entity_color(self, entity: str) -> Tuple[int, int, int]:
        """Return a stable colour for *entity*, assigning one on first use."""
        if entity not in self.entity_colors:
            color_index = len(self.entity_colors) % len(self.BAR_COLORS)
            self.entity_colors[entity] = self.BAR_COLORS[color_index]
        return self.entity_colors[entity]

    def _draw_rounded_rect(self, draw: "ImageDraw.ImageDraw", bbox: Tuple, fill: Tuple, radius: int = 15):
        """Draw a filled rounded rectangle on *draw*."""
        draw.rounded_rectangle(bbox, radius=radius, fill=fill)

    def create_frame(
        self,
        title: str,
        unit: str,
        year: float,  # can be fractional for smooth animation
        bars_data: List[Dict],  # [{name, value}, ...] sorted descending
        max_value: float
    ) -> "Image.Image":
        """
        Render a single frame of the bar chart race.

        Args:
            title: Chart title, centered at the top.
            unit: Value unit appended to each bar's label (e.g. "Trillion USD").
            year: Current (possibly fractional) year; displayed truncated.
            bars_data: List of {name, value} dicts sorted by value descending;
                only the first 10 are drawn.
            max_value: Scale maximum; a non-positive value is treated as 1.0
                to avoid division by zero.

        Returns:
            The rendered PIL image (WIDTH x HEIGHT).
        """
        from PIL import Image, ImageDraw
        img = Image.new('RGB', (self.WIDTH, self.HEIGHT), self.BG_COLOR)
        draw = ImageDraw.Draw(img)

        # Title, horizontally centered
        title_text = title
        bbox = draw.textbbox((0, 0), title_text, font=self.font_title)
        title_x = (self.WIDTH - (bbox[2] - bbox[0])) // 2
        draw.text((title_x, self.TITLE_Y), title_text, fill=self.TITLE_COLOR, font=self.font_title)

        # Year counter (large, near the bottom)
        year_text = str(int(year))
        bbox = draw.textbbox((0, 0), year_text, font=self.font_year)
        year_x = (self.WIDTH - (bbox[2] - bbox[0])) // 2
        draw.text((year_x, self.YEAR_Y), year_text, fill=self.YEAR_COLOR, font=self.font_year)

        # Defensive scale: all-zero data would otherwise divide by zero.
        scale = max_value if max_value > 0 else 1.0

        # Draw bars (max 10)
        for i, bar in enumerate(bars_data[:10]):
            y = self.BAR_START_Y + i * (self.BAR_HEIGHT + self.BAR_GAP)

            # Bar width proportional to value, with a minimum so tiny values
            # remain visible.
            bar_width = int((bar["value"] / scale) * self.BAR_MAX_WIDTH)
            bar_width = max(50, bar_width)

            color = self._get_entity_color(bar["name"])

            self._draw_rounded_rect(
                draw,
                (self.BAR_X_START, y, self.BAR_X_START + bar_width, y + self.BAR_HEIGHT),
                color,
                radius=10
            )

            # Entity name: inside the bar if it fits, otherwise to the left.
            name_text = bar["name"]
            name_bbox = draw.textbbox((0, 0), name_text, font=self.font_label)
            name_width = name_bbox[2] - name_bbox[0]

            if name_width < bar_width - 20:
                name_x = self.BAR_X_START + 15
            else:
                name_x = 10

            name_y = y + (self.BAR_HEIGHT - (name_bbox[3] - name_bbox[1])) // 2
            draw.text((name_x, name_y), name_text, fill=self.BAR_LABEL_COLOR, font=self.font_label)

            # Value label to the right of the bar.
            value_text = f"{bar['value']:.1f} {unit}"
            value_bbox = draw.textbbox((0, 0), value_text, font=self.font_value)
            value_x = self.BAR_X_START + bar_width + 15
            value_y = y + (self.BAR_HEIGHT - (value_bbox[3] - value_bbox[1])) // 2
            draw.text((value_x, value_y), value_text, fill=self.VALUE_COLOR, font=self.font_value)

        return img

    def interpolate_data(
        self,
        data_start: List[Dict],
        data_end: List[Dict],
        progress: float  # 0.0 to 1.0
    ) -> List[Dict]:
        """
        Linearly interpolate between two years of data for smooth animation.

        Entities present only in *data_end* (new entrants) are faded in from
        zero rather than popping in abruptly at the year boundary; entities
        missing from *data_end* hold their start value.

        Returns the interpolated list sorted by value descending.
        """
        end_values = {d["name"]: d["value"] for d in data_end}
        start_names = {d["name"] for d in data_start}

        interpolated = []
        for d in data_start:
            name = d["name"]
            start_val = d["value"]
            # Missing end value -> hold the start value steady.
            end_val = end_values.get(name, start_val)
            interpolated.append({
                "name": name,
                "value": start_val + (end_val - start_val) * progress
            })

        # New entrants: fade in from 0 toward their end-year value.
        for d in data_end:
            if d["name"] not in start_names:
                interpolated.append({
                    "name": d["name"],
                    "value": d["value"] * progress
                })

        interpolated.sort(key=lambda x: x["value"], reverse=True)

        return interpolated

    def generate_frames(
        self,
        title: str,
        unit: str,
        all_data: List[Dict],  # [{name, year, value}, ...]
        year_start: int,
        year_end: int,
        fps: int = 30,
        duration_seconds: int = 60,
        output_dir: str = "temp_frames"
    ) -> List[str]:
        """
        Generate all frames for the bar chart race.

        Frames are written to *output_dir* as frame_00000.png, frame_00001.png,
        ... plus one extra second holding the final year on screen.

        Returns:
            Ordered list of frame file paths.
        """
        os.makedirs(output_dir, exist_ok=True)

        total_frames = fps * duration_seconds
        # Guard against year_start == year_end (a single snapshot): treat it
        # as one transition so the division below cannot raise
        # ZeroDivisionError.
        years_count = max(1, year_end - year_start)
        frames_per_year = total_frames / years_count

        # Fixed scale across the whole animation (10% headroom); default=1.0
        # keeps an empty dataset from raising ValueError on max().
        max_value = max((d["value"] for d in all_data), default=1.0) * 1.1

        # Group data by year
        data_by_year = {}
        for d in all_data:
            data_by_year.setdefault(d["year"], []).append(
                {"name": d["name"], "value": d["value"]}
            )

        # Sort each year's data descending by value
        for year_data in data_by_year.values():
            year_data.sort(key=lambda x: x["value"], reverse=True)

        frame_paths = []
        frame_num = 0

        for year in range(year_start, year_end):
            # Data for the current year and the one we are animating toward
            current_data = data_by_year.get(year, [])
            next_data = data_by_year.get(year + 1, current_data)

            # At least one frame per year, even for very short durations
            frames_for_this_year = max(1, int(frames_per_year))

            for f in range(frames_for_this_year):
                progress = f / frames_for_this_year

                interpolated = self.interpolate_data(current_data, next_data, progress)

                # Fractional year for the on-screen counter
                display_year = year + progress

                frame = self.create_frame(
                    title=title,
                    unit=unit,
                    year=display_year,
                    bars_data=interpolated,
                    max_value=max_value
                )

                frame_path = os.path.join(output_dir, f"frame_{frame_num:05d}.png")
                frame.save(frame_path)
                frame_paths.append(frame_path)
                frame_num += 1

                if frame_num % 100 == 0:
                    logger.info(f"Generated {frame_num} frames...")

        # Hold the final year on screen for one second.
        final_data = data_by_year.get(year_end, [])
        for _ in range(fps):
            frame = self.create_frame(
                title=title,
                unit=unit,
                year=year_end,
                bars_data=final_data,
                max_value=max_value
            )
            frame_path = os.path.join(output_dir, f"frame_{frame_num:05d}.png")
            frame.save(frame_path)
            frame_paths.append(frame_path)
            frame_num += 1

        logger.info(f"Generated total {len(frame_paths)} frames")
        return frame_paths
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
modules/bar_race/services/brain.py ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Brain - LLM Planner
3
+ Uses Gemini API to understand user topic and generate structured JSON plan.
4
+ """
5
+ import logging
6
+ import json
7
+ import os
8
+ from typing import Dict, Any, Optional
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class Brain:
    """
    LLM Planner for Bar Race video generation.

    Responsibilities:
    - Topic understanding & decomposition
    - Entity type detection (person, country, company)
    - Search strategy generation
    - Data source priority determination
    - Visualization config generation

    When no Gemini client is available (missing key, missing package, or a
    client-construction failure), planning falls back to keyword heuristics
    so the pipeline still produces a usable plan.
    """

    GEMINI_MODEL = "gemma-3-27b-it"

    SYSTEM_PROMPT = """You are an expert data analyst and video planner. Your task is to analyze a user's topic and create a structured JSON plan for generating a bar chart race video.

Given a topic like "Top 10 richest cricketers history" or "GDP by country 2000-2024", you must output a JSON plan with:

1. entity_type: "person", "country", "company", or "general"
2. time_config: start_year, end_year, granularity (year/month)
3. value_intent: what values to track (net worth, GDP, population, etc.)
4. search_strategies: list of search queries to find data
5. source_priority: ["wikipedia_rest", "worldbank_api", "web_scraping"]
6. data_expectation: numeric, sparse, needs_interpolation
7. visualization: type, top_n, smooth
8. video_meta: title for the video

IMPORTANT: Output ONLY valid JSON, no other text."""

    # NOTE: Illustrative example of the expected plan shape. It is not used
    # at runtime — the literal braces make it unsafe for str.format, so do
    # not try to fill the "{topic}" placeholder with .format().
    PLAN_TEMPLATE = """{
  "topic": "{topic}",
  "entity_type": "country",
  "time_config": {
    "start_year": 2000,
    "end_year": 2024,
    "granularity": "year"
  },
  "value_intent": {
    "primary": "GDP",
    "unit": "Trillion USD",
    "alternatives": ["gross domestic product", "economic output"]
  },
  "search_strategies": [
    {
      "intent": "ranking_history",
      "queries": [
        "GDP by country by year wikipedia",
        "world GDP ranking history table"
      ]
    }
  ],
  "source_priority": ["wikipedia_rest", "worldbank_api", "web_scraping"],
  "data_expectation": {
    "numeric": true,
    "sparse": false,
    "needs_interpolation": false
  },
  "visualization": {
    "type": "bar_chart_race",
    "top_n": 10,
    "smooth": true
  },
  "video_meta": {
    "title": "Top 10 Countries by GDP (2000-2024)"
  }
}"""

    def __init__(self, gemini_api_key: Optional[str] = None):
        """Initialize, preferring an explicit key over the GEMINI_API_KEY env var.

        Any failure to build the client (missing package, bad key, network
        config) leaves ``gemini_client`` as None so planning degrades to the
        heuristic fallback instead of crashing construction.
        """
        self.gemini_api_key = gemini_api_key or os.getenv("GEMINI_API_KEY")
        self.gemini_client = None

        if self.gemini_api_key:
            try:
                from google import genai
                self.gemini_client = genai.Client(api_key=self.gemini_api_key)
                logger.info("Brain: Gemini client initialized")
            except ImportError:
                logger.warning("google-genai package not installed")
            except Exception as e:
                # Client construction can fail for reasons beyond a missing
                # package (e.g. malformed key); fall back rather than crash.
                logger.warning(f"Brain: Gemini client init failed: {e}")
        else:
            logger.warning("Brain: No Gemini API key, will use template-based planning")

    def generate_plan(self, topic: str) -> Dict[str, Any]:
        """
        Generate a structured plan from user topic.

        Args:
            topic: User's topic string

        Returns:
            Dict containing the structured plan (Gemini-generated when
            possible, heuristic fallback otherwise).
        """
        logger.info(f"Brain: Generating plan for topic: {topic}")

        if self.gemini_client:
            try:
                plan = self._generate_with_gemini(topic)
                if plan:
                    logger.info("Brain: Plan generated with Gemini")
                    return plan
            except Exception as e:
                logger.warning(f"Brain: Gemini failed, using fallback: {e}")

        # Fallback: Template-based planning
        return self._generate_fallback_plan(topic)

    def _generate_with_gemini(self, topic: str) -> Optional[Dict[str, Any]]:
        """Generate plan using the Gemini API; None if the reply is not JSON."""
        prompt = f"""Analyze this topic and create a JSON plan for a bar chart race video:

Topic: {topic}

{self.SYSTEM_PROMPT}

Output the JSON plan:"""

        response = self.gemini_client.models.generate_content(
            model=self.GEMINI_MODEL,
            contents=prompt
        )

        text = response.text.strip()

        # Models often wrap JSON in markdown fences; strip them before parsing.
        if "```json" in text:
            text = text.split("```json")[1].split("```")[0].strip()
        elif "```" in text:
            text = text.split("```")[1].split("```")[0].strip()

        try:
            return json.loads(text)
        except json.JSONDecodeError as e:
            logger.error(f"Brain: Failed to parse Gemini response as JSON: {e}")
            return None

    def _generate_fallback_plan(self, topic: str) -> Dict[str, Any]:
        """Generate a plan using simple keyword heuristics (no LLM needed)."""
        logger.info("Brain: Using fallback template-based planning")

        topic_lower = topic.lower()

        # Detect entity type from keyword hits (first match wins).
        if any(word in topic_lower for word in ["person", "cricketer", "player", "actor", "singer", "celebrity", "billionaire"]):
            entity_type = "person"
        elif any(word in topic_lower for word in ["country", "nation", "gdp", "population", "military"]):
            entity_type = "country"
        elif any(word in topic_lower for word in ["company", "brand", "corporation", "business"]):
            entity_type = "company"
        else:
            entity_type = "general"

        # Detect the tracked metric and its display unit.
        if "gdp" in topic_lower:
            value_primary = "GDP"
            value_unit = "Trillion USD"
        elif "population" in topic_lower:
            value_primary = "population"
            value_unit = "Million"
        elif "rich" in topic_lower or "wealth" in topic_lower or "net worth" in topic_lower:
            value_primary = "net worth"
            value_unit = "Billion USD"
        elif "subscriber" in topic_lower:
            value_primary = "subscribers"
            value_unit = "Million"
        else:
            value_primary = "value"
            value_unit = ""

        # Generic search queries for the Scout stage.
        search_queries = [
            f"{topic} wikipedia",
            f"{topic} by year table",
            f"{topic} history data"
        ]

        plan = {
            "topic": topic,
            "entity_type": entity_type,
            "time_config": {
                "start_year": 2000,
                "end_year": 2024,
                "granularity": "year"
            },
            "value_intent": {
                "primary": value_primary,
                "unit": value_unit,
                "alternatives": []
            },
            "search_strategies": [
                {
                    "intent": "ranking_history",
                    "queries": search_queries
                }
            ],
            "source_priority": ["wikipedia_rest", "worldbank_api", "web_scraping"],
            "data_expectation": {
                "numeric": True,
                # Heuristic plans assume sparse data that needs interpolation.
                "sparse": True,
                "needs_interpolation": True
            },
            "visualization": {
                "type": "bar_chart_race",
                "top_n": 10,
                "smooth": True
            },
            "video_meta": {
                "title": f"{topic} Evolution"
            }
        }

        logger.info(f"Brain: Generated fallback plan for entity_type={entity_type}")
        return plan
modules/bar_race/services/data_fetcher.py DELETED
@@ -1,134 +0,0 @@
1
- """
2
- Data Fetcher Service
3
- Fetches and normalizes data for bar chart race.
4
- Uses AI-generated realistic data for demo purposes.
5
- """
6
- import logging
7
- from typing import List, Dict, Optional
8
- import random
9
-
10
- logger = logging.getLogger(__name__)
11
-
12
-
13
class DataFetcher:
    """
    Supplies data series for bar chart race topics.

    A small catalog of pre-configured topics produces realistic-looking
    demo data (base value compounded by a per-entity growth rate with a
    little random jitter); unknown topics fall back to generic series.
    """

    # Pre-configured topic catalog: per-entity base values (at the
    # reference year 2000) and annual growth rates.
    COUNTRY_DATA = {
        "gdp_nominal": {
            "entities": ["USA", "China", "Japan", "Germany", "UK", "India", "France", "Italy", "Brazil", "Canada"],
            "base_values": [10.0, 1.2, 4.5, 2.0, 1.5, 0.5, 1.4, 1.2, 0.6, 0.7],
            "growth_rates": [0.03, 0.10, 0.02, 0.02, 0.02, 0.07, 0.02, 0.01, 0.03, 0.02],
        },
        "population": {
            "entities": ["China", "India", "USA", "Indonesia", "Pakistan", "Brazil", "Nigeria", "Bangladesh", "Russia", "Mexico"],
            "base_values": [1200, 1000, 280, 210, 140, 170, 120, 130, 145, 100],
            "growth_rates": [0.005, 0.015, 0.008, 0.012, 0.020, 0.008, 0.025, 0.010, -0.002, 0.012],
        },
        "social_media_users": {
            "entities": ["Facebook", "YouTube", "WhatsApp", "Instagram", "TikTok", "Snapchat", "Twitter", "LinkedIn", "Pinterest", "Reddit"],
            "base_values": [0.1, 0.05, 0.0, 0.0, 0.0, 0.0, 0.01, 0.01, 0.0, 0.01],
            "growth_rates": [0.35, 0.40, 0.50, 0.60, 0.80, 0.30, 0.20, 0.15, 0.25, 0.20],
        },
        "youtube_subscribers": {
            "entities": ["T-Series", "MrBeast", "Cocomelon", "SET India", "PewDiePie", "Kids Diana Show", "Like Nastya", "Vlad and Niki", "Zee Music", "WWE"],
            "base_values": [1, 0.1, 0.1, 0.5, 5, 0.1, 0.1, 0.1, 0.3, 10],
            "growth_rates": [0.40, 0.50, 0.60, 0.30, 0.15, 0.55, 0.55, 0.50, 0.25, 0.05],
        },
        "military_spending": {
            "entities": ["USA", "China", "Russia", "India", "UK", "Saudi Arabia", "Germany", "France", "Japan", "South Korea"],
            "base_values": [300, 20, 50, 15, 35, 20, 30, 35, 40, 15],
            "growth_rates": [0.03, 0.12, 0.05, 0.08, 0.02, 0.08, 0.02, 0.02, 0.01, 0.05],
        },
    }

    def __init__(self):
        pass

    def fetch_data(
        self,
        topic_id: str,
        year_start: int,
        year_end: int,
        top_n: int = 10
    ) -> List[Dict]:
        """
        Fetch data for a topic.

        Returns a flat list of {name, year, value} dicts covering every
        entity for every year in [year_start, year_end].
        """
        config = self.COUNTRY_DATA.get(topic_id)
        if config is None:
            # Unknown topic: fall back to synthetic generic entities.
            return self._generate_generic_data(topic_id, year_start, year_end, top_n)
        return self._generate_realistic_data(config, year_start, year_end, top_n)

    def _generate_realistic_data(
        self,
        topic_data: Dict,
        year_start: int,
        year_end: int,
        top_n: int
    ) -> List[Dict]:
        """Build series from the catalog config: compound growth plus jitter."""
        entities = topic_data["entities"][:top_n]
        bases = topic_data["base_values"][:top_n]
        rates = topic_data["growth_rates"][:top_n]

        reference_year = 2000  # base values are anchored to this year
        records = []

        for year in range(year_start, year_end + 1):
            for entity, base, rate in zip(entities, bases, rates):
                # Compound the base value from the reference year...
                grown = base * ((1 + rate) ** (year - reference_year))
                # ...and add +/-5% random variation for a natural look.
                jittered = grown * (1 + random.uniform(-0.05, 0.05))
                records.append({
                    "name": entity,
                    "year": year,
                    "value": round(jittered, 2)
                })

        return records

    def _generate_generic_data(
        self,
        topic_id: str,
        year_start: int,
        year_end: int,
        top_n: int
    ) -> List[Dict]:
        """Build placeholder series for topics missing from the catalog."""
        logger.warning(f"No pre-configured data for topic: {topic_id}, generating generic data")

        entities = [f"Entity_{i+1}" for i in range(top_n)]

        records = []
        for year in range(year_start, year_end + 1):
            for rank, entity in enumerate(entities):
                # Linear growth from a rank-dependent base, +/-10% jitter.
                base = 100 - rank * 5
                value = base * (1 + 0.05 * (year - year_start))
                value *= (1 + random.uniform(-0.1, 0.1))
                records.append({
                    "name": entity,
                    "year": year,
                    "value": round(value, 2)
                })

        return records

    def get_data_for_year(self, data: List[Dict], year: int, top_n: int = 10) -> List[Dict]:
        """Return the top-N records for *year*, sorted by value descending."""
        matching = [record for record in data if record["year"] == year]
        return sorted(matching, key=lambda record: record["value"], reverse=True)[:top_n]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
modules/bar_race/services/director.py ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Director - Video Generator
3
+ Creates bar chart race animation and final video.
4
+ """
5
+ import logging
6
+ import pandas as pd
7
+ import os
8
+ from typing import Dict, Any, Optional
9
+ import shutil
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ class Director:
15
+ """
16
+ Video Generator for Bar Race.
17
+
18
+ Creates animated bar chart race video using:
19
+ - bar_chart_race library for animation
20
+ - Entity images overlay
21
+ - Background music
22
+ - 9:16 vertical format (1080x1920)
23
+ """
24
+
25
+ # Video dimensions (9:16)
26
+ VIDEO_WIDTH = 1080
27
+ VIDEO_HEIGHT = 1920
28
+ FPS = 30
29
+
30
+ def __init__(self, temp_dir: str, output_dir: str = "videos/bar_race"):
31
+ self.temp_dir = temp_dir
32
+ self.output_dir = output_dir
33
+ os.makedirs(output_dir, exist_ok=True)
34
+
35
+ def generate_video(
36
+ self,
37
+ df: pd.DataFrame,
38
+ plan: Dict[str, Any],
39
+ image_paths: Dict[str, str],
40
+ duration_seconds: int = 60,
41
+ job_id: str = ""
42
+ ) -> Optional[str]:
43
+ """
44
+ Generate bar chart race video.
45
+
46
+ Args:
47
+ df: Cleaned data with columns: name, year, value
48
+ plan: Brain's plan with video_meta
49
+ image_paths: Dict mapping entity name to image path
50
+ duration_seconds: Video duration
51
+ job_id: Job ID for output filename
52
+
53
+ Returns:
54
+ Path to generated video, or None if failed
55
+ """
56
+ logger.info(f"Director: Starting video generation for {duration_seconds}s video")
57
+
58
+ try:
59
+ # Prepare data for bar_chart_race
60
+ df_pivot = self._prepare_data(df)
61
+
62
+ if df_pivot is None or df_pivot.empty:
63
+ logger.error("Director: Failed to prepare data")
64
+ return None
65
+
66
+ # Generate animation
67
+ video_path = self._generate_bar_race(
68
+ df_pivot=df_pivot,
69
+ plan=plan,
70
+ duration_seconds=duration_seconds,
71
+ job_id=job_id
72
+ )
73
+
74
+ if video_path and os.path.exists(video_path):
75
+ # Try to add background music (optional)
76
+ video_with_music = self._add_background_music(video_path, duration_seconds)
77
+ if video_with_music:
78
+ return video_with_music
79
+
80
+ return video_path
81
+
82
+ except Exception as e:
83
+ logger.error(f"Director: Video generation failed: {e}")
84
+ import traceback
85
+ logger.error(traceback.format_exc())
86
+ return None
87
+
88
+ def _add_background_music(self, video_path: str, duration_seconds: int) -> Optional[str]:
89
+ """Add background music if available in assets/music folder"""
90
+ music_dir = "modules/bar_race/assets/music"
91
+
92
+ # Check if music directory exists
93
+ if not os.path.exists(music_dir):
94
+ logger.info("Director: No music folder found, skipping background music")
95
+ return None
96
+
97
+ # Find music files
98
+ music_files = []
99
+ for ext in [".mp3", ".wav", ".m4a", ".ogg"]:
100
+ for f in os.listdir(music_dir):
101
+ if f.lower().endswith(ext):
102
+ music_files.append(os.path.join(music_dir, f))
103
+
104
+ if not music_files:
105
+ logger.info("Director: No music files found, skipping background music")
106
+ return None
107
+
108
+ try:
109
+ from moviepy.editor import VideoFileClip, AudioFileClip
110
+ import random
111
+
112
+ # Pick random music file
113
+ music_path = random.choice(music_files)
114
+ logger.info(f"Director: Adding background music: {music_path}")
115
+
116
+ # Load video and audio
117
+ video = VideoFileClip(video_path)
118
+ audio = AudioFileClip(music_path)
119
+
120
+ # Loop audio if shorter than video
121
+ if audio.duration < video.duration:
122
+ from moviepy.editor import concatenate_audioclips
123
+ loops_needed = int(video.duration / audio.duration) + 1
124
+ audio = concatenate_audioclips([audio] * loops_needed)
125
+
126
+ # Trim audio to video length and lower volume
127
+ audio = audio.subclip(0, video.duration).volumex(0.3)
128
+
129
+ # Add audio to video
130
+ video_with_audio = video.set_audio(audio)
131
+
132
+ # Save with music
133
+ output_path = video_path.replace(".mp4", "_music.mp4")
134
+ video_with_audio.write_videofile(
135
+ output_path,
136
+ codec="libx264",
137
+ audio_codec="aac",
138
+ fps=self.FPS,
139
+ logger=None
140
+ )
141
+
142
+ # Cleanup
143
+ video.close()
144
+ audio.close()
145
+
146
+ # Replace original with music version
147
+ os.remove(video_path)
148
+ os.rename(output_path, video_path)
149
+
150
+ logger.info(f"Director: Added background music to video")
151
+ return video_path
152
+
153
+ except Exception as e:
154
+ logger.warning(f"Director: Failed to add music: {e}")
155
+ return None
156
+
157
+ def _prepare_data(self, df: pd.DataFrame) -> Optional[pd.DataFrame]:
158
+ """Prepare data for bar_chart_race (pivoted format)"""
159
+ try:
160
+ # Pivot: rows=year, columns=entity, values=value
161
+ df_pivot = df.pivot(index="year", columns="name", values="value")
162
+
163
+ # Sort by year
164
+ df_pivot = df_pivot.sort_index()
165
+
166
+ # Fill NaN with 0
167
+ df_pivot = df_pivot.fillna(0)
168
+
169
+ logger.info(f"Director: Prepared pivot table with shape {df_pivot.shape}")
170
+ return df_pivot
171
+
172
+ except Exception as e:
173
+ logger.error(f"Director: Data preparation failed: {e}")
174
+ return None
175
+
176
+ def _generate_bar_race(
177
+ self,
178
+ df_pivot: pd.DataFrame,
179
+ plan: Dict[str, Any],
180
+ duration_seconds: int,
181
+ job_id: str
182
+ ) -> Optional[str]:
183
+ """Generate bar chart race animation"""
184
+
185
+ # Get video metadata
186
+ video_meta = plan.get("video_meta", {})
187
+ title = video_meta.get("title", "Bar Chart Race")
188
+ value_unit = plan.get("value_intent", {}).get("unit", "")
189
+ top_n = plan.get("visualization", {}).get("top_n", 10)
190
+
191
+ output_path = os.path.join(self.output_dir, f"bar_race_{job_id}.mp4")
192
+
193
+ try:
194
+ import bar_chart_race as bcr
195
+
196
+ # Calculate steps per period based on duration
197
+ num_years = len(df_pivot)
198
+ steps_per_period = max(10, (duration_seconds * self.FPS) // num_years)
199
+
200
+ logger.info(f"Director: Creating animation with steps_per_period={steps_per_period}")
201
+
202
+ # Generate bar chart race
203
+ bcr.bar_chart_race(
204
+ df=df_pivot,
205
+ filename=output_path,
206
+ orientation='h',
207
+ sort='desc',
208
+ n_bars=top_n,
209
+ fixed_order=False,
210
+ fixed_max=True,
211
+ steps_per_period=steps_per_period,
212
+ period_length=500,
213
+ interpolate_period=True,
214
+ period_label={'x': .95, 'y': .15, 'ha': 'right', 'size': 72},
215
+ period_fmt='{x:.0f}',
216
+ period_summary_func=None,
217
+ perpendicular_bar_func=None,
218
+ title=title,
219
+ title_size=36,
220
+ bar_size=.85,
221
+ bar_textposition='inside',
222
+ bar_texttemplate='{x:,.0f}',
223
+ bar_label_size=14,
224
+ tick_label_size=14,
225
+ scale='linear',
226
+ writer=None,
227
+ fig=None,
228
+ bar_kwargs={'alpha': .8},
229
+ filter_column_colors=False,
230
+ cmap='dark24',
231
+ dpi=144
232
+ )
233
+
234
+ logger.info(f"Director: Generated video at {output_path}")
235
+ return output_path
236
+
237
+ except ImportError:
238
+ logger.warning("Director: bar_chart_race not available, using fallback")
239
+ return self._generate_fallback_video(df_pivot, plan, duration_seconds, job_id)
240
+ except Exception as e:
241
+ logger.error(f"Director: bar_chart_race failed: {e}")
242
+ return self._generate_fallback_video(df_pivot, plan, duration_seconds, job_id)
243
+
244
+ def _generate_fallback_video(
245
+ self,
246
+ df_pivot: pd.DataFrame,
247
+ plan: Dict[str, Any],
248
+ duration_seconds: int,
249
+ job_id: str
250
+ ) -> Optional[str]:
251
+ """Fallback: Generate simple video using matplotlib and MoviePy"""
252
+ logger.info("Director: Using fallback matplotlib animation")
253
+
254
+ try:
255
+ import matplotlib
256
+ matplotlib.use('Agg')
257
+ import matplotlib.pyplot as plt
258
+ from matplotlib.animation import FuncAnimation
259
+ from moviepy.editor import VideoFileClip
260
+ import tempfile
261
+
262
+ video_meta = plan.get("video_meta", {})
263
+ title = video_meta.get("title", "Bar Chart Race")
264
+ top_n = plan.get("visualization", {}).get("top_n", 10)
265
+
266
+ # Create figure with dark theme
267
+ fig, ax = plt.subplots(figsize=(6, 10.67), facecolor='#121220')
268
+ ax.set_facecolor('#121220')
269
+
270
+ years = df_pivot.index.tolist()
271
+ num_frames = duration_seconds * self.FPS
272
+ frames_per_year = num_frames // len(years)
273
+
274
+ # Colors for bars
275
+ colors = plt.cm.viridis([i/top_n for i in range(top_n)])
276
+
277
+ def update(frame):
278
+ ax.clear()
279
+ ax.set_facecolor('#121220')
280
+
281
+ # Calculate current year and interpolation
282
+ year_idx = min(frame // frames_per_year, len(years) - 1)
283
+ year = years[year_idx]
284
+
285
+ # Get data for current year
286
+ data = df_pivot.loc[year].sort_values(ascending=True).tail(top_n)
287
+
288
+ # Draw horizontal bars
289
+ bars = ax.barh(range(len(data)), data.values, color=colors[:len(data)])
290
+
291
+ # Labels
292
+ ax.set_yticks(range(len(data)))
293
+ ax.set_yticklabels(data.index, fontsize=10, color='white')
294
+ ax.set_title(f"{title}\n{year}", fontsize=16, color='white', pad=20)
295
+
296
+ # Style
297
+ ax.spines['top'].set_visible(False)
298
+ ax.spines['right'].set_visible(False)
299
+ ax.spines['bottom'].set_color('#444')
300
+ ax.spines['left'].set_color('#444')
301
+ ax.tick_params(colors='#888')
302
+
303
+ plt.tight_layout()
304
+
305
+ # Create animation
306
+ anim = FuncAnimation(fig, update, frames=num_frames, interval=1000/self.FPS)
307
+
308
+ # Save to temp file
309
+ temp_path = os.path.join(self.temp_dir, f"temp_animation_{job_id}.mp4")
310
+ anim.save(temp_path, writer='ffmpeg', fps=self.FPS, dpi=100)
311
+ plt.close(fig)
312
+
313
+ # Move to output
314
+ output_path = os.path.join(self.output_dir, f"bar_race_{job_id}.mp4")
315
+ shutil.move(temp_path, output_path)
316
+
317
+ logger.info(f"Director: Generated fallback video at {output_path}")
318
+ return output_path
319
+
320
+ except Exception as e:
321
+ logger.error(f"Director: Fallback video generation failed: {e}")
322
+ import traceback
323
+ logger.error(traceback.format_exc())
324
+ return None
325
+
326
+ def cleanup(self):
327
+ """Clean up temporary files"""
328
+ try:
329
+ if os.path.exists(self.temp_dir):
330
+ shutil.rmtree(self.temp_dir)
331
+ logger.info(f"Director: Cleaned up temp directory: {self.temp_dir}")
332
+ except Exception as e:
333
+ logger.warning(f"Director: Cleanup failed: {e}")
modules/bar_race/services/scout.py ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Scout - Data Fetcher
3
+ Collects data from multiple sources based on Brain's plan.
4
+
5
+ Priority:
6
+ 1. Wikipedia REST API
7
+ 2. World Bank API
8
+ 3. DuckDuckGo + BeautifulSoup scraping
9
+ """
10
+ import logging
11
+ import requests
12
+ import pandas as pd
13
+ from typing import Dict, Any, List, Optional
14
+ from bs4 import BeautifulSoup
15
+ import re
16
+ import os
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class Scout:
    """
    Data Fetcher for Bar Race video generation.

    Tries structured APIs first (Wikipedia tables, World Bank), then
    falls back to DuckDuckGo search + HTML table scraping.
    """

    # API endpoints
    WIKIPEDIA_API = "https://en.wikipedia.org/api/rest_v1"
    WORLDBANK_API = "https://api.worldbank.org/v2"

    # Browser-like User-Agent to avoid trivial bot blocks.
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }

    def __init__(self, temp_dir: str):
        """Create the Scout and ensure its scratch directory exists."""
        self.temp_dir = temp_dir
        os.makedirs(temp_dir, exist_ok=True)

    def fetch_data(self, plan: Dict[str, Any]) -> Optional[pd.DataFrame]:
        """
        Fetch data based on Brain's plan.

        Tries sources in the plan's priority order:
        1. Wikipedia REST API
        2. World Bank API
        3. Web scraping

        The first source returning a non-empty frame wins; the raw result
        is also persisted to temp_dir/raw_data.csv for debugging.

        Returns:
            DataFrame with raw data, or None if all sources fail.
        """
        source_priority = plan.get("source_priority", ["wikipedia_rest", "worldbank_api", "web_scraping"])

        df = None

        for source in source_priority:
            logger.info(f"Scout: Trying source: {source}")

            try:
                if source == "wikipedia_rest":
                    df = self._fetch_wikipedia(plan)
                elif source == "worldbank_api":
                    df = self._fetch_worldbank(plan)
                elif source == "web_scraping":
                    df = self._fetch_scraping(plan)

                if df is not None and not df.empty:
                    logger.info(f"Scout: Success with {source}, got {len(df)} rows")
                    break

            except Exception as e:
                # A failing source must not stop the fallback chain.
                logger.warning(f"Scout: {source} failed: {e}")
                continue

        if df is not None and not df.empty:
            # Save raw data for inspection / reproducibility.
            raw_path = os.path.join(self.temp_dir, "raw_data.csv")
            df.to_csv(raw_path, index=False)
            logger.info(f"Scout: Saved raw data to {raw_path}")
            return df

        logger.error("Scout: All sources failed")
        return None

    def _fetch_wikipedia(self, plan: Dict[str, Any]) -> Optional[pd.DataFrame]:
        """Fetch the first promising HTML table from a Wikipedia article.

        Searches via the opensearch API, then extracts tables from the
        top hit with pandas.read_html; prefers a table containing a
        year-like column, else returns the largest table.
        """
        queries = []
        for strategy in plan.get("search_strategies", []):
            queries.extend(strategy.get("queries", []))

        if not queries:
            queries = [plan.get("topic", "")]

        search_url = "https://en.wikipedia.org/w/api.php"

        for query in queries:
            try:
                params = {
                    "action": "opensearch",
                    # Users often append "wikipedia" to the query; strip it
                    # so the search term matches article titles.
                    "search": query.replace(" wikipedia", ""),
                    "limit": 5,
                    "format": "json"
                }

                response = requests.get(search_url, params=params, headers=self.HEADERS, timeout=10)
                if response.status_code == 200:
                    results = response.json()
                    # opensearch result shape: [query, titles, descs, urls]
                    if len(results) >= 4 and results[3]:
                        page_url = results[3][0]

                        tables = pd.read_html(page_url)
                        if tables:
                            # Prefer a table with an identifiable year column.
                            for table in tables:
                                if self._has_year_column(table):
                                    logger.info(f"Scout: Found table with year data from Wikipedia")
                                    return table

                            # Otherwise the largest table is the best guess.
                            largest = max(tables, key=lambda t: len(t))
                            return largest

            except Exception as e:
                logger.debug(f"Scout: Wikipedia query '{query}' failed: {e}")
                continue

        return None

    def _fetch_worldbank(self, plan: Dict[str, Any]) -> Optional[pd.DataFrame]:
        """Fetch indicator data from the World Bank API.

        Maps the plan's primary value intent onto a known indicator code;
        returns None when no indicator matches or the API yields nothing.
        """
        value_intent = plan.get("value_intent", {})
        primary_value = value_intent.get("primary", "").lower()
        time_config = plan.get("time_config", {})

        # Map common intents to World Bank indicator codes.
        indicator_map = {
            "gdp": "NY.GDP.MKTP.CD",              # GDP (current USD)
            "population": "SP.POP.TOTL",          # Total population
            "life expectancy": "SP.DYN.LE00.IN",  # Life expectancy at birth
            "inflation": "FP.CPI.TOTL.ZG",        # Inflation (consumer prices)
            "military": "MS.MIL.XPND.CD",         # Military expenditure
        }

        indicator = None
        for key, value in indicator_map.items():
            if key in primary_value:
                indicator = value
                break

        if not indicator:
            logger.debug("Scout: No matching World Bank indicator found")
            return None

        try:
            start_year = time_config.get("start_year", 2000)
            end_year = time_config.get("end_year", 2024)

            url = f"{self.WORLDBANK_API}/country/all/indicator/{indicator}"
            params = {
                "format": "json",
                "per_page": 500,
                "date": f"{start_year}:{end_year}"
            }

            response = requests.get(url, params=params, headers=self.HEADERS, timeout=15)
            if response.status_code == 200:
                data = response.json()
                # World Bank responses are [metadata, records].
                if len(data) >= 2 and data[1]:
                    records = data[1]

                    rows = []
                    for record in records:
                        if record.get("value") is not None:
                            rows.append({
                                "name": record["country"]["value"],
                                "year": int(record["date"]),
                                "value": record["value"]
                            })

                    if rows:
                        df = pd.DataFrame(rows)
                        logger.info(f"Scout: Got {len(df)} rows from World Bank API")
                        return df

        except Exception as e:
            logger.warning(f"Scout: World Bank API failed: {e}")

        return None

    def _fetch_scraping(self, plan: Dict[str, Any]) -> Optional[pd.DataFrame]:
        """Last resort: DuckDuckGo search + scrape result pages for tables."""
        queries = []
        for strategy in plan.get("search_strategies", []):
            queries.extend(strategy.get("queries", []))

        if not queries:
            queries = [f"{plan.get('topic', '')} data table"]

        try:
            # Imported lazily: the package is optional for this fallback only.
            from duckduckgo_search import DDGS

            with DDGS() as ddgs:
                for query in queries[:3]:  # cap queries to limit latency
                    results = list(ddgs.text(query, max_results=5))

                    for result in results:
                        url = result.get("href", "")
                        if not url:
                            continue

                        try:
                            response = requests.get(url, headers=self.HEADERS, timeout=10)
                            if response.status_code == 200:
                                tables = pd.read_html(response.text)
                                if tables:
                                    for table in tables:
                                        if self._has_year_column(table):
                                            logger.info(f"Scout: Found table from {url}")
                                            return table

                                    # No year column: accept the largest table
                                    # if it has enough rows to be meaningful.
                                    largest = max(tables, key=lambda t: len(t))
                                    if len(largest) > 5:
                                        return largest

                        except Exception as e:
                            logger.debug(f"Scout: Failed to scrape {url}: {e}")
                            continue

        except ImportError:
            logger.warning("Scout: duckduckgo-search not installed")
        except Exception as e:
            logger.warning(f"Scout: DuckDuckGo search failed: {e}")

        return None

    def _has_year_column(self, df: pd.DataFrame) -> bool:
        """Return True if any column looks like it holds years.

        A column qualifies either by name ("year"/"date"/"time") or by
        containing values in the 1900–2100 range (numeric or 4-digit text).
        """
        for col in df.columns:
            col_str = str(col).lower()
            if any(word in col_str for word in ["year", "date", "time"]):
                return True
            # Check whether a small sample of values looks year-like.
            try:
                sample = df[col].dropna().head(5)
                for val in sample:
                    if isinstance(val, (int, float)):
                        if 1900 <= val <= 2100:
                            return True
                    elif isinstance(val, str):
                        if re.match(r'^(19|20)\d{2}$', str(val)):
                            return True
            except Exception:
                # Odd dtypes (e.g. MultiIndex cells) — just skip the column.
                pass
        return False
modules/bar_race/services/surgeon.py ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Surgeon - Data Cleaner & Formatter
3
+ Cleans raw data and prepares it for bar chart race animation.
4
+ """
5
+ import logging
6
+ import pandas as pd
7
+ import numpy as np
8
+ import re
9
+ import os
10
+ from typing import Dict, Any, Optional, List
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class Surgeon:
    """
    Data Cleaner for Bar Race video generation.

    Responsibilities:
    - Table selection (find year columns)
    - Wide → Long / Long → Wide conversion
    - Regex cleaning ($, €, commas, references)
    - Convert strings to numeric
    - Interpolate missing years
    """

    def __init__(self, temp_dir: str):
        # Scratch directory where the cleaned CSV is written.
        self.temp_dir = temp_dir

    def clean_data(self, df: pd.DataFrame, plan: Dict[str, Any]) -> Optional[pd.DataFrame]:
        """
        Clean and format raw data for bar chart race.

        Expected output format:
            name | year | value
            USA  | 2000 | 10.5
            USA  | 2001 | 11.2
            ...

        Args:
            df: Raw table as fetched by Scout (wide, long, or unknown shape).
            plan: Brain's plan (time_config and visualization.top_n are read).

        Returns:
            Cleaned DataFrame with columns: name, year, value — or None.
        """
        if df is None or df.empty:
            logger.error("Surgeon: No data to clean")
            return None

        logger.info(f"Surgeon: Cleaning data with shape {df.shape}")
        logger.debug(f"Surgeon: Columns: {list(df.columns)}")

        try:
            # Step 1: Identify data structure (wide / long / unknown).
            structure = self._identify_structure(df)
            logger.info(f"Surgeon: Data structure: {structure}")

            # Step 2: Convert to long format.
            if structure == "wide":
                df_long = self._wide_to_long(df, plan)
            elif structure == "long":
                df_long = self._normalize_long(df, plan)
            else:
                df_long = self._attempt_conversion(df, plan)

            if df_long is None or df_long.empty:
                logger.error("Surgeon: Failed to convert data to long format")
                return None

            # Step 3: Strip symbols/references and coerce to numeric.
            df_clean = self._clean_values(df_long)

            # Step 4: Fill gaps so every entity has every year.
            time_config = plan.get("time_config", {})
            df_interpolated = self._interpolate_years(
                df_clean,
                start_year=time_config.get("start_year", 2000),
                end_year=time_config.get("end_year", 2024)
            )

            # Step 5: Keep only the entities that ever reach the top N.
            top_n = plan.get("visualization", {}).get("top_n", 10)
            df_final = self._get_top_entities(df_interpolated, top_n)

            # Persist for the Director stage / debugging.
            output_path = os.path.join(self.temp_dir, "bar_chart_ready.csv")
            df_final.to_csv(output_path, index=False)
            logger.info(f"Surgeon: Saved cleaned data to {output_path}, shape: {df_final.shape}")

            return df_final

        except Exception as e:
            logger.error(f"Surgeon: Cleaning failed: {e}")
            import traceback
            logger.error(traceback.format_exc())
            return None

    def _identify_structure(self, df: pd.DataFrame) -> str:
        """Classify the table as "wide", "long", or "unknown".

        Wide = years appear as column headers; long = a dedicated
        year/date column exists.
        """
        year_columns = []
        for col in df.columns:
            try:
                year = int(str(col))
                if 1900 <= year <= 2100:
                    year_columns.append(col)
            except (ValueError, TypeError):
                # Non-numeric header — not a year column.
                pass

        # More than 3 year headers is strong evidence of wide format.
        if len(year_columns) > 3:
            return "wide"

        for col in df.columns:
            col_lower = str(col).lower()
            if "year" in col_lower or "date" in col_lower:
                return "long"

        return "unknown"

    def _wide_to_long(self, df: pd.DataFrame, plan: Dict[str, Any]) -> Optional[pd.DataFrame]:
        """Melt a wide table (years as columns) into name/year/value rows."""
        year_columns = []
        non_year_columns = []

        for col in df.columns:
            try:
                year = int(str(col))
                if 1900 <= year <= 2100:
                    year_columns.append(col)
                else:
                    non_year_columns.append(col)
            except (ValueError, TypeError):
                non_year_columns.append(col)

        if not year_columns:
            return None

        # The entity-name column is the first non-year column holding text.
        name_col = None
        for col in non_year_columns:
            if df[col].dtype == object:
                name_col = col
                break

        if name_col is None and non_year_columns:
            name_col = non_year_columns[0]

        if name_col is None:
            return None

        df_long = df.melt(
            id_vars=[name_col],
            value_vars=year_columns,
            var_name="year",
            value_name="value"
        )

        df_long.columns = ["name", "year", "value"]

        return df_long

    def _normalize_long(self, df: pd.DataFrame, plan: Dict[str, Any]) -> Optional[pd.DataFrame]:
        """Map an already-long table onto the canonical name/year/value columns."""
        name_col = None
        year_col = None
        value_col = None

        # First pass: match columns by name keywords.
        for col in df.columns:
            col_lower = str(col).lower()

            if name_col is None and any(word in col_lower for word in ["name", "country", "entity", "player"]):
                name_col = col
            elif year_col is None and any(word in col_lower for word in ["year", "date", "time"]):
                year_col = col
            elif value_col is None and any(word in col_lower for word in ["value", "amount", "gdp", "population", "worth"]):
                value_col = col

        # Fallbacks: first string column as name, last numeric as value.
        if name_col is None:
            for col in df.columns:
                if df[col].dtype == object:
                    name_col = col
                    break

        if value_col is None:
            for col in reversed(list(df.columns)):
                if col != year_col and pd.api.types.is_numeric_dtype(df[col]):
                    value_col = col
                    break

        # Explicit None checks: a column literally labelled 0 or "" is a
        # valid match but falsy, which the old all([...]) test rejected.
        if name_col is None or year_col is None or value_col is None:
            logger.warning(f"Surgeon: Could not identify columns. name={name_col}, year={year_col}, value={value_col}")
            return None

        df_long = df[[name_col, year_col, value_col]].copy()
        df_long.columns = ["name", "year", "value"]

        return df_long

    def _attempt_conversion(self, df: pd.DataFrame, plan: Dict[str, Any]) -> Optional[pd.DataFrame]:
        """Best-effort conversion when the structure is not recognized."""
        if len(df.columns) >= 2:
            # Do any of the trailing headers parse as years?
            potential_years = []
            for col in df.columns[1:]:
                try:
                    year = int(str(col))
                    if 1900 <= year <= 2100:
                        potential_years.append(col)
                except (ValueError, TypeError):
                    pass

            if potential_years:
                return self._wide_to_long(df, plan)

        return None

    def _clean_values(self, df: pd.DataFrame) -> pd.DataFrame:
        """Strip symbols/references, apply magnitude words, coerce numerics.

        Drops rows where name, year or value could not be recovered.
        """
        df = df.copy()

        # Clean name column: trim and drop citation markers like [1].
        df["name"] = df["name"].astype(str).str.strip()
        df["name"] = df["name"].str.replace(r'\[.*?\]', '', regex=True)

        df["year"] = pd.to_numeric(df["year"], errors="coerce")

        def clean_value(val):
            """Turn a messy cell ('$1.5 billion[2]') into a float, or NaN."""
            if pd.isna(val):
                return np.nan
            if isinstance(val, (int, float)):
                return float(val)

            val_str = str(val)

            # Remove currency symbols and thousands separators.
            val_str = re.sub(r'[$€£¥₹,]', '', val_str)

            # Remove citation markers like [1], [a].
            val_str = re.sub(r'\[.*?\]', '', val_str)

            # Apply magnitude words as multipliers.
            multiplier = 1
            val_lower = val_str.lower()
            if "trillion" in val_lower:
                multiplier = 1e12
                val_str = re.sub(r'trillion', '', val_str, flags=re.IGNORECASE)
            elif "billion" in val_lower:
                multiplier = 1e9
                val_str = re.sub(r'billion', '', val_str, flags=re.IGNORECASE)
            elif "million" in val_lower:
                multiplier = 1e6
                val_str = re.sub(r'million', '', val_str, flags=re.IGNORECASE)

            # Extract the first numeric token that remains.
            match = re.search(r'[-+]?\d*\.?\d+', val_str)
            if match:
                return float(match.group()) * multiplier

            return np.nan

        df["value"] = df["value"].apply(clean_value)

        df = df.dropna(subset=["name", "year", "value"])

        df["year"] = df["year"].astype(int)

        return df

    def _interpolate_years(self, df: pd.DataFrame, start_year: int, end_year: int) -> pd.DataFrame:
        """Give every entity one row per year in [start_year, end_year].

        Gaps are linearly interpolated; leading/trailing gaps are filled
        with the nearest known value.
        """
        entities = df["name"].unique()
        all_years = list(range(start_year, end_year + 1))

        result_dfs = []

        for entity in entities:
            entity_df = df[df["name"] == entity].copy()

            if entity_df.empty:
                continue

            # Build the complete year range for this entity.
            full_df = pd.DataFrame({"year": all_years})
            full_df["name"] = entity

            merged = full_df.merge(entity_df[["year", "value"]], on="year", how="left")

            merged["value"] = merged["value"].interpolate(method="linear")

            # .ffill()/.bfill() replace fillna(method=...), which is
            # deprecated and removed in recent pandas releases.
            merged["value"] = merged["value"].ffill().bfill()

            result_dfs.append(merged)

        if result_dfs:
            return pd.concat(result_dfs, ignore_index=True)
        return df

    def _get_top_entities(self, df: pd.DataFrame, top_n: int = 10) -> pd.DataFrame:
        """Keep only the top_n entities ranked by their all-time peak value."""
        max_values = df.groupby("name")["value"].max().sort_values(ascending=False)

        top_entities = max_values.head(top_n).index.tolist()

        df_top = df[df["name"].isin(top_entities)]

        logger.info(f"Surgeon: Selected top {len(top_entities)} entities: {top_entities}")

        return df_top
requirements.txt CHANGED
@@ -29,3 +29,10 @@ imageio-ffmpeg>=0.4.9
29
  # Trends Analysis
30
  pytrends
31
  pandas
 
 
 
 
 
 
 
 
29
  # Trends Analysis
30
  pytrends
31
  pandas
32
+
33
+ # Bar Race Module
34
+ bar_chart_race
35
+ beautifulsoup4
36
+ lxml
37
+ duckduckgo-search
38
+ rembg
static/index.html CHANGED
@@ -656,50 +656,22 @@
656
 
657
  <form id="barRaceForm">
658
  <div class="form-group">
659
- <label>Topic *</label>
660
- <select id="barRaceTopic" required>
661
- <option value="gdp_nominal">GDP (Nominal) - Richest Countries</option>
662
- <option value="population">Population - Most Populated Countries</option>
663
- <option value="gdp_per_capita">GDP Per Capita</option>
664
- <option value="social_media_users">Social Media Users</option>
665
- <option value="youtube_subscribers">YouTube Subscribers</option>
666
- <option value="military_spending">Military Expenditure</option>
667
- <option value="olympic_medals">Olympic Gold Medals</option>
668
- <option value="life_expectancy">Life Expectancy</option>
669
- <option value="browser_market_share">Browser Market Share</option>
670
- </select>
671
- </div>
672
-
673
- <div class="form-row" style="display: grid; grid-template-columns: 1fr 1fr; gap: 1rem;">
674
- <div class="form-group">
675
- <label>Start Year</label>
676
- <input type="number" id="barRaceYearStart" value="2000" min="1960" max="2024">
677
- </div>
678
- <div class="form-group">
679
- <label>End Year</label>
680
- <input type="number" id="barRaceYearEnd" value="2024" min="1960" max="2024">
681
- </div>
682
  </div>
683
 
684
- <div class="form-row" style="display: grid; grid-template-columns: 1fr 1fr; gap: 1rem;">
685
- <div class="form-group">
686
- <label>Top N (Bars)</label>
687
- <select id="barRaceTopN">
688
- <option value="5">5</option>
689
- <option value="8">8</option>
690
- <option value="10" selected>10</option>
691
- <option value="15">15</option>
692
- </select>
693
- </div>
694
- <div class="form-group">
695
- <label>Duration (seconds)</label>
696
- <select id="barRaceDuration">
697
- <option value="30">30s</option>
698
- <option value="60" selected>60s</option>
699
- <option value="90">90s</option>
700
- <option value="120">120s</option>
701
- </select>
702
- </div>
703
  </div>
704
 
705
  <button type="submit" class="btn btn-primary" style="width: 100%;">📊 Generate Bar Race Video</button>
@@ -1241,9 +1213,6 @@
1241
  status.innerHTML = '⏳ Starting bar race generation...';
1242
 
1243
  const topic = document.getElementById('barRaceTopic').value;
1244
- const yearStart = parseInt(document.getElementById('barRaceYearStart').value);
1245
- const yearEnd = parseInt(document.getElementById('barRaceYearEnd').value);
1246
- const topN = parseInt(document.getElementById('barRaceTopN').value);
1247
  const duration = parseInt(document.getElementById('barRaceDuration').value);
1248
 
1249
  try {
@@ -1252,18 +1221,14 @@
1252
  headers: { 'Content-Type': 'application/json' },
1253
  body: JSON.stringify({
1254
  topic: topic,
1255
- year_start: yearStart,
1256
- year_end: yearEnd,
1257
- top_n: topN,
1258
- duration_seconds: duration,
1259
- fps: 30
1260
  })
1261
  });
1262
 
1263
  const data = await response.json();
1264
  if (!response.ok) throw new Error(data.detail || 'Failed to start');
1265
 
1266
- status.innerHTML = `⏳ Job started: ${data.job_id}. Generating frames...`;
1267
  pollBarRaceStatus(data.job_id);
1268
 
1269
  } catch (err) {
@@ -1286,7 +1251,8 @@
1286
  status.className = 'status error';
1287
  status.innerHTML = '❌ Failed: ' + (data.error || 'Unknown error');
1288
  } else {
1289
- status.innerHTML = `⏳ ${data.status}... ${data.progress}%`;
 
1290
  setTimeout(poll, 2000);
1291
  }
1292
  } catch (err) {
 
656
 
657
  <form id="barRaceForm">
658
  <div class="form-group">
659
+ <label>Topic / Prompt *</label>
660
+ <input type="text" id="barRaceTopic" placeholder="e.g., Top 10 richest countries by GDP 2000-2024"
661
+ required>
662
+ <small style="color: var(--text-secondary); display: block; margin-top: 0.5rem;">
663
+ Enter any topic - the AI will find data and create the video
664
+ </small>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
665
  </div>
666
 
667
+ <div class="form-group">
668
+ <label>Duration</label>
669
+ <select id="barRaceDuration">
670
+ <option value="30">30 seconds</option>
671
+ <option value="60" selected>60 seconds</option>
672
+ <option value="90">90 seconds</option>
673
+ <option value="120">120 seconds</option>
674
+ </select>
 
 
 
 
 
 
 
 
 
 
 
675
  </div>
676
 
677
  <button type="submit" class="btn btn-primary" style="width: 100%;">📊 Generate Bar Race Video</button>
 
1213
  status.innerHTML = '⏳ Starting bar race generation...';
1214
 
1215
  const topic = document.getElementById('barRaceTopic').value;
 
 
 
1216
  const duration = parseInt(document.getElementById('barRaceDuration').value);
1217
 
1218
  try {
 
1221
  headers: { 'Content-Type': 'application/json' },
1222
  body: JSON.stringify({
1223
  topic: topic,
1224
+ duration_seconds: duration
 
 
 
 
1225
  })
1226
  });
1227
 
1228
  const data = await response.json();
1229
  if (!response.ok) throw new Error(data.detail || 'Failed to start');
1230
 
1231
+ status.innerHTML = `⏳ Job started: ${data.job_id}. Analyzing topic...`;
1232
  pollBarRaceStatus(data.job_id);
1233
 
1234
  } catch (err) {
 
1251
  status.className = 'status error';
1252
  status.innerHTML = '❌ Failed: ' + (data.error || 'Unknown error');
1253
  } else {
1254
+ const step = data.current_step || data.status;
1255
+ status.innerHTML = `⏳ ${step} (${data.progress}%)`;
1256
  setTimeout(poll, 2000);
1257
  }
1258
  } catch (err) {