daemon03 committed on
Commit
edd00ca
·
1 Parent(s): 07d7d9a

content_generator v1.0

Browse files
src/image_generation_functions.py ADDED
@@ -0,0 +1,453 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import time
4
+ import re
5
+ import mimetypes
6
+ from io import BytesIO
7
+ from PIL import Image as PILImage
8
+ import google.generativeai as genai
9
+ from google.cloud import storage
10
+ from google import genai as google_genai
11
+ from google.genai import types
12
+ from tenacity import retry, stop_after_attempt, wait_exponential
13
+ from dotenv import load_dotenv
14
+
15
+ load_dotenv()
16
+
17
+ # ============================================================
18
+ # IMAGE GENERATION CONFIGURATION (FIXED - Two separate keys)
19
+ # ============================================================
20
+
21
# API key for text correction (Gemini 2.5 Flash, via google.generativeai).
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

# API key for image generation (Gemini 2.5 Flash Image, via google.genai).
IMAGE_API_KEY = os.getenv("IMAGE_API_KEY")

# Google Cloud Storage settings; all three must be set for uploads to work.
GCP_CREDENTIALS_JSON = os.getenv("GCP_CREDENTIALS_JSON")
GCP_PROJECT_ID = os.getenv("GCP_PROJECT_ID")
GCP_BUCKET_NAME = os.getenv("GCP_BUCKET_NAME")

# Configure the text-correction client; a missing key is reported, not fatal.
if GEMINI_API_KEY:
    genai.configure(api_key=GEMINI_API_KEY)
else:
    print("⚠️ GEMINI_API_KEY not set - text correction will fail")

# Bind the storage handles up front so the names always exist, even when
# storage is unavailable; GCP_AVAILABLE tells callers whether they are usable.
gcp_client = None
gcp_bucket = None
GCP_AVAILABLE = False

try:
    if GCP_CREDENTIALS_JSON and GCP_PROJECT_ID and GCP_BUCKET_NAME:
        from google.oauth2 import service_account

        # The credentials are supplied as a JSON document in the environment.
        credentials_dict = json.loads(GCP_CREDENTIALS_JSON)
        credentials = service_account.Credentials.from_service_account_info(credentials_dict)
        gcp_client = storage.Client(credentials=credentials, project=GCP_PROJECT_ID)
        gcp_bucket = gcp_client.bucket(GCP_BUCKET_NAME)
        GCP_AVAILABLE = True
        print("βœ“ GCP Storage configured for image uploads")
    else:
        print("⚠️ GCP credentials not fully configured - image upload disabled")
except Exception as e:
    # Any setup failure (bad JSON, bad credentials, ...) disables uploads
    # instead of breaking the import; reset handles that may be half-built.
    gcp_client = None
    gcp_bucket = None
    GCP_AVAILABLE = False
    print(f"⚠️ GCP configuration error: {e}")
55
+
56
+ # ============================================================
57
+ # AUTOCROP FUNCTION (Proper implementation)
58
+ # ============================================================
59
+
60
def autocrop_tight_vertical(image_path, output_path=None):
    """
    Trim near-white bands from the top and bottom of an image.

    The image is converted to RGB and scanned row by row. A row counts as
    content when its mean brightness (average of all R, G and B values in
    the row) is below 250. The image is cropped vertically to the first and
    last content rows plus a 10 px margin; the full width is always kept.

    Args:
        image_path: Path of the image to read.
        output_path: Optional path to save the cropped image to. May be the
            same as ``image_path``.

    Returns:
        The cropped PIL image; the uncropped RGB image if the crop window
        would be empty; or None if anything fails (the error is printed).
    """
    white_threshold = 250  # rows at or above this mean brightness are "white"
    margin = 10            # rows of padding kept around the detected content

    try:
        # Close the file handle as soon as the pixels are loaded: convert()
        # returns an independent in-memory image, and the caller may ask us
        # to overwrite image_path itself.
        with PILImage.open(image_path) as img:
            rgb = img.convert('RGB')

        width, height = rgb.size

        # Packed RGB bytes, 3 per pixel. Summing one row's bytes and comparing
        # against threshold * (3 * width) is the same test as
        # "mean of (r + g + b) / 3 < threshold", done in exact integer
        # arithmetic and without a Python-level call per pixel.
        row_stride = width * 3
        pixels = memoryview(rgb.tobytes())
        limit = white_threshold * row_stride

        def row_has_content(y):
            start = y * row_stride
            return sum(pixels[start:start + row_stride]) < limit

        # First content row from the top (0 when every row is white).
        top_crop = next((y for y in range(height) if row_has_content(y)), 0)

        # One past the last content row from the bottom (height when all white).
        bottom_crop = next(
            (y + 1 for y in range(height - 1, -1, -1) if row_has_content(y)),
            height,
        )

        # Widen the window by the margin, clamped to the image bounds.
        top_crop = max(0, top_crop - margin)
        bottom_crop = min(height, bottom_crop + margin)

        # Defensive: never produce a zero-height crop.
        if bottom_crop <= top_crop:
            print(" ⚠️ Autocrop: No content found, returning original")
            return rgb

        cropped_img = rgb.crop((0, top_crop, width, bottom_crop))

        if output_path:
            cropped_img.save(output_path)

        print(f" βœ“ Autocropped from {height}px to {cropped_img.size[1]}px")
        return cropped_img

    except Exception as e:
        # Best-effort helper: report the problem and signal failure with None.
        print(f"⚠️ Autocrop failed: {e}")
        return None
122
+
123
+ # ============================================================
124
+ # TECHNICAL IMAGE GENERATION (FIXED - NEW API with proper error checking)
125
+ # ============================================================
126
+
127
@retry(
    stop=stop_after_attempt(2),
    wait=wait_exponential(multiplier=1, min=3, max=10)
)
def generate_technical_image(slide_title, slide_content, image_description):
    """
    Generate a technical diagram for a slide with the Gemini image model.

    Streams a response from ``gemini-2.5-flash-image`` and returns the first
    inline image payload found in any part of any chunk.

    Args:
        slide_title: Title of the slide, interpolated into the prompt.
        slide_content: Body text of the slide, interpolated into the prompt.
        image_description: Description of the diagram to draw.

    Returns:
        ``(True, image_bytes)`` on success, otherwise ``(False, message)``.

    NOTE(review): every exception is caught below and turned into a
    ``(False, message)`` result, so the ``@retry`` decorator has nothing to
    react to and presumably never retries - confirm whether that is intended.
    """
    try:
        if not IMAGE_API_KEY:
            return False, "IMAGE_API_KEY not configured"

        # The image model uses its own API key, separate from text correction.
        client = google_genai.Client(api_key=IMAGE_API_KEY)

        prompt_text = f"""
Generate a professional, clean, and visually compelling image for a technical presentation.

**Context:**
This image will be used for a slide titled "{slide_title}" with the following content:
"{slide_content}"

The image should visually represent the concept described below to enhance understanding:
{image_description}

**Critical Requirements:**
- NO explanatory text, paragraphs, or detailed written descriptions overlaid on the image.
- Component labels ARE allowed where necessary for clarity (e.g., "API Server", "Worker Node", "Control Plane").
- Include a brief, centered caption below the image (max 5-7 words, research paper style) summarizing the visual concept.
- Use full canvas space efficiently β€” minimize blank margins, maximize information density.
- Clean, professional, modern aesthetic.
- Use color strategically to convey meaning and hierarchy.
- Suitable for a formal technical presentation slide.
- Prefer abstract/conceptual visualizations over literal images.
- Ensure all text in the diagram is spell-checked and professionally styled.

**Style Guidelines:**
- Pure white background (#FFFFFF) for professional appearance.
- Professional color palette optimized for white backgrounds:
* Primary: Deep navy blue (#1a365d), slate gray (#475569)
* Accent: Teal (#0d9488), ocean blue (#0284c7)
- Minimalist and elegant design with balanced spacing.
- 4:3 aspect ratio (landscape orientation).
"""

        print(f" 🎨 Generating technical image for: {slide_title}...")

        contents = [types.Content(
            role="user",
            parts=[types.Part.from_text(text=prompt_text)]
        )]

        # Ask for image output (text may accompany it) at 4:3.
        generate_content_config = types.GenerateContentConfig(
            response_modalities=["IMAGE", "TEXT"],
            image_config=types.ImageConfig(aspect_ratio="4:3", image_size="1K"),
        )

        for chunk in client.models.generate_content_stream(
            model="gemini-2.5-flash-image",
            contents=contents,
            config=generate_content_config
        ):
            if not chunk.candidates:
                continue

            content = getattr(chunk.candidates[0], 'content', None)
            parts = getattr(content, 'parts', None) or []

            # Because TEXT is also requested, the image is not guaranteed to
            # be the first part of a chunk - check every part.
            for part in parts:
                inline_data = getattr(part, 'inline_data', None)
                if inline_data is not None and inline_data.data:
                    print(f" βœ… Image generated successfully")
                    return True, inline_data.data

        return False, "No image generated from API"

    except Exception as e:
        print(f" ❌ Image generation error: {str(e)}")
        return False, f"Error: {str(e)}"
224
+
225
+ # ============================================================
226
+ # OPERATIONAL IMAGE GENERATION (FIXED - NEW API with proper error checking)
227
+ # ============================================================
228
+
229
@retry(
    stop=stop_after_attempt(2),
    wait=wait_exponential(multiplier=1, min=3, max=10)
)
def generate_operational_image(slide_title, slide_content, image_description):
    """
    Generate a business/operational diagram for a slide with the Gemini image model.

    Streams a response from ``gemini-2.5-flash-image`` and returns the first
    inline image payload found in any part of any chunk.

    Args:
        slide_title: Title of the slide, interpolated into the prompt.
        slide_content: Body text of the slide, interpolated into the prompt.
        image_description: Description of the diagram to draw.

    Returns:
        ``(True, image_bytes)`` on success, otherwise ``(False, message)``.

    NOTE(review): every exception is caught below and turned into a
    ``(False, message)`` result, so the ``@retry`` decorator has nothing to
    react to and presumably never retries - confirm whether that is intended.
    """
    try:
        if not IMAGE_API_KEY:
            return False, "IMAGE_API_KEY not configured"

        # The image model uses its own API key, separate from text correction.
        client = google_genai.Client(api_key=IMAGE_API_KEY)

        prompt_text = f"""
Generate a professional, clean business/operational diagram for a compliance or regulatory presentation.

**Context:**
This image will be used for a slide titled "{slide_title}" with the following business content:
"{slide_content}"

The image should visually represent the operational/business/compliance concept described below:
{image_description}

**Critical Requirements:**
- NO explanatory text, paragraphs, or detailed written descriptions overlaid on the image.
- Component labels and process flow indicators ARE allowed (e.g., "Compliance Check", "Approval", "Risk Mitigation").
- Include a brief, centered caption below the image (max 5-7 words, business report style).
- Use full canvas space efficiently β€” minimize blank margins.
- Clean, professional, corporate aesthetic.
- Use color strategically: consider business standard colors (blue for trust, green for process).
- Suitable for a formal business presentation or compliance report.
- Prefer process flows, matrices, or business diagrams.

**Style Guidelines:**
- Pure white background (#FFFFFF).
- Professional business color palette:
* Primary: Corporate blue (#003366), professional gray (#4a5568)
* Accent: Business green (#2d5016), alert red (#c53030)
- Clean, minimal design with professional spacing.
- 4:3 aspect ratio (landscape for business presentations).
"""

        print(f" πŸ“Š Generating operational image for: {slide_title}...")

        contents = [types.Content(
            role="user",
            parts=[types.Part.from_text(text=prompt_text)]
        )]

        # Ask for image output (text may accompany it) at 4:3.
        generate_content_config = types.GenerateContentConfig(
            response_modalities=["IMAGE", "TEXT"],
            image_config=types.ImageConfig(aspect_ratio="4:3", image_size="1K"),
        )

        for chunk in client.models.generate_content_stream(
            model="gemini-2.5-flash-image",
            contents=contents,
            config=generate_content_config
        ):
            if not chunk.candidates:
                continue

            content = getattr(chunk.candidates[0], 'content', None)
            parts = getattr(content, 'parts', None) or []

            # Because TEXT is also requested, the image is not guaranteed to
            # be the first part of a chunk - check every part.
            for part in parts:
                inline_data = getattr(part, 'inline_data', None)
                if inline_data is not None and inline_data.data:
                    print(f" βœ… Image generated successfully")
                    return True, inline_data.data

        return False, "No image generated from API"

    except Exception as e:
        print(f" ❌ Image generation error: {str(e)}")
        return False, f"Error: {str(e)}"
325
+
326
+ # ============================================================
327
+ # PIPELINE IMAGE REPLACEMENT (FIXED - Complete integration)
328
+ # ============================================================
329
+
330
def process_images_for_pipeline(slide_json, mode="technical"):
    """
    Turn each slide's ``image_description`` into an uploaded image URL.

    For every slide in ``slide_json['content']`` that has a non-empty
    ``image_description`` (and not the literal string ``"null"``):

    1. Generate an image with the generator selected by ``mode``.
    2. Write the bytes to a temporary PNG under ``/tmp/gen_images``.
    3. Autocrop white space in place (best effort).
    4. Upload the file to the configured GCP bucket, if available.
    5. Replace ``image_description`` with the public URL, or with a short
       status/error message when any step fails.

    The temporary file is always removed, including on errors.

    Args:
        slide_json: Dict with a ``content`` list of slide dicts and an
            optional ``topic`` used to name the image files.
        mode: ``"technical"`` selects the technical generator; any other
            value selects the operational generator.

    Returns:
        The same ``slide_json`` object, mutated in place.
    """
    print(f"\n{'='*70}")
    print(f"🎨 STAGE 4: Processing Images ({mode.upper()} Mode)")
    print('='*70)

    # Folder for intermediate images.
    temp_folder = "/tmp/gen_images"
    os.makedirs(temp_folder, exist_ok=True)

    image_generator = generate_technical_image if mode == "technical" else generate_operational_image

    # The filename slug depends only on the topic, so compute it once.
    # A null or non-string topic must not break every slide.
    raw_topic = slide_json.get('topic', 'topic')
    if raw_topic is None:
        raw_topic = 'topic'
    topic_slug = re.sub(r'[^a-zA-Z0-9_-]+', '_', str(raw_topic).strip().lower()).strip('_')
    topic_slug = topic_slug[:15]

    for idx, slide in enumerate(slide_json.get('content', []), 1):
        # Skip slides without an image description (missing, empty or "null").
        if not slide.get('image_description') or slide['image_description'] == "null":
            print(f" ⊘ Slide {idx}: No image description")
            continue

        temp_file_path = None
        try:
            slide_title = slide.get('slide_title', 'Slide')
            slide_content = slide.get('slide_content', '')
            image_desc = slide.get('image_description', '')

            print(f"\n πŸ“ Processing Slide {idx}: {slide_title}")

            # STEP 1: Generate the image.
            print(f" 1️⃣ Generating image...")
            success, result = image_generator(slide_title, slide_content, image_desc)

            if not success:
                print(f" ❌ Generation failed: {result}")
                slide['image_description'] = f"Failed: {result}"
                continue

            image_data = result

            # STEP 2: Save the image temporarily.
            print(f" 2️⃣ Saving to temporary file...")
            ts = int(time.time())
            temp_file_name = f"slide_{idx}_{topic_slug}_{mode}_{ts}.png"
            temp_file_path = os.path.join(temp_folder, temp_file_name)

            with open(temp_file_path, 'wb') as f:
                f.write(image_data)

            print(f" βœ“ Saved: {temp_file_name}")

            # STEP 3: Autocrop in place. The helper reports failure by
            # returning None rather than raising, so check the result before
            # claiming success; the uncropped file is still uploaded.
            print(f" 3️⃣ Autocropping white space...")
            try:
                if autocrop_tight_vertical(temp_file_path, temp_file_path) is None:
                    print(f" ⚠️ Autocrop skipped: image could not be processed")
                else:
                    print(f" βœ“ Autocrop successful")
            except Exception as e:
                print(f" ⚠️ Autocrop skipped: {e}")

            # STEP 4: Upload to GCP.
            print(f" 4️⃣ Uploading to GCP Storage...")
            image_url = None

            if GCP_AVAILABLE:
                try:
                    with open(temp_file_path, 'rb') as f:
                        image_bytes = f.read()

                    gcp_blob_path = f"images/{mode}/{temp_file_name}"
                    blob = gcp_bucket.blob(gcp_blob_path)
                    blob.upload_from_string(image_bytes, content_type="image/png")

                    image_url = blob.public_url
                    print(f" βœ… Uploaded to GCP: {image_url}")

                except Exception as e:
                    # Upload problems are reported but never abort the slide.
                    error_str = str(e).lower()
                    if 'billing' in error_str or 'project_invalid' in error_str:
                        print(f" ⚠️ GCP billing not enabled")
                    else:
                        print(f" ❌ GCP upload error: {str(e)}")
                    image_url = None
            else:
                print(f" ⚠️ GCP not configured - cannot upload")

            # STEP 5: Record the URL, or a status message when there is none.
            if image_url:
                slide['image_description'] = image_url
                print(f" βœ… Slide {idx} complete: Image available at GCP URL")
            else:
                slide['image_description'] = "Image generation succeeded but upload unavailable"
                print(f" ⚠️ Slide {idx}: Image not uploaded to GCP")

        except Exception as e:
            print(f" ❌ Error processing slide {idx}: {str(e)}")
            slide['image_description'] = f"Error: {str(e)}"

        finally:
            # Always remove the temp file, even when a step above failed.
            if temp_file_path is not None:
                try:
                    os.remove(temp_file_path)
                except OSError:
                    # Already gone or not removable; nothing useful to do.
                    pass

    print(f"\nβœ… Image processing complete")
    return slide_json
452
+
453
+ print("βœ“ Image generation functions ready (NEW Gemini 2.5 Flash Image API + proper error checking)")
src/pipelines_functions.py ADDED
@@ -0,0 +1,482 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import re
4
+ from openai import OpenAI
5
+ import google.generativeai as genai
6
+ from tenacity import retry, stop_after_attempt, wait_exponential
7
+ from dotenv import load_dotenv
8
+
9
+ load_dotenv()
10
+
11
+ # ============================================================
12
+ # API INITIALIZATION
13
+ # ============================================================
14
+
15
# Read both API keys from the environment; importing this module fails fast
# with a ValueError when either one is missing or empty.
PERPLEXITY_API_KEY = os.getenv("PERPLEXITY_API_KEY")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

if PERPLEXITY_API_KEY is None or PERPLEXITY_API_KEY == "":
    raise ValueError("❌ PERPLEXITY_API_KEY not set in .env")
if GEMINI_API_KEY is None or GEMINI_API_KEY == "":
    raise ValueError("❌ GEMINI_API_KEY not set in .env")

# Perplexity is reached through the OpenAI client pointed at its base URL.
perplexity_client = OpenAI(
    base_url="https://api.perplexity.ai",
    api_key=PERPLEXITY_API_KEY,
)

# Gemini client configuration for the correction stages.
genai.configure(api_key=GEMINI_API_KEY)
29
+
30
+ # ============================================================
31
+ # TECHNICAL PIPELINE
32
+ # ============================================================
33
+
34
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10)
)
def generate_technical_content(topic):
    """
    Stage 1 (technical): ask Perplexity for a slide deck on ``topic``.

    The reply is parsed as JSON; if that fails, the outermost ``{...}`` span
    of the reply is parsed instead. A missing ``aliases`` key is filled with
    the lowercased, stripped topic.

    Args:
        topic: Subject of the presentation.

    Returns:
        The parsed deck (topic, aliases, content, urls as requested by the prompt).

    Raises:
        ValueError: when no JSON object can be found in the reply. Any other
            error is printed and re-raised, which lets ``@retry`` try again.
    """
    print(f"\nπŸ”„ Generating technical content for: {topic}")

    try:
        system_prompt = f"""You are a domain expert in technology and IT infrastructure with deep knowledge across all technology domains.

Task:
For the topic "{topic}", generate 9 to 10 slides as JSON.

Instructions:
- Write universally applicable content that any technology professional can understand and use.
- Each slide should have an engaging and concise "slide_title" (maximum 6 words).
- "slide_content" must be 3-4 sentences (strictly 40-60 words) with technical depth and practical relevance.
- For the 3 most critical slides ONLY, add "image_description" (strictly 30-40 words) describing specific technical diagrams.
- First slide: Overview explaining why this technology matters universally.
- Last slide: "Further Learning & Documentation" with placeholder for 5 curated URLs.
- Use clear, accessible language. Avoid industry-specific jargon.
- For all other slides, set image_description to null.

Additional Requirement β€” ALIASES FIELD:
- Generate 6-7 lowercase alternative names/synonyms for "{topic}".
- First alias MUST be the normalized lowercase form of the topic.
- Include abbreviations and common variations.

Output ONLY valid JSON (no code blocks, no markdown):
{{
"topic": "{topic}",
"aliases": ["primary lowercase form", "alias2", "alias3", ...],
"content": [
{{
"slide_title": "...",
"slide_content": "...",
"image_description": "..." or null
}}
],
"urls": [
{{"title": "...", "url": "https://..."}},
...
]
}}
"""

        chat_messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"Generate a universally applicable technical presentation on {topic}"}
        ]
        completion = perplexity_client.chat.completions.create(
            model="sonar-pro",
            messages=chat_messages,
            temperature=0.5,
            max_tokens=4000,
            timeout=60,
        )

        raw_reply = completion.choices[0].message.content

        try:
            deck = json.loads(raw_reply)
        except json.JSONDecodeError:
            # The model wrapped the JSON in extra text: parse the widest
            # brace-delimited span instead.
            embedded = re.search(r'\{.*\}', raw_reply, re.DOTALL)
            if not embedded:
                raise ValueError("Could not parse JSON from response")
            deck = json.loads(embedded.group())
            if 'aliases' not in deck:
                deck['aliases'] = [topic.lower().strip()]
            return deck

        # Clean parse: default the aliases and report the slide count.
        if 'aliases' not in deck:
            deck['aliases'] = [topic.lower().strip()]
        print(f"βœ… Generation successful - {len(deck.get('content', []))} slides")
        return deck

    except Exception as e:
        print(f"❌ Generation failed: {type(e).__name__}: {str(e)}")
        raise
115
+
116
@retry(
    stop=stop_after_attempt(2),
    wait=wait_exponential(multiplier=1, min=3, max=10)
)
def correct_technical_content(generated_json):
    """
    Stage 2 (technical): have Gemini 2.5 Flash review and correct the deck.

    Code fences are stripped from the reply before parsing. If the reply is
    not valid JSON, the outermost ``{...}`` span is parsed instead; if there
    is no such span, the input is returned unchanged. A missing ``aliases``
    key is copied over from the input.

    Args:
        generated_json: Deck produced by the generation stage.

    Returns:
        The corrected deck, or ``generated_json`` when nothing parseable
        came back.

    Raises:
        Any error from the model call or parsing is printed and re-raised,
        which lets ``@retry`` try again.
    """
    print(f"\nπŸ”„ Correcting technical content with Gemini 2.5 Flash")

    try:
        editor_model = genai.GenerativeModel("gemini-2.5-flash")

        correction_prompt = f"""You are an expert technical editor for universal technology training materials.

Review the following slide presentation and improve it:

{json.dumps(generated_json, indent=2)}

Your tasks:
1. Ensure slide titles are clear, concise (max 6 words) and engaging.
2. Verify that slide_content is universally applicable.
3. Check that content flows logically, is technically accurate.
4. For image_descriptions: Make them specific, actionable, and suitable for technical diagram generation.
5. Review and enhance URLs - add 2-3 additional high-quality URLs if missing.
6. Keep all word counts natural and readable.

CRITICAL INSTRUCTION:
- The field "aliases" must remain EXACTLY as provided (do not change it).
- Keep "image_description" fields exactly as they are.
- For slides without image_description, set to null.
- Retain the most educationally valuable 3 slides for images β€” set the rest to null.

OUTPUT REQUIREMENT:
Return ONLY the corrected JSON in the exact same schema as the input.
Do not include code fences, markdown, or extra commentary.
"""

        reply = editor_model.generate_content(correction_prompt)
        cleaned = reply.text.strip()

        # Drop one leading and one trailing markdown code fence, if present.
        for fence_pattern in (r'^\s*```(?:json)?\s*\n?', r'\s*```\s*$'):
            cleaned = re.sub(fence_pattern, '', cleaned, count=1)

        try:
            corrected = json.loads(cleaned)
        except json.JSONDecodeError:
            embedded = re.search(r'\{.*\}', cleaned, re.DOTALL)
            if not embedded:
                print(f"⚠️ Correction parsing failed - returning original")
                return generated_json
            corrected = json.loads(embedded.group())
            if 'aliases' not in corrected:
                corrected['aliases'] = generated_json.get('aliases', [])
            return corrected

        # Clean parse: carry the aliases over when the model dropped them.
        if 'aliases' not in corrected:
            corrected['aliases'] = generated_json.get('aliases', [])
        print(f"βœ… Correction successful")
        return corrected

    except Exception as e:
        print(f"❌ Correction failed: {type(e).__name__}: {str(e)}")
        raise
182
+
183
@retry(
    stop=stop_after_attempt(2),
    wait=wait_exponential(multiplier=1, min=3, max=10)
)
def refine_technical_content(validated_json):
    """
    Stage 3 (technical): final refinement of the deck with Perplexity.

    Code fences are stripped from the reply before parsing. If the reply is
    not valid JSON, the outermost ``{...}`` span is parsed instead; if there
    is no such span, the input is returned unchanged. A missing ``aliases``
    key is copied over from the input.

    Args:
        validated_json: Deck produced by the correction stage.

    Returns:
        The refined deck, or ``validated_json`` when nothing parseable
        came back.

    Raises:
        Any error from the API call or parsing is printed and re-raised,
        which lets ``@retry`` try again.
    """
    print(f"\nπŸ”„ Refining technical content")

    try:
        refine_prompt = f"""You are a senior technical content specialist for universal technology training.

This slide presentation has been validated. Perform the final refinement:

{json.dumps(validated_json, indent=2)}

Your tasks:
1. Ensure image_descriptions are detailed, specific, and suitable for technical diagram generation.
2. Verify that slide content is universally applicable and consistent.
3. Confirm that all technical terms are accurate.
4. Review and refine the URLs:
- Select up to 5 of the best URLs only.
- Order them by: Authority, Relevance, Learning value, Diversity.
- Ensure all chosen URLs are authoritative and current.
5. Keep all slide content exactly the same length/style.
6. Maintain perfect JSON structure.

CRITICAL INSTRUCTION:
- The field "aliases" must remain EXACTLY as provided.
- Keep "image_description" fields for image generation.

OUTPUT REQUIREMENT:
Return ONLY the refined JSON in the exact same schema as the input.
"""

        completion = perplexity_client.chat.completions.create(
            model="sonar-pro",
            messages=[{"role": "user", "content": refine_prompt}],
            temperature=0.3,
            max_tokens=4000,
            timeout=60,
        )

        cleaned = completion.choices[0].message.content.strip()

        # Drop one leading and one trailing markdown code fence, if present.
        for fence_pattern in (r'^\s*```(?:json)?\s*\n?', r'\s*```\s*$'):
            cleaned = re.sub(fence_pattern, '', cleaned, count=1)

        try:
            refined = json.loads(cleaned)
        except json.JSONDecodeError:
            embedded = re.search(r'\{.*\}', cleaned, re.DOTALL)
            if not embedded:
                print(f"⚠️ Refinement failed - returning validated content")
                return validated_json
            refined = json.loads(embedded.group())
            if 'aliases' not in refined:
                refined['aliases'] = validated_json.get('aliases', [])
            return refined

        # Clean parse: carry the aliases over when the model dropped them.
        if 'aliases' not in refined:
            refined['aliases'] = validated_json.get('aliases', [])
        print(f"βœ… Refinement successful")
        return refined

    except Exception as e:
        print(f"❌ Refinement failed: {type(e).__name__}: {str(e)}")
        raise
254
+
255
+ # ============================================================
256
+ # OPERATIONAL PIPELINE
257
+ # ============================================================
258
+
259
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10)
)
def generate_operational_content(topic):
    """
    Stage 1 (operational): ask Perplexity for a business-operations deck on ``topic``.

    The reply is parsed as JSON; if that fails, the outermost ``{...}`` span
    of the reply is parsed instead. A missing ``aliases`` key is filled with
    the lowercased, stripped topic.

    Args:
        topic: Subject of the presentation.

    Returns:
        The parsed deck (topic, aliases, content, urls as requested by the prompt).

    Raises:
        ValueError: when no JSON object can be found in the reply. Any other
            error is printed and re-raised, which lets ``@retry`` try again.
    """
    print(f"\nπŸ”„ Generating operational content for: {topic}")

    try:
        system_prompt = f"""You are a domain expert in business operations, compliance, regulatory frameworks, and enterprise management.

Task:
For the topic "{topic}", generate 9 to 10 slides as JSON.

Instructions:
- Target intermediate professionals (2+ years experience) seeking actionable, scenario-driven insights.
- Each slide should have a unique and engaging "slide_title" (maximum 6 words).
- "slide_content" must be 3-4 sentences (strictly 40-60 words), balancing regulatory requirements with operational business value.
- Emphasize both regulatory drivers AND business impact: compliance obligations, operational efficiency, risk mitigation, and competitive advantage.
- For the 3 most important slides ONLY, add "image_description" (strictly 30-40 words) describing meaningful business/operational diagrams.
- First slide: Overview positioning the topic's regulatory importance and business operational impact.
- Last slide: "Further Learning & Documentation" with specific next learning topics.
- Use clear, accessible language without basic dictionary definitions.
- Focus on practical application, regulatory compliance, and business outcomes.
- For all other slides, set image_description to null.

Additional Requirement β€” ALIASES FIELD:
- Generate 4-5 lowercase alternative names/synonyms for "{topic}".
- First alias MUST be the normalized lowercase form of the topic.
- Include abbreviations and terms that refer to the same concept.

Output ONLY valid JSON (no code blocks, no markdown):
{{
"topic": "{topic}",
"aliases": ["primary lowercase form", "alias2", ...],
"content": [
{{
"slide_title": "...",
"slide_content": "...",
"image_description": "..." or null
}}
],
"urls": [
{{"title": "...", "url": "https://..."}},
...
]
}}
"""

        chat_messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"Generate an intermediate-level, practical business operations presentation on: {topic}"}
        ]
        completion = perplexity_client.chat.completions.create(
            model="sonar-pro",
            messages=chat_messages,
            temperature=0.5,
            max_tokens=4000,
            timeout=60,
        )

        raw_reply = completion.choices[0].message.content

        try:
            deck = json.loads(raw_reply)
        except json.JSONDecodeError:
            # The model wrapped the JSON in extra text: parse the widest
            # brace-delimited span instead.
            embedded = re.search(r'\{.*\}', raw_reply, re.DOTALL)
            if not embedded:
                raise ValueError("Could not parse JSON from response")
            deck = json.loads(embedded.group())
            if 'aliases' not in deck:
                deck['aliases'] = [topic.lower().strip()]
            return deck

        # Clean parse: default the aliases and report the slide count.
        if 'aliases' not in deck:
            deck['aliases'] = [topic.lower().strip()]
        print(f"βœ… Generation successful - {len(deck.get('content', []))} slides")
        return deck

    except Exception as e:
        print(f"❌ Generation failed: {type(e).__name__}: {str(e)}")
        raise
342
+
343
@retry(
    stop=stop_after_attempt(2),
    wait=wait_exponential(multiplier=1, min=3, max=10)
)
def correct_operational_content(generated_json):
    """
    Stage 2: Correct operational content with Gemini 2.5 Pro.

    Sends the generated presentation to the "gemini-2.5-pro" model with an
    editing prompt and parses the reply back into a dict. (The original
    docstring notes the prompt was taken from
    operational_gcp_image_pipeline-2.ipynb.)

    Args:
        generated_json: Presentation dict produced by the generation stage.
            It is serialised into the prompt and its "aliases" value is
            reused when the model's reply omits that field.

    Returns:
        The corrected presentation dict, or ``generated_json`` unchanged when
        no JSON object can be parsed out of the model's reply.

    Raises:
        Exception: Any error from the model call is printed and re-raised so
            the ``@retry`` decorator can attempt the call again.
    """
    print("\nπŸ”„ Correcting operational content with Gemini 2.5 PRO")

    try:
        gemini_model = genai.GenerativeModel("gemini-2.5-pro")  # STRONGER MODEL FOR OPERATIONAL

        correction_prompt = f"""You are an expert business operations and compliance editor.

Review this business operations presentation and improve it:

{json.dumps(generated_json, indent=2)}

Your tasks:
1. Ensure slide titles are clear, concise (max 6 words), and business-focused.
2. Verify slide_content balances regulatory requirements WITH business operational value (40–60 words).
3. Strengthen regulatory references: name specific acts, frameworks, or compliance concepts.
4. For image_descriptions: Make them specific to business processes and regulatory workflows.
5. Review and improve the URLs - add 2-3 additional high-quality official URLs.
6. Maintain the intermediate professional tone.
7. Ensure logical flow: regulatory β†’ operational β†’ actionable insights.

CRITICAL INSTRUCTION:
- The field "aliases" must remain EXACTLY as provided.
- Keep "image_description" fields for image generation.
- For slides without image_description, set to null.
- Retain the most important 3 slides for images β€” set the rest to null.

OUTPUT REQUIREMENT:
Return ONLY the corrected JSON in the exact same schema as the input.
"""

        response = gemini_model.generate_content(correction_prompt)
        corrected_text = response.text.strip()

        # Strip a leading/trailing markdown code fence if the model added one.
        corrected_text = re.sub(r'^\s*```(?:json)?\s*\n?', '', corrected_text, count=1)
        corrected_text = re.sub(r'\s*```\s*$', '', corrected_text, count=1)

        # Try the whole reply first, then the outermost {...} span for
        # replies that wrap the JSON in extra prose.
        candidates = [(corrected_text, True)]
        json_match = re.search(r'\{.*\}', corrected_text, re.DOTALL)
        if json_match:
            candidates.append((json_match.group(), False))

        for candidate_text, is_clean_parse in candidates:
            try:
                result = json.loads(candidate_text)
            except json.JSONDecodeError:
                # An unparsable candidate must not abort the stage: fall
                # through to the next candidate / the original content.
                continue
            if not isinstance(result, dict):
                # The schema is a JSON object; anything else is unusable.
                continue
            if 'aliases' not in result:
                result['aliases'] = generated_json.get('aliases', [])
            if is_clean_parse:
                print("βœ… Correction successful")
            return result

        print("⚠️ Correction parsing failed - returning original")
        return generated_json

    except Exception as e:
        print(f"❌ Correction failed: {type(e).__name__}: {str(e)}")
        raise
409
+
410
@retry(
    stop=stop_after_attempt(2),
    wait=wait_exponential(multiplier=1, min=3, max=10)
)
def refine_operational_content(validated_json):
    """
    Stage 3: Final refinement with Perplexity.

    Sends the URL-validated presentation to the "sonar-pro" model with a
    refinement prompt and parses the reply back into a dict. (The original
    docstring notes the prompt was taken from
    operational_gcp_image_pipeline-2.ipynb.)

    Args:
        validated_json: Presentation dict from the validation stage. It is
            serialised into the prompt and its "aliases" value is reused when
            the model's reply omits that field.

    Returns:
        The refined presentation dict, or ``validated_json`` unchanged when
        no JSON object can be parsed out of the model's reply.

    Raises:
        Exception: Any error from the API call is printed and re-raised so
            the ``@retry`` decorator can attempt the call again.
    """
    print("\nπŸ”„ Refining operational content")

    try:
        refine_prompt = f"""You are a senior business operations content specialist.

This business operations presentation has been validated. Perform the final refinement:

{json.dumps(validated_json, indent=2)}

Your tasks:
1. Ensure image descriptions are specific to business workflows, compliance processes, and decision-making.
2. Verify slide content emphasizes actionable business value, regulatory relevance, and measurable outcomes.
3. Confirm terminology is accurate, consistent, and understandable to intermediate business professionals.
4. Review and refine the URLs:
- Select up to 5 of the best URLs only.
- Order by: Authority (regulatory bodies first), Relevance, Learning value, Diversity.
- Ensure all URLs are authoritative, recent, and relevant.
5. Keep all slide content exactly the same.
6. Maintain perfect JSON structure.

CRITICAL INSTRUCTION:
- The field "aliases" must remain EXACTLY as provided.
- Keep "image_description" fields for image generation.

OUTPUT REQUIREMENT:
Return ONLY the refined JSON in the exact same schema as the input.
"""

        response = perplexity_client.chat.completions.create(
            model="sonar-pro",
            messages=[{"role": "user", "content": refine_prompt}],
            temperature=0.3,
            max_tokens=4000,
            timeout=60,
        )

        refined_text = response.choices[0].message.content.strip()

        # Strip a leading/trailing markdown code fence if the model added one.
        refined_text = re.sub(r'^\s*```(?:json)?\s*\n?', '', refined_text, count=1)
        refined_text = re.sub(r'\s*```\s*$', '', refined_text, count=1)

        # Try the whole reply first, then the outermost {...} span for
        # replies that wrap the JSON in extra prose.
        candidates = [(refined_text, True)]
        json_match = re.search(r'\{.*\}', refined_text, re.DOTALL)
        if json_match:
            candidates.append((json_match.group(), False))

        for candidate_text, is_clean_parse in candidates:
            try:
                result = json.loads(candidate_text)
            except json.JSONDecodeError:
                # An unparsable candidate must not abort the stage: fall
                # through to the next candidate / the validated content.
                continue
            if not isinstance(result, dict):
                # The schema is a JSON object; anything else is unusable.
                continue
            if 'aliases' not in result:
                result['aliases'] = validated_json.get('aliases', [])
            if is_clean_parse:
                print("βœ… Refinement successful")
            return result

        print("⚠️ Refinement failed - returning validated content")
        return validated_json

    except Exception as e:
        print(f"❌ Refinement failed: {type(e).__name__}: {str(e)}")
        raise
481
+
482
+ print("βœ“ All pipeline functions loaded (Perplexity + Gemini 2.5 Flash/Pro for text)")
src/streamlit_app.py CHANGED
@@ -1,40 +1,725 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
  import streamlit as st
 
 
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ import json
3
+ import os
4
+ from datetime import datetime
5
+ from dotenv import load_dotenv
6
 
7
+
8
+ # Import custom functions
9
+ from pipelines_functions import (
10
+ generate_technical_content, correct_technical_content, refine_technical_content,
11
+ generate_operational_content, correct_operational_content, refine_operational_content
12
+ )
13
+ from utils_functions import (
14
+ validate_and_sanitize_topic, check_cache, save_to_cache, validate_and_select_urls,
15
+ get_collections, PipelineMetrics
16
+ )
17
+ from image_generation_functions import process_images_for_pipeline
18
+
19
+
20
+ load_dotenv()
21
+
22
+
23
+ # ============================================================
24
+ # PAGE CONFIGURATION
25
+ # ============================================================
26
+
27
+
28
+ st.set_page_config(
29
+ page_title="LearnOnTheGo",
30
+ page_icon="πŸŽ“",
31
+ layout="wide",
32
+ initial_sidebar_state="collapsed"
33
+ )
34
+
35
+
36
+ # ============================================================
37
+ # HELPER FUNCTIONS
38
+ # ============================================================
39
+
40
+
41
def sanitize_for_html(raw_text: str) -> str:
    """Escape HTML special characters for safe embedding.

    Non-string values are converted with ``str()`` first. In addition to
    ``&``, ``<`` and ``>``, both quote characters are escaped so the result
    is also safe inside a quoted HTML attribute value.

    Args:
        raw_text: Text (or any object) to embed in HTML.

    Returns:
        The escaped text.
    """
    # Local import keeps this helper self-contained.
    import html

    if not isinstance(raw_text, str):
        raw_text = str(raw_text)
    return html.escape(raw_text, quote=True)
46
+
47
+
48
def detect_code_content(text: str) -> bool:
    """Detect if content looks like code (has HTML tags, brackets, etc).

    Markup fragments and braces are matched as plain substrings. The keywords
    ``function``, ``import`` and ``def`` only count when they appear in
    code-like syntax, so ordinary prose such as "important" or "functional"
    is not mistaken for code.

    Args:
        text: Slide content to inspect. Non-string or empty values are
            treated as "not code".

    Returns:
        True when the text looks like code, otherwise False.
    """
    # Local import keeps this helper self-contained.
    import re

    if not isinstance(text, str) or not text:
        return False

    markup_indicators = ('<div', '<html', 'class=', 'style=', '{', '}')
    if any(indicator in text for indicator in markup_indicators):
        return True

    keyword_syntax = re.compile(
        r'\bfunction\b\s*\w*\s*\('                # JS function declaration/expression
        r'|^\s*(?:from\s+\S+\s+)?import\s+\S'     # import statement starting a line
        r'|\bdef\s+\w+\s*\(',                     # Python function definition
        re.MULTILINE,
    )
    return keyword_syntax.search(text) is not None
52
+
53
+
54
+ # ============================================================
55
+ # CUSTOM CSS - FINAL VERSION WITH SEPARATED PROGRESS BAR
56
+ # ============================================================
57
+
58
+
59
+ st.markdown("""
60
+ <style>
61
+ /* Root color palette */
62
+ :root {
63
+ --primary-blue: #2563eb;
64
+ --accent-teal: #0891b2;
65
+ --light-blue: #eff6ff;
66
+ --light-teal: #e0f2fe;
67
+ --text-dark: #1e293b;
68
+ --text-light: #64748b;
69
+ --border-color: #bae6fd;
70
+ --shadow: 0 8px 24px rgba(37, 99, 235, 0.12);
71
+ --shadow-lg: 0 20px 50px rgba(37, 99, 235, 0.25);
72
+ }
73
+
74
+ /* Overall app styling */
75
+ .stApp {
76
+ background: linear-gradient(135deg, #dbeafe 0%, #cffafe 100%);
77
+ }
78
+
79
+ /* Hide default Streamlit elements */
80
+ #MainMenu {visibility: hidden;}
81
+ footer {visibility: hidden;}
82
+ header {visibility: hidden;}
83
+
84
+ /* Main container */
85
+ .block-container {
86
+ max-width: 1400px;
87
+ padding-top: 2rem;
88
+ padding-bottom: 2rem;
89
+ }
90
+
91
+ /* Header styling */
92
+ .header-container {
93
+ background: linear-gradient(110deg, var(--primary-blue) 0%, var(--accent-teal) 100%);
94
+ padding: 60px 30px;
95
+ border-radius: 28px;
96
+ text-align: center;
97
+ margin-bottom: 40px;
98
+ box-shadow: var(--shadow-lg);
99
+ }
100
+
101
+ .header-container h1 {
102
+ color: white;
103
+ font-size: 62px;
104
+ margin: 0;
105
+ font-weight: 900;
106
+ letter-spacing: 3px;
107
+ text-shadow: 0 4px 12px rgba(0, 0, 0, 0.2);
108
+ }
109
+
110
+ /* Search box styling */
111
+ .search-container {
112
+ background: linear-gradient(135deg, #ffffff 0%, #f8fafc 100%);
113
+ padding: 35px;
114
+ border-radius: 24px;
115
+ margin-bottom: 30px;
116
+ border: 3px solid var(--border-color);
117
+ box-shadow: var(--shadow);
118
+ }
119
+
120
+ /* Text input styling */
121
+ .stTextInput > div > div > input {
122
+ border: 3px solid var(--accent-teal) !important;
123
+ border-radius: 14px !important;
124
+ padding: 16px 20px !important;
125
+ font-size: 17px !important;
126
+ background-color: white !important;
127
+ transition: all 0.3s ease !important;
128
+ }
129
+
130
+ .stTextInput > div > div > input:focus {
131
+ border-color: var(--primary-blue) !important;
132
+ box-shadow: 0 0 0 4px rgba(37, 99, 235, 0.15) !important;
133
+ outline: none !important;
134
+ }
135
+
136
+ .stTextInput > div > div > input::placeholder {
137
+ color: rgba(100, 116, 139, 0.6) !important;
138
+ font-weight: 500 !important;
139
+ }
140
+
141
+ /* Radio container */
142
+ .stRadio > div[role="radiogroup"] {
143
+ display: flex !important;
144
+ gap: 0 !important;
145
+ background: #e0e7ff !important;
146
+ border-radius: 14px !important;
147
+ padding: 4px !important;
148
+ border: 3px solid var(--border-color) !important;
149
+ width: fit-content !important;
150
+ margin: 0 auto !important;
151
+ }
152
+
153
+ /* Individual radio labels */
154
+ .stRadio > div[role="radiogroup"] > label {
155
+ background: transparent !important;
156
+ border: none !important;
157
+ padding: 12px 32px !important;
158
+ border-radius: 10px !important;
159
+ cursor: pointer !important;
160
+ transition: all 0.3s ease !important;
161
+ font-weight: 700 !important;
162
+ color: var(--text-light) !important;
163
+ font-size: 15px !important;
164
+ text-align: center !important;
165
+ min-width: 140px !important;
166
+ margin: 0 !important;
167
+ flex: 1 !important;
168
+ }
169
+
170
+ .stRadio > div[role="radiogroup"] > label:hover {
171
+ background: rgba(255, 255, 255, 0.5) !important;
172
+ }
173
+
174
+ .stRadio > div[role="radiogroup"] > label[data-checked="true"],
175
+ .stRadio > div[role="radiogroup"] > label:has(input:checked),
176
+ .stRadio > div[role="radiogroup"] > label[aria-checked="true"] {
177
+ background: linear-gradient(110deg, var(--primary-blue) 0%, var(--accent-teal) 100%) !important;
178
+ color: white !important;
179
+ box-shadow: 0 4px 12px rgba(37, 99, 235, 0.35) !important;
180
+ }
181
+
182
+ .stRadio input[type="radio"] {
183
+ display: none !important;
184
+ }
185
+
186
+ /* Button styling */
187
+ .stButton > button {
188
+ background: linear-gradient(100deg, var(--primary-blue) 0%, var(--accent-teal) 100%) !important;
189
+ color: white !important;
190
+ font-weight: 800 !important;
191
+ padding: 18px 40px !important;
192
+ border-radius: 14px !important;
193
+ border: none !important;
194
+ transition: all 0.3s ease !important;
195
+ box-shadow: 0 6px 20px rgba(37, 99, 235, 0.3) !important;
196
+ font-size: 17px !important;
197
+ letter-spacing: 0.5px !important;
198
+ }
199
+
200
+ .stButton > button:hover {
201
+ background: linear-gradient(100deg, #1d4ed8 0%, #0e7490 100%) !important;
202
+ transform: translateY(-3px) !important;
203
+ box-shadow: 0 10px 30px rgba(37, 99, 235, 0.4) !important;
204
+ }
205
+
206
+ /* SLIDE BOX - WITHOUT PROGRESS BAR */
207
+ .slide-box-wrapper {
208
+ background: white;
209
+ border-radius: 28px;
210
+ border: 3px solid var(--border-color);
211
+ box-shadow: 0 20px 60px rgba(37, 99, 235, 0.18);
212
+ padding: 50px 45px;
213
+ margin: 40px auto;
214
+ max-width: 1000px;
215
+ animation: slideIn 0.5s ease-out;
216
+ }
217
+
218
+ @keyframes slideIn {
219
+ from {
220
+ opacity: 0;
221
+ transform: translateY(30px);
222
+ }
223
+ to {
224
+ opacity: 1;
225
+ transform: translateY(0);
226
+ }
227
+ }
228
+
229
+ /* Slide content */
230
+ .slide-box-wrapper > * {
231
+ display: block !important;
232
+ width: 100% !important;
233
+ box-sizing: border-box !important;
234
+ }
235
+
236
+ /* Slide title */
237
+ .slide-box-wrapper h2 {
238
+ font-size: 42px;
239
+ font-weight: 900;
240
+ color: var(--primary-blue);
241
+ margin: 0 0 20px 0 !important;
242
+ letter-spacing: 1px;
243
+ line-height: 1.3;
244
+ text-shadow: 0 2px 8px rgba(37, 99, 235, 0.15);
245
+ word-wrap: break-word;
246
+ overflow-wrap: break-word;
247
+ text-align: center;
248
+ }
249
+
250
+ /* Slide text and paragraphs */
251
+ .slide-box-wrapper p {
252
+ font-size: 20px;
253
+ color: var(--text-dark);
254
+ line-height: 2.2;
255
+ margin: 0 0 24px 0 !important;
256
+ font-weight: 500;
257
+ text-align: left;
258
+ padding: 0 20px;
259
+ box-sizing: border-box;
260
+ }
261
+
262
+ /* Code block styling */
263
+ .slide-box-wrapper pre {
264
+ background: #f8fafc;
265
+ border-radius: 12px;
266
+ padding: 20px;
267
+ font-family: 'Consolas', 'Monaco', 'Courier New', monospace;
268
+ font-size: 14px;
269
+ color: #0f172a;
270
+ overflow-x: auto;
271
+ border: 2px solid #e0e7ff;
272
+ margin: 16px 20px !important;
273
+ text-align: left;
274
+ line-height: 1.6;
275
+ white-space: pre-wrap;
276
+ word-wrap: break-word;
277
+ }
278
+
279
+ /* Images inside slide box */
280
+ .slide-box-wrapper img {
281
+ max-width: 90% !important;
282
+ height: auto !important;
283
+ border-radius: 20px !important;
284
+ box-shadow: 0 8px 24px rgba(37, 99, 235, 0.2) !important;
285
+ display: block !important;
286
+ margin: 24px auto !important;
287
+ }
288
+
289
+ /* Learning Resources */
290
+ .resources-section {
291
+ margin-top: 32px;
292
+ padding-top: 28px;
293
+ border-top: 3px solid var(--border-color);
294
+ text-align: left;
295
+ }
296
+
297
+ .resources-section h4 {
298
+ color: var(--primary-blue);
299
+ font-size: 22px;
300
+ margin: 0 0 18px 0 !important;
301
+ font-weight: 800;
302
+ text-align: center;
303
+ }
304
+
305
+ .resources-section a {
306
+ color: var(--accent-teal);
307
+ text-decoration: none;
308
+ font-weight: 600;
309
+ transition: all 0.3s ease;
310
+ display: block;
311
+ padding: 10px 15px;
312
+ font-size: 16px;
313
+ border-radius: 8px;
314
+ margin-bottom: 8px;
315
+ }
316
+
317
+ .resources-section a:hover {
318
+ color: white;
319
+ background: var(--primary-blue);
320
+ padding-left: 20px;
321
+ box-shadow: 0 4px 12px rgba(37, 99, 235, 0.2);
322
+ }
323
+
324
+ /* PROGRESS CONTAINER - MOVED OUTSIDE BOX */
325
+ .progress-container {
326
+ display: flex;
327
+ align-items: center;
328
+ justify-content: center;
329
+ gap: 20px;
330
+ margin: 30px auto;
331
+ padding: 20px 0;
332
+ max-width: 1000px;
333
+ width: 100%;
334
+ }
335
+
336
+ .progress-bar {
337
+ flex: 1;
338
+ max-width: 700px;
339
+ height: 10px;
340
+ background: #e0e7ff;
341
+ border-radius: 12px;
342
+ overflow: hidden;
343
+ box-shadow: inset 0 2px 4px rgba(37, 99, 235, 0.1);
344
+ }
345
+
346
+ .progress-fill {
347
+ height: 100%;
348
+ background: linear-gradient(90deg, var(--primary-blue) 0%, var(--accent-teal) 100%);
349
+ transition: width 0.4s ease;
350
+ box-shadow: 0 0 10px rgba(37, 99, 235, 0.4);
351
+ }
352
+
353
+ /* Slide counter badge */
354
+ .slide-counter-badge {
355
+ background: linear-gradient(110deg, var(--primary-blue) 0%, var(--accent-teal) 100%);
356
+ color: white;
357
+ padding: 10px 20px;
358
+ border-radius: 24px;
359
+ font-size: 16px;
360
+ font-weight: 800;
361
+ min-width: 90px;
362
+ text-align: center;
363
+ box-shadow: 0 4px 12px rgba(37, 99, 235, 0.3);
364
+ white-space: nowrap;
365
+ }
366
+
367
+ /* Footer */
368
+ .footer-bar {
369
+ margin-top: 60px;
370
+ text-align: center;
371
+ color: var(--text-dark);
372
+ font-size: 16px;
373
+ letter-spacing: 0.5px;
374
+ padding: 32px 0;
375
+ border-top: 3px solid var(--border-color);
376
+ background: white;
377
+ border-radius: 20px;
378
+ box-shadow: 0 4px 16px rgba(37, 99, 235, 0.08);
379
+ }
380
+
381
+ .footer-bar p {
382
+ margin: 10px 0;
383
+ font-weight: 600;
384
+ }
385
+
386
+ .footer-bar p:last-child {
387
+ font-size: 14px;
388
+ color: var(--text-light);
389
+ font-weight: 500;
390
+ }
391
+
392
+ /* Alert messages */
393
+ .stSuccess {
394
+ border-radius: 14px !important;
395
+ border-left: 5px solid #10b981 !important;
396
+ background-color: #ecfdf5 !important;
397
+ padding: 16px !important;
398
+ font-weight: 600 !important;
399
+ }
400
+
401
+ .stInfo {
402
+ border-radius: 14px !important;
403
+ border-left: 5px solid var(--primary-blue) !important;
404
+ background-color: #f0f9ff !important;
405
+ padding: 16px !important;
406
+ font-weight: 600 !important;
407
+ }
408
+
409
+ .stError {
410
+ border-radius: 14px !important;
411
+ border-left: 5px solid #ef4444 !important;
412
+ background-color: #fef2f2 !important;
413
+ padding: 16px !important;
414
+ font-weight: 600 !important;
415
+ }
416
+
417
+ /* Mobile responsive */
418
+ @media (max-width: 768px) {
419
+ .header-container h1 {
420
+ font-size: 40px;
421
+ }
422
+
423
+ .slide-box-wrapper {
424
+ padding: 30px 20px;
425
+ }
426
+
427
+ .slide-box-wrapper h2 {
428
+ font-size: 30px;
429
+ }
430
+
431
+ .slide-box-wrapper p {
432
+ font-size: 17px;
433
+ line-height: 1.9;
434
+ padding: 0 10px;
435
+ }
436
+
437
+ .progress-container {
438
+ flex-direction: row;
439
+ gap: 15px;
440
+ margin: 20px auto;
441
+ padding: 15px 0;
442
+ }
443
+
444
+ .progress-bar {
445
+ width: 100%;
446
+ max-width: none;
447
+ }
448
+ }
449
+ </style>
450
+ """, unsafe_allow_html=True)
451
+
452
+
453
+ # ============================================================
454
+ # SESSION STATE INITIALIZATION
455
+ # ============================================================
456
+
457
+
458
# Seed each session key once; values already present survive Streamlit reruns.
_session_defaults = {
    "current_slide": 0,
    "slides_data": None,
    "search_query": "",
    "mode": "technical",
    "is_loading": False,
    "error_message": None,
    "metrics": None,
}
for _state_key, _state_default in _session_defaults.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default
472
+
473
+
474
+ # ============================================================
475
+ # PIPELINE FUNCTION
476
+ # ============================================================
477
+
478
+
479
def run_pipeline(query, mode):
    """Execute the 5-stage pipeline with metrics tracking.

    Checks the cache first; on a miss runs Generate -> Correct ->
    Validate URLs -> Refine -> Generate Images, stores the result in the
    cache and in ``st.session_state.slides_data``.

    Args:
        query: Topic text entered by the user; it is passed through
            ``validate_and_sanitize_topic`` before use.
        mode: "technical" selects the technical stage functions; any other
            value selects the operational ones.

    Returns:
        A ``(success, message)`` tuple. On failure the error text is also
        stored in ``st.session_state.error_message``.
    """
    try:
        metrics = PipelineMetrics(query, mode)
        query = validate_and_sanitize_topic(query)

        technical_col, operational_col, _db = get_collections()
        collection = operational_col if mode == "operational" else technical_col

        # Cache check
        metrics.start_stage("Cache Check")
        cached_content, is_cached = check_cache(query, collection)
        if is_cached:
            metrics.set_cache_hit("mongodb")
        metrics.end_stage("Cache Check")

        if is_cached:
            st.session_state.slides_data = cached_content
            st.session_state.current_slide = 0
            metrics.end()
            metrics.save_metrics()
            return True, "βœ… Retrieved from cache (instant!)"

        # Both modes run the same stage sequence; only the stage callables
        # and the image mode differ.
        if mode == "technical":
            generate, correct, refine, image_mode = (
                generate_technical_content,
                correct_technical_content,
                refine_technical_content,
                "technical",
            )
        else:
            generate, correct, refine, image_mode = (
                generate_operational_content,
                correct_operational_content,
                refine_operational_content,
                "operational",
            )

        st.session_state.is_loading = True

        with st.spinner(f"πŸ”„ Generating {mode} content with images (5 stages)..."):
            metrics.start_stage("Generate")
            generated = generate(query)
            metrics.end_stage("Generate", f"{len(generated.get('content', []))} slides")

            metrics.start_stage("Correct")
            corrected = correct(generated)
            metrics.end_stage("Correct", "Content improved")

            metrics.start_stage("Validate URLs")
            validated, _ = validate_and_select_urls(corrected)
            metrics.end_stage("Validate URLs", f"{len(validated.get('urls', []))} URLs validated")

            metrics.start_stage("Refine")
            refined = refine(validated)
            metrics.end_stage("Refine", "Content refined")

            metrics.start_stage("Generate Images")
            final_result = process_images_for_pipeline(refined, mode=image_mode)
            metrics.end_stage("Generate Images", "Images generated")

        save_to_cache(query, final_result, collection)
        st.session_state.slides_data = final_result
        st.session_state.current_slide = 0

        pipeline_metrics = metrics.end()
        metrics.save_metrics()
        st.session_state.metrics = pipeline_metrics

        total_time = pipeline_metrics.get('total_duration_seconds', 0)
        return True, f"βœ… Generated {len(final_result.get('content', []))} slides in {total_time:.1f}s!"

    except Exception as e:
        st.session_state.error_message = str(e)
        return False, f"❌ Error: {str(e)}"

    finally:
        # Always clear the loading flag, whichever path returned.
        st.session_state.is_loading = False
562
+
563
+
564
+ # ============================================================
565
+ # DISPLAY SLIDE FUNCTION - PROGRESS BAR OUTSIDE BOX
566
+ # ============================================================
567
+
568
+
569
def display_slide(slide_index):
    """Display current slide with progress bar OUTSIDE the white box.

    Renders the slide at ``slide_index`` from ``st.session_state.slides_data``
    as an HTML box (title, content, optional image, and the learning
    resources on the last slide), then the progress bar and the
    previous/next buttons. Does nothing when there is no slide data or the
    index is past the end.

    Args:
        slide_index: Zero-based index of the slide to show.
    """
    # Local import keeps the attribute escaping self-contained.
    import html

    if not st.session_state.slides_data:
        return

    slides = st.session_state.slides_data.get('content', [])
    if not slides or slide_index >= len(slides):
        return

    slide = slides[slide_index]
    total_slides = len(slides)
    progress_percent = ((slide_index + 1) / total_slides) * 100

    # Build the slide box HTML (WITHOUT progress bar)
    title = sanitize_for_html(slide.get("slide_title", ""))
    # A JSON null for slide_content arrives as None; treat it as empty text.
    raw_content = slide.get("slide_content") or ""

    # Determine content type
    if detect_code_content(raw_content):
        sanitized_content = f"<pre>{sanitize_for_html(raw_content)}</pre>"
    else:
        sanitized_content = f"<p>{sanitize_for_html(raw_content)}</p>"

    # Start building the slide HTML (NO progress bar inside)
    slide_html = f"""
<div class="slide-box-wrapper">
<h2>{title}</h2>
{sanitized_content}
"""

    # Add image if available
    img_url = slide.get('image_description')
    if isinstance(img_url, str) and img_url.startswith('http'):
        # The URL comes from pipeline output; escape it so it cannot break
        # out of the quoted src attribute.
        safe_img_url = html.escape(img_url, quote=True)
        slide_html += f'<img src="{safe_img_url}" alt="Slide image" style="max-width: 90%; height: auto; display: block; margin: 24px auto; border-radius: 20px; box-shadow: 0 8px 24px rgba(37, 99, 235, 0.2);">'

    # Add learning resources (last slide only)
    if slide_index == total_slides - 1:
        urls = st.session_state.slides_data.get('urls', [])
        if urls:
            slide_html += '<div class="resources-section"><h4>πŸ“š Learning Resources</h4>'
            for i, url_obj in enumerate(urls, 1):
                url_title = sanitize_for_html(url_obj.get('title', 'Documentation'))
                url = url_obj.get('url', '#')
                # Only link to web URLs (blocks e.g. javascript: links) and
                # escape the value for the quoted href attribute.
                if isinstance(url, str) and url.startswith(('http://', 'https://')):
                    safe_url = html.escape(url, quote=True)
                else:
                    safe_url = '#'
                slide_html += f'<a href="{safe_url}" target="_blank" rel="noopener noreferrer">{i}. {url_title}</a>'
            slide_html += '</div>'

    # Close the slide box (NO progress bar)
    slide_html += '</div>'

    # Render the slide box
    st.markdown(slide_html, unsafe_allow_html=True)

    # RENDER PROGRESS BAR OUTSIDE THE BOX
    progress_html = f"""
<div class="progress-container">
<div class="progress-bar">
<div class="progress-fill" style="width: {progress_percent}%"></div>
</div>
<div class="slide-counter-badge">{slide_index + 1} / {total_slides}</div>
</div>
"""
    st.markdown(progress_html, unsafe_allow_html=True)

    # Navigation buttons below
    st.markdown('<br>', unsafe_allow_html=True)
    col_left, col_center, col_right = st.columns([1, 8, 1])

    with col_left:
        if slide_index > 0:
            if st.button("β¬…", key="prev_btn", help="Previous slide", use_container_width=True):
                st.session_state.current_slide -= 1
                st.rerun()

    with col_right:
        if slide_index < total_slides - 1:
            if st.button("➑", key="next_btn", help="Next slide", use_container_width=True):
                st.session_state.current_slide += 1
                st.rerun()
647
+
648
+
649
+ # ============================================================
650
+ # PAGE LAYOUT
651
+ # ============================================================
652
+
653
+
654
# Header
st.markdown(
    '<div class="header-container"><h1>πŸŽ“ LearnOnTheGo</h1></div>',
    unsafe_allow_html=True
)

# Search container
st.markdown('<div class="search-container">', unsafe_allow_html=True)

col1, col2 = st.columns([3, 1])

with col1:
    search_query = st.text_input(
        "Search",
        value=st.session_state.search_query,
        placeholder="e.g., Python, Machine Learning, Cloud Computing...",
        key="search_input",
        label_visibility="collapsed"
    )
    st.session_state.search_query = search_query

with col2:
    mode = st.radio(
        "Mode",
        options=["Technical", "Operational"],
        index=0 if st.session_state.mode == "technical" else 1,
        key="mode_radio",
        horizontal=True,
        label_visibility="collapsed"
    )
    # The pipeline compares against the lowercase mode names.
    st.session_state.mode = mode.lower()

st.markdown('</div>', unsafe_allow_html=True)

# Generate button
col1, col2, col3 = st.columns([1, 2, 1])
with col2:
    search_button = st.button("πŸ” Generate Slides", key="search_btn", use_container_width=True)

# Error handling: show an error left over from an earlier run, once.
if st.session_state.error_message:
    st.error(st.session_state.error_message)
    st.session_state.error_message = None

# Execute pipeline
if search_button and st.session_state.search_query:
    success, message = run_pipeline(st.session_state.search_query, st.session_state.mode)
    if success:
        st.success(message)
    else:
        st.error(message)
        # run_pipeline also stores the error text in session state; it has
        # just been shown, so clear it to avoid a repeat on the next rerun.
        st.session_state.error_message = None

# Display slides
if st.session_state.slides_data:
    st.markdown("---")
    # Guard against an index left over from a longer, earlier deck.
    if st.session_state.current_slide >= len(st.session_state.slides_data.get('content', [])):
        st.session_state.current_slide = 0
    display_slide(st.session_state.current_slide)
else:
    st.info("πŸ‘† Enter a topic and click 'Generate Slides' to get started!")

# Footer
st.markdown(
    """<div class="footer-bar">
<p><strong>LearnOnTheGo</strong> β€’ Powered by AI β€’ Built with Streamlit</p>
<p>5-Stage Pipeline: Generate β†’ Correct β†’ Validate β†’ Refine β†’ Generate Images</p>
<p>Gemini 2.5 Flash (text) β€’ Gemini 2.5 Flash Image (images) β€’ Perplexity Sonar Pro</p>
</div>""",
    unsafe_allow_html=True
)

print("βœ… LearnOnTheGo - Progress bar moved outside box - Fixed!")
src/utils_functions.py ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import pickle
4
+ import hashlib
5
+ import httpx
6
+ from datetime import datetime, timezone
7
+ from pymongo import MongoClient
8
+ from tenacity import retry, stop_after_attempt, wait_exponential
9
+ from dotenv import load_dotenv
10
+
11
+ load_dotenv()
12
+
13
+ # ============================================================
14
+ # PIPELINE METRICS CLASS (Complete tracking system)
15
+ # ============================================================
16
+
17
class PipelineMetrics:
    """
    Track timing and outcome data for one pipeline run.

    Stage durations, cache hits and errors are collected in memory;
    ``end()`` assembles the summary document and ``save_metrics()``
    writes it (plus one document per stage) to MongoDB.
    """

    def __init__(self, topic, mode):
        """Initialize the tracker.

        Args:
            topic: Topic the pipeline run is processing.
            mode: Pipeline mode. ``"technical"`` selects the technical
                collections in ``save_metrics``; any other value selects
                the operational ones.
        """
        self.topic = topic
        self.mode = mode
        self.start_time = datetime.now(timezone.utc)
        # Same "<mode>_<epoch seconds>" format as before, derived from the
        # recorded start time so both values describe the same instant.
        self.run_id = f"{mode}_{int(self.start_time.timestamp())}"
        self.stages = {}
        self.current_stage = None
        self.current_stage_start = None
        self.cache_hit = False
        self.cache_type = None
        self.error_occurred = False
        self.error_message = None
        # Populated by end(); defined up front so save_metrics() can tell
        # "not finalized yet" apart from a real failure.
        self.metrics = None

    def start_stage(self, stage_name):
        """Mark ``stage_name`` as the current stage and record its start time."""
        self.current_stage = stage_name
        self.current_stage_start = datetime.now(timezone.utc)
        print(f" 📊 [METRICS] Starting: {stage_name}")

    def end_stage(self, stage_name, output_summary=None):
        """Record the duration of a stage.

        The duration is measured from the most recent ``start_stage`` call,
        whatever name was passed to it. If no stage was ever started,
        nothing is recorded.

        Args:
            stage_name: Key under which the stage is stored in ``self.stages``.
            output_summary: Optional summary stored alongside the timing.
        """
        if not self.current_stage_start:
            return

        finished_at = datetime.now(timezone.utc)
        duration = (finished_at - self.current_stage_start).total_seconds()
        self.stages[stage_name] = {
            "duration_seconds": duration,
            "timestamp": finished_at,
            "output_summary": output_summary,
        }
        print(f" ✓ Stage '{stage_name}' completed in {duration:.2f}s")

    def set_cache_hit(self, cache_type="mongodb"):
        """Record that the result was served from a cache of type ``cache_type``."""
        self.cache_hit = True
        self.cache_type = cache_type
        print(f" 💾 Cache hit: {cache_type}")

    def set_error(self, error_message):
        """Record that the run failed with ``error_message``."""
        self.error_occurred = True
        self.error_message = error_message
        print(f" ❌ Error: {error_message}")

    def end(self):
        """Finalize the run and build the summary document.

        Returns:
            dict: The summary, also stored on ``self.metrics``.
        """
        completed_at = datetime.now(timezone.utc)
        total_duration = (completed_at - self.start_time).total_seconds()
        self.metrics = {
            "run_id": self.run_id,
            "topic": self.topic,
            "mode": self.mode,
            "started_at": self.start_time,
            "completed_at": completed_at,
            "total_duration_seconds": total_duration,
            "stages": self.stages,
            "cache_hit": self.cache_hit,
            "cache_type": self.cache_type,
            "error_occurred": self.error_occurred,
            "error_message": self.error_message,
        }
        print(f"\n 📊 Pipeline Complete: {total_duration:.2f}s total")
        return self.metrics

    def save_metrics(self):
        """Persist the run summary and per-stage documents to MongoDB.

        Best-effort: every failure is reported on stdout and turned into a
        ``False`` return value instead of an exception.

        Returns:
            bool: True when everything was written, False when ``MONGO_URI``
            is unset or any step failed.
        """
        client = None
        try:
            mongo_uri = os.getenv("MONGO_URI")
            if not mongo_uri:
                print(" ⚠️ MONGO_URI not set - skipping metrics save")
                return False

            if self.metrics is None:
                # end() was never called; finalize now so there is a
                # summary document to store.
                self.end()

            client = MongoClient(mongo_uri, serverSelectionTimeoutMS=5000)
            db = client["learnToGo"]

            # Collections based on mode
            if self.mode == "technical":
                metrics_col = db["pipelinemetrics"]
                stages_col = db["stageoutputs"]
            else:
                metrics_col = db["operational_pipeline_metrics"]
                stages_col = db["operational_stage_outputs"]

            # insert_one() adds an "_id" key to the dict it receives; pass a
            # shallow copy so self.metrics stays clean and a second save
            # does not collide on that _id.
            metrics_col.insert_one(dict(self.metrics))

            stage_docs = [
                {
                    "run_id": self.run_id,
                    "topic": self.topic,
                    "mode": self.mode,
                    "stage_name": stage_name,
                    "stage_data": stage_data,
                }
                for stage_name, stage_data in self.stages.items()
            ]
            # insert_many() rejects an empty list, so only call it when
            # at least one stage was recorded.
            if stage_docs:
                stages_col.insert_many(stage_docs)

            print(" ✓ Metrics saved to MongoDB")
            return True

        except Exception as e:
            print(f" ⚠️ Could not save metrics: {e}")
            return False

        finally:
            # Release the connection pool created for this save.
            if client is not None:
                client.close()
124
+
125
+ # ============================================================
126
+ # MONGODB CONNECTION & COLLECTIONS
127
+ # ============================================================
128
+
129
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=2, max=10),
)
def get_mongo_client():
    """Build a MongoClient from the ``MONGO_URI`` environment variable.

    The call is wrapped in a tenacity retry (up to 3 attempts, exponential
    wait between 2 and 10 seconds).

    Raises:
        ValueError: inside each attempt when ``MONGO_URI`` is unset or empty.
    """
    uri = os.getenv("MONGO_URI")
    if uri:
        # 5 s server-selection timeout
        return MongoClient(uri, serverSelectionTimeoutMS=5000)
    raise ValueError("MONGO_URI not set in .env")
139
+
140
def get_collections():
    """Return the keyword collections and their database.

    Opens the ``learnToGo`` database through ``get_mongo_client`` and makes
    sure both keyword collections have an index on ``aliases``.

    Returns:
        tuple: ``(technical_collection, operational_collection, db)``.
    """
    db = get_mongo_client()["learnToGo"]

    keyword_collections = (db["Keywords"], db["OperationalKeywords"])

    # Index the field that cache lookups filter on.
    for keyword_collection in keyword_collections:
        keyword_collection.create_index("aliases")

    technical_collection, operational_collection = keyword_collections
    return technical_collection, operational_collection, db
153
+
154
+ # ============================================================
155
+ # URL CACHING (Pickle-based - FIXED with proper dict structure)
156
+ # ============================================================
157
+
158
URL_CACHE_FILE = "/tmp/url_validation_cache.pkl"


def load_url_cache():
    """Load the URL validation cache from ``URL_CACHE_FILE``.

    Best-effort: a missing file yields an empty cache silently; an
    unreadable or corrupt file, or a pickle that does not contain a dict,
    is reported on stdout and also yields an empty cache.

    Returns:
        dict: The cached entries, or ``{}`` when nothing usable was found.
    """
    try:
        # SECURITY: pickle.load() can execute arbitrary code from the file
        # it reads, and this file lives in a shared /tmp directory. Only
        # acceptable while nothing untrusted can write to that path;
        # consider a JSON cache instead.
        with open(URL_CACHE_FILE, 'rb') as f:
            cache = pickle.load(f)
    except FileNotFoundError:
        # No cache yet: same silent empty result as before.
        return {}
    except Exception as e:
        print(f"⚠️ Could not load URL cache: {e}")
        return {}

    # Callers index the cache by URL hash, so anything but a dict is
    # treated as a corrupt cache rather than handed back.
    if not isinstance(cache, dict):
        print(f"⚠️ Could not load URL cache: expected dict, got {type(cache).__name__}")
        return {}

    print(f"✓ Loaded URL cache with {len(cache)} entries")
    return cache
171
+
172
def save_url_cache(cache):
    """Write the URL validation cache to ``URL_CACHE_FILE``.

    The pickle is written to a temporary file in the same directory and
    then moved over the real file with ``os.replace``, so an interrupted
    or overlapping save cannot leave a truncated cache behind.
    Note: the temporary file is created by ``tempfile.mkstemp``, so the
    resulting cache file is readable and writable by its owner only.

    Args:
        cache: Mapping of URL hash to validation entry.

    Returns:
        bool: True on success, False if anything failed (the failure is
        reported on stdout, never raised).
    """
    import tempfile  # local import: only this function needs it

    tmp_path = None
    try:
        fd, tmp_path = tempfile.mkstemp(
            dir=os.path.dirname(URL_CACHE_FILE) or ".",
            suffix=".tmp",
        )
        with os.fdopen(fd, 'wb') as f:
            pickle.dump(cache, f)
        os.replace(tmp_path, URL_CACHE_FILE)
        tmp_path = None  # moved into place; nothing left to clean up
        print(f"✓ Saved URL cache with {len(cache)} entries")
        return True
    except Exception as e:
        print(f"⚠️ Could not save URL cache: {e}")
        return False
    finally:
        if tmp_path is not None:
            # Best-effort removal of a temp file from a failed save.
            try:
                os.remove(tmp_path)
            except OSError:
                pass
182
+
183
def get_url_hash(url):
    """Return the hexadecimal MD5 digest of ``url``, used as its cache key."""
    digest = hashlib.md5()
    digest.update(url.encode())
    return digest.hexdigest()
186
+
187
@retry(
    stop=stop_after_attempt(2),
    wait=wait_exponential(multiplier=1, min=2, max=5)
)
def validate_url_cached(url, timeout=5):
    """Check whether ``url`` is reachable, consulting the pickle cache first.

    A cached verdict is returned without any network traffic. Otherwise
    the URL is probed with HEAD, falling back to GET when HEAD raises or
    the server answers 405/501 (HEAD not supported), and the verdict is
    written back to the cache.

    Args:
        url: URL to check.
        timeout: Per-request timeout in seconds passed to httpx.

    Returns:
        bool: True if the URL is considered valid.
    """
    url_hash = get_url_hash(url)

    # Load cache
    url_cache = load_url_cache()

    # Check cache
    cached = url_cache.get(url_hash)
    if isinstance(cached, dict) and 'valid' in cached:
        print(f" 💾 URL cache hit: {url[:50]}...")
        return cached['valid']
    if isinstance(cached, bool):
        # Presumably an entry from an older cache layout that stored the
        # bare verdict - TODO confirm; accepted instead of crashing.
        print(f" 💾 URL cache hit: {url[:50]}...")
        return cached
    # Any other entry shape is treated as missing and re-validated below.

    # Validate URL
    is_valid = False
    try:
        response = httpx.head(url, timeout=timeout, follow_redirects=True)
        is_valid = response.status_code in (200, 301, 302, 303, 307, 308)
        # Some servers reject HEAD outright; retry those with GET.
        needs_get = response.status_code in (405, 501)
    except Exception:
        needs_get = True

    if needs_get:
        try:
            response = httpx.get(url, timeout=timeout, follow_redirects=True)
            is_valid = response.status_code == 200
        except Exception:
            is_valid = False

    # Save to cache as DICT with valid, checked_at, url
    url_cache[url_hash] = {
        'valid': is_valid,
        'checked_at': datetime.now(timezone.utc).isoformat(),
        'url': url
    }
    save_url_cache(url_cache)

    print(f" ✓ URL validated: {url[:50]}... = {is_valid}")
    return is_valid
224
+
225
+ # ============================================================
226
+ # CACHE OPERATIONS
227
+ # ============================================================
228
+
229
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=2, max=10)
)
def check_cache(topic, collection):
    """
    Look the topic up in the MongoDB cache by its normalized form.

    The topic is lower-cased and stripped, then matched against the
    ``aliases`` field. A lookup that raises is logged and re-raised so the
    retry decorator can try again. A stored document without ``content``
    is treated as a miss instead of failing (and being retried) on a
    missing key.

    Args:
        topic: User-supplied topic string.
        collection: Collection object providing ``find_one``.

    Returns:
        tuple: ``(content, True)`` on a hit, ``(None, False)`` on a miss.
    """
    try:
        normalized = topic.lower().strip()
        print(f"🔍 Checking cache for: {normalized}")

        cached = collection.find_one({"aliases": normalized})

    except Exception as e:
        print(f"❌ Cache lookup error: {e}")
        raise

    if cached and cached.get('content') is not None:
        # The topic is only used for the log line; fall back to the
        # search key when the document has none.
        print(f"✅ CACHE HIT! Found topic: {cached.get('topic', normalized)}")
        return cached['content'], True

    print("❌ CACHE MISS - Will run full pipeline")
    return None, False
254
+
255
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=2, max=10)
)
def save_to_cache(topic, content, collection):
    """
    Save generated slides to MongoDB.

    The stored ``aliases`` always include the normalized topic
    (lower-cased, stripped): that is the value ``check_cache`` queries by,
    so a document whose own alias list omits it could never be found again.
    Failures are logged and re-raised so the retry decorator can try again.

    Args:
        topic: Topic the content was generated for.
        content: Generated content dict; its ``topic`` and ``aliases``
            keys are used when present.
        collection: Collection object providing ``insert_one``.

    Returns:
        The ``inserted_id`` of the new document.
    """
    try:
        normalized = topic.lower().strip()

        raw_aliases = content.get('aliases') or []
        # A single string would otherwise be split into characters.
        aliases = [raw_aliases] if isinstance(raw_aliases, str) else list(raw_aliases)
        if normalized not in aliases:
            aliases.append(normalized)

        document = {
            "topic": content.get('topic', topic),
            "aliases": aliases,
            "createdAt": datetime.now(timezone.utc),
            "content": content
        }

        result = collection.insert_one(document)
        print(f"✅ Saved to MongoDB - Document ID: {result.inserted_id}")
        return result.inserted_id

    except Exception as e:
        print(f"❌ Cache save error: {e}")
        raise
281
+
282
+ # ============================================================
283
+ # URL VALIDATION & SELECTION
284
+ # ============================================================
285
+
286
def validate_and_select_urls(corrected_json, max_urls=5):
    """
    Validate every URL in ``corrected_json["urls"]`` and keep the valid ones.

    Each URL is checked through ``validate_url_cached``. Entries that are
    not dicts or carry no ``url`` are skipped, and a validator that raises
    counts as "invalid" rather than aborting the whole pass.

    Args:
        corrected_json: Dict whose ``urls`` key holds a list of
            ``{"url": ..., "title": ...}`` objects. Modified in place.
        max_urls: Maximum number of valid URLs to keep (first ones in
            input order).

    Returns:
        tuple: ``(corrected_json, validation_results)`` where
        ``validation_results`` lists url/title/valid for every URL checked.
    """
    # "urls" may be absent or explicitly None.
    urls = corrected_json.get("urls") or []
    print(f"Validating {len(urls)} URLs with caching...")

    valid_urls = []
    validation_results = []

    for url_obj in urls:
        if not isinstance(url_obj, dict):
            continue
        url = url_obj.get("url")
        if not url:
            continue

        try:
            is_valid = validate_url_cached(url)
        except Exception as e:
            # The validator is retry-wrapped and can still raise once its
            # attempts are used up; one bad URL must not sink the rest.
            print(f"⚠️ URL validation failed: {str(url)[:50]}... ({e})")
            is_valid = False

        validation_results.append({
            "url": url,
            "title": url_obj.get("title"),
            "valid": is_valid
        })

        if is_valid:
            valid_urls.append(url_obj)

    # Keep only the first max_urls valid URLs
    valid_urls = valid_urls[:max_urls]

    print(f"✓ Kept {len(valid_urls)} valid URLs")

    corrected_json["urls"] = valid_urls
    return corrected_json, validation_results
318
+
319
+ # ============================================================
320
+ # INPUT VALIDATION (50 char limit for both technical and operational)
321
+ # ============================================================
322
+
323
def validate_and_sanitize_topic(topic, max_length=50):
    """
    Validate and sanitize user input before the pipeline runs.

    Deliberately not retry-wrapped: validation is deterministic, so
    retrying only delays the failure and replaces the ``ValueError`` (and
    its user-facing message) with a retry error.

    Args:
        topic: Raw topic text entered by the user.
        max_length: Maximum allowed length after stripping (default 50,
            the limit used for both technical and operational topics).

    Returns:
        str: The topic with surrounding whitespace removed.

    Raises:
        ValueError: If the topic is not a string, is empty or whitespace
            only, or is longer than ``max_length``.
    """
    if not isinstance(topic, str) or not topic.strip():
        raise ValueError("❌ Topic cannot be empty.")

    topic = topic.strip()

    if len(topic) > max_length:
        raise ValueError(f"❌ Topic cannot exceed {max_length} characters.")

    print(f"✅ Input validated: '{topic}'")
    return topic
345
+
346
+ print("βœ“ All utility functions ready with metrics, URL caching, and retry logic")