Spaces:

daemon03
/

ContentGeneration

Sleeping

App Files Files Community

daemon03 committed on Nov 14, 2025

Commit

edd00ca

1 Parent(s): 07d7d9a

content_generator v1.0

Browse files

Files changed (4) hide show

src/image_generation_functions.py +453 -0
src/pipelines_functions.py +482 -0
src/streamlit_app.py +723 -38
src/utils_functions.py +346 -0

src/image_generation_functions.py ADDED Viewed

	@@ -0,0 +1,453 @@

+import os
+import json
+import time
+import re
+import mimetypes
+from io import BytesIO
+from PIL import Image as PILImage
+import google.generativeai as genai
+from google.cloud import storage
+from google import genai as google_genai
+from google.genai import types
+from tenacity import retry, stop_after_attempt, wait_exponential
+from dotenv import load_dotenv
+load_dotenv()
+# ============================================================
+# IMAGE GENERATION CONFIGURATION (FIXED - Two separate keys)
+# ============================================================
+# For text correction (Gemini 2.5 Flash)
+GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
+# For image generation (Gemini 2.5 Flash Image - NEW API)
+IMAGE_API_KEY = os.getenv("IMAGE_API_KEY")
+GCP_CREDENTIALS_JSON = os.getenv("GCP_CREDENTIALS_JSON")
+GCP_PROJECT_ID = os.getenv("GCP_PROJECT_ID")
+GCP_BUCKET_NAME = os.getenv("GCP_BUCKET_NAME")
+# Initialize Gemini for correction (old API - works for text)
+if GEMINI_API_KEY:
+    genai.configure(api_key=GEMINI_API_KEY)
+else:
+    print("⚠️ GEMINI_API_KEY not set - text correction will fail")
+# Initialize GCP Storage
+try:
+    if GCP_CREDENTIALS_JSON and GCP_PROJECT_ID and GCP_BUCKET_NAME:
+        import json as json_lib
+        from google.oauth2 import service_account
+        credentials_dict = json_lib.loads(GCP_CREDENTIALS_JSON)
+        credentials = service_account.Credentials.from_service_account_info(credentials_dict)
+        gcp_client = storage.Client(credentials=credentials, project=GCP_PROJECT_ID)
+        gcp_bucket = gcp_client.bucket(GCP_BUCKET_NAME)
+        GCP_AVAILABLE = True
+        print("✓ GCP Storage configured for image uploads")
+    else:
+        GCP_AVAILABLE = False
+        print("⚠️ GCP credentials not fully configured - image upload disabled")
+except Exception as e:
+    GCP_AVAILABLE = False
+    print(f"⚠️ GCP configuration error: {e}")
+# ============================================================
+# AUTOCROP FUNCTION (Proper implementation)
+# ============================================================
+def autocrop_tight_vertical(image_path, output_path=None):
+    """
+    Remove excess white space from top and bottom of image while keeping left/right margins.
+    FIXED: Proper PIL implementation with margin preservation.
+    """
+    try:
+        img = PILImage.open(image_path)
+        img_array = img.convert('RGB')
+        # Get image dimensions
+        width, height = img_array.size
+        # Define white threshold (pure white or very close)
+        white_threshold = 250
+        # Find first non-white row from top
+        top_crop = 0
+        for y in range(height):
+            row_pixels = []
+            for x in range(width):
+                r, g, b = img_array.getpixel((x, y))
+                row_pixels.append((r + g + b) / 3)
+            avg_brightness = sum(row_pixels) / len(row_pixels)
+            if avg_brightness < white_threshold:
+                top_crop = y
+                break
+        # Find first non-white row from bottom
+        bottom_crop = height
+        for y in range(height - 1, -1, -1):
+            row_pixels = []
+            for x in range(width):
+                r, g, b = img_array.getpixel((x, y))
+                row_pixels.append((r + g + b) / 3)
+            avg_brightness = sum(row_pixels) / len(row_pixels)
+            if avg_brightness < white_threshold:
+                bottom_crop = y + 1
+                break
+        # Crop image with small margin
+        margin = 10
+        top_crop = max(0, top_crop - margin)
+        bottom_crop = min(height, bottom_crop + margin)
+        # Make sure we have at least some height
+        if bottom_crop <= top_crop:
+            print("   ⚠️ Autocrop: No content found, returning original")
+            return img_array
+        cropped_img = img_array.crop((0, top_crop, width, bottom_crop))
+        if output_path:
+            cropped_img.save(output_path)
+        print(f"   ✓ Autocropped from {height}px to {cropped_img.size[1]}px")
+        return cropped_img
+    except Exception as e:
+        print(f"⚠️ Autocrop failed: {e}")
+        return None
+# ============================================================
+# TECHNICAL IMAGE GENERATION (FIXED - NEW API with proper error checking)
+# ============================================================
+@retry(
+    stop=stop_after_attempt(2),
+    wait=wait_exponential(multiplier=1, min=3, max=10)
+)
+def generate_technical_image(slide_title, slide_content, image_description):
+    """
+    Generate a technical diagram using NEW Gemini 2.5 Flash Image API with streaming.
+    FIXED: Using google.genai API with generate_content_stream and proper null checking
+    Returns: (success: bool, image_data: bytes or error_message: str)
+    """
+    try:
+        if not IMAGE_API_KEY:
+            return False, "IMAGE_API_KEY not configured"
+        # Initialize client with IMAGE API KEY
+        client = google_genai.Client(api_key=IMAGE_API_KEY)
+        # Professional technical prompt
+        prompt_text = f"""
+Generate a professional, clean, and visually compelling image for a technical presentation.
+**Context:**
+This image will be used for a slide titled "{slide_title}" with the following content:
+"{slide_content}"
+The image should visually represent the concept described below to enhance understanding:
+{image_description}
+**Critical Requirements:**
+- NO explanatory text, paragraphs, or detailed written descriptions overlaid on the image.
+- Component labels ARE allowed where necessary for clarity (e.g., "API Server", "Worker Node", "Control Plane").
+- Include a brief, centered caption below the image (max 5-7 words, research paper style) summarizing the visual concept.
+- Use full canvas space efficiently — minimize blank margins, maximize information density.
+- Clean, professional, modern aesthetic.
+- Use color strategically to convey meaning and hierarchy.
+- Suitable for a formal technical presentation slide.
+- Prefer abstract/conceptual visualizations over literal images.
+- Ensure all text in the diagram is spell-checked and professionally styled.
+**Style Guidelines:**
+- Pure white background (#FFFFFF) for professional appearance.
+- Professional color palette optimized for white backgrounds:
+  * Primary: Deep navy blue (#1a365d), slate gray (#475569)
+  * Accent: Teal (#0d9488), ocean blue (#0284c7)
+- Minimalist and elegant design with balanced spacing.
+- 4:3 aspect ratio (landscape orientation).
+"""
+        print(f"   🎨 Generating technical image for: {slide_title}...")
+        # Create content with proper structure
+        contents = [types.Content(
+            role="user",
+            parts=[types.Part.from_text(text=prompt_text)]
+        )]
+        # Configure generation with 4:3 aspect ratio
+        generate_content_config = types.GenerateContentConfig(
+            response_modalities=["IMAGE", "TEXT"],
+            image_config=types.ImageConfig(aspect_ratio="4:3", image_size="1K"),
+        )
+        # Stream response and extract image
+        for chunk in client.models.generate_content_stream(
+            model="gemini-2.5-flash-image",
+            contents=contents,
+            config=generate_content_config
+        ):
+            # ===== FIXED: 5-level null checking as per notebooks =====
+            if not chunk.candidates:
+                continue
+            candidate = chunk.candidates[0]
+            if not hasattr(candidate, 'content') or candidate.content is None:
+                continue
+            if not hasattr(candidate.content, 'parts') or not candidate.content.parts:
+                continue
+            part = candidate.content.parts[0]
+            if not hasattr(part, 'inline_data') or part.inline_data is None:
+                continue
+            inline_data = part.inline_data
+            if inline_data.data:
+                image_data = inline_data.data
+                print(f"   ✅ Image generated successfully")
+                return True, image_data
+        return False, "No image generated from API"
+    except Exception as e:
+        print(f"   ❌ Image generation error: {str(e)}")
+        return False, f"Error: {str(e)}"
+# ============================================================
+# OPERATIONAL IMAGE GENERATION (FIXED - NEW API with proper error checking)
+# ============================================================
+@retry(
+    stop=stop_after_attempt(2),
+    wait=wait_exponential(multiplier=1, min=3, max=10)
+)
+def generate_operational_image(slide_title, slide_content, image_description):
+    """
+    Generate a business/operational diagram using NEW Gemini 2.5 Flash Image API with streaming.
+    FIXED: Using google.genai API with generate_content_stream and proper null checking
+    Returns: (success: bool, image_data: bytes or error_message: str)
+    """
+    try:
+        if not IMAGE_API_KEY:
+            return False, "IMAGE_API_KEY not configured"
+        # Initialize client with IMAGE API KEY
+        client = google_genai.Client(api_key=IMAGE_API_KEY)
+        # Business-focused prompt
+        prompt_text = f"""
+Generate a professional, clean business/operational diagram for a compliance or regulatory presentation.
+**Context:**
+This image will be used for a slide titled "{slide_title}" with the following business content:
+"{slide_content}"
+The image should visually represent the operational/business/compliance concept described below:
+{image_description}
+**Critical Requirements:**
+- NO explanatory text, paragraphs, or detailed written descriptions overlaid on the image.
+- Component labels and process flow indicators ARE allowed (e.g., "Compliance Check", "Approval", "Risk Mitigation").
+- Include a brief, centered caption below the image (max 5-7 words, business report style).
+- Use full canvas space efficiently — minimize blank margins.
+- Clean, professional, corporate aesthetic.
+- Use color strategically: consider business standard colors (blue for trust, green for process).
+- Suitable for a formal business presentation or compliance report.
+- Prefer process flows, matrices, or business diagrams.
+**Style Guidelines:**
+- Pure white background (#FFFFFF).
+- Professional business color palette:
+  * Primary: Corporate blue (#003366), professional gray (#4a5568)
+  * Accent: Business green (#2d5016), alert red (#c53030)
+- Clean, minimal design with professional spacing.
+- 4:3 aspect ratio (landscape for business presentations).
+"""
+        print(f"   📊 Generating operational image for: {slide_title}...")
+        # Create content with proper structure
+        contents = [types.Content(
+            role="user",
+            parts=[types.Part.from_text(text=prompt_text)]
+        )]
+        # Configure generation with 4:3 aspect ratio
+        generate_content_config = types.GenerateContentConfig(
+            response_modalities=["IMAGE", "TEXT"],
+            image_config=types.ImageConfig(aspect_ratio="4:3", image_size="1K"),
+        )
+        # Stream response and extract image
+        for chunk in client.models.generate_content_stream(
+            model="gemini-2.5-flash-image",
+            contents=contents,
+            config=generate_content_config
+        ):
+            # ===== FIXED: 5-level null checking as per notebooks =====
+            if not chunk.candidates:
+                continue
+            candidate = chunk.candidates[0]
+            if not hasattr(candidate, 'content') or candidate.content is None:
+                continue
+            if not hasattr(candidate.content, 'parts') or not candidate.content.parts:
+                continue
+            part = candidate.content.parts[0]
+            if not hasattr(part, 'inline_data') or part.inline_data is None:
+                continue
+            inline_data = part.inline_data
+            if inline_data.data:
+                image_data = inline_data.data
+                print(f"   ✅ Image generated successfully")
+                return True, image_data
+        return False, "No image generated from API"
+    except Exception as e:
+        print(f"   ❌ Image generation error: {str(e)}")
+        return False, f"Error: {str(e)}"
+# ============================================================
+# PIPELINE IMAGE REPLACEMENT (FIXED - Complete integration)
+# ============================================================
+def process_images_for_pipeline(slide_json, mode="technical"):
+    """
+    FIXED: Complete image processing pipeline with proper sequencing.
+    Process all slides with image descriptions:
+    1. Generate image with Gemini 2.5 Flash Image
+    2. Save temporarily
+    3. Autocrop white space
+    4. Upload to GCP
+    5. Replace image_description with GCP URL
+    Args:
+        slide_json: Slides JSON with image_description fields
+        mode: "technical" or "operational"
+    Returns:
+        Updated slide_json with image_description as GCP URLs
+    """
+    print(f"\n{'='*70}")
+    print(f"🎨 STAGE 4: Processing Images ({mode.upper()} Mode)")
+    print('='*70)
+    # Create temp folder for intermediate images
+    temp_folder = "/tmp/gen_images"
+    os.makedirs(temp_folder, exist_ok=True)
+    image_generator = generate_technical_image if mode == "technical" else generate_operational_image
+    for idx, slide in enumerate(slide_json.get('content', []), 1):
+        # Skip slides without image descriptions or with null
+        if not slide.get('image_description') or slide['image_description'] == "null":
+            print(f"   ⊘ Slide {idx}: No image description")
+            continue
+        try:
+            slide_title = slide.get('slide_title', 'Slide')
+            slide_content = slide.get('slide_content', '')
+            image_desc = slide.get('image_description', '')
+            print(f"\n   📍 Processing Slide {idx}: {slide_title}")
+            # STEP 1: Generate image with NEW API
+            print(f"   1️⃣ Generating image...")
+            success, result = image_generator(slide_title, slide_content, image_desc)
+            if not success:
+                print(f"   ❌ Generation failed: {result}")
+                slide['image_description'] = f"Failed: {result}"
+                continue
+            image_data = result
+            # STEP 2: Save image temporarily
+            print(f"   2️⃣ Saving to temporary file...")
+            raw_topic = slide_json.get('topic', 'topic')
+            topic_slug = re.sub(r'[^a-zA-Z0-9_-]+', '_', raw_topic.strip().lower()).strip('_')
+            topic_slug = topic_slug[:15]
+            ts = int(time.time())
+            temp_file_name = f"slide_{idx}_{topic_slug}_{mode}_{ts}.png"
+            temp_file_path = os.path.join(temp_folder, temp_file_name)
+            with open(temp_file_path, 'wb') as f:
+                f.write(image_data)
+            print(f"   ✓ Saved: {temp_file_name}")
+            # STEP 3: Autocrop white space
+            print(f"   3️⃣ Autocropping white space...")
+            try:
+                autocrop_tight_vertical(temp_file_path, temp_file_path)
+                print(f"   ✓ Autocrop successful")
+            except Exception as e:
+                print(f"   ⚠️ Autocrop skipped: {e}")
+            # STEP 4: Upload to GCP
+            print(f"   4️⃣ Uploading to GCP Storage...")
+            image_url = None
+            if GCP_AVAILABLE:
+                try:
+                    with open(temp_file_path, 'rb') as f:
+                        image_bytes = f.read()
+                    gcp_blob_path = f"images/{mode}/{temp_file_name}"
+                    blob = gcp_bucket.blob(gcp_blob_path)
+                    blob.upload_from_string(image_bytes, content_type="image/png")
+                    image_url = blob.public_url
+                    print(f"   ✅ Uploaded to GCP: {image_url}")
+                except Exception as e:
+                    error_str = str(e).lower()
+                    if 'billing' in error_str or 'project_invalid' in error_str:
+                        print(f"   ⚠️ GCP billing not enabled")
+                        image_url = None
+                    else:
+                        print(f"   ❌ GCP upload error: {str(e)}")
+                        image_url = None
+            else:
+                print(f"   ⚠️ GCP not configured - cannot upload")
+            # STEP 5: Update slide with URL or error message
+            if image_url:
+                slide['image_description'] = image_url
+                print(f"   ✅ Slide {idx} complete: Image available at GCP URL")
+            else:
+                slide['image_description'] = "Image generation succeeded but upload unavailable"
+                print(f"   ⚠️ Slide {idx}: Image not uploaded to GCP")
+            # Cleanup temp file
+            try:
+                os.remove(temp_file_path)
+            except:
+                pass
+        except Exception as e:
+            print(f"   ❌ Error processing slide {idx}: {str(e)}")
+            slide['image_description'] = f"Error: {str(e)}"
+    print(f"\n✅ Image processing complete")
+    return slide_json
+print("✓ Image generation functions ready (NEW Gemini 2.5 Flash Image API + proper error checking)")  # module-level statement: runs once when this module is imported

src/pipelines_functions.py ADDED Viewed

	@@ -0,0 +1,482 @@

+import os
+import json
+import re
+from openai import OpenAI
+import google.generativeai as genai
+from tenacity import retry, stop_after_attempt, wait_exponential
+from dotenv import load_dotenv
+load_dotenv()
+# ============================================================
+# API INITIALIZATION
+# ============================================================
+PERPLEXITY_API_KEY = os.getenv("PERPLEXITY_API_KEY")
+GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
+if not PERPLEXITY_API_KEY:  # fail fast: importing this module raises when a key is missing
+    raise ValueError("❌ PERPLEXITY_API_KEY not set in .env")
+if not GEMINI_API_KEY:
+    raise ValueError("❌ GEMINI_API_KEY not set in .env")
+perplexity_client = OpenAI(  # OpenAI SDK client pointed at the Perplexity endpoint
+    api_key=PERPLEXITY_API_KEY,
+    base_url="https://api.perplexity.ai",
+)
+genai.configure(api_key=GEMINI_API_KEY)  # module-wide key for the genai.GenerativeModel calls in this file
+# ============================================================
+# TECHNICAL PIPELINE
+# ============================================================
+@retry(
+    stop=stop_after_attempt(3),
+    wait=wait_exponential(multiplier=1, min=4, max=10)
+)
+def generate_technical_content(topic):
+    """
+    Stage 1: Generate technical slides using Perplexity.
+    EXACT PROMPT from technical_gcp_image_pipeline-1.ipynb
+    """
+    print(f"\n🔄 Generating technical content for: {topic}")
+    try:
+        system_prompt = f"""You are a domain expert in technology and IT infrastructure with deep knowledge across all technology domains.
+Task:
+For the topic "{topic}", generate 9 to 10 slides as JSON.
+Instructions:
+- Write universally applicable content that any technology professional can understand and use.
+- Each slide should have an engaging and concise "slide_title" (maximum 6 words).
+- "slide_content" must be 3-4 sentences (strictly 40-60 words) with technical depth and practical relevance.
+- For the 3 most critical slides ONLY, add "image_description" (strictly 30-40 words) describing specific technical diagrams.
+- First slide: Overview explaining why this technology matters universally.
+- Last slide: "Further Learning & Documentation" with placeholder for 5 curated URLs.
+- Use clear, accessible language. Avoid industry-specific jargon.
+- For all other slides, set image_description to null.
+Additional Requirement — ALIASES FIELD:
+- Generate 6-7 lowercase alternative names/synonyms for "{topic}".
+- First alias MUST be the normalized lowercase form of the topic.
+- Include abbreviations and common variations.
+Output ONLY valid JSON (no code blocks, no markdown):
+{{
+ "topic": "{topic}",
+ "aliases": ["primary lowercase form", "alias2", "alias3", ...],
+ "content": [
+   {{
+     "slide_title": "...",
+     "slide_content": "...",
+     "image_description": "..." or null
+   }}
+ ],
+ "urls": [
+   {{"title": "...", "url": "https://..."}},
+   ...
+ ]
+}}
+"""
+        response = perplexity_client.chat.completions.create(
+            model="sonar-pro",
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": f"Generate a universally applicable technical presentation on {topic}"}
+            ],
+            temperature=0.5,
+            max_tokens=4000,
+            timeout=60,
+        )
+        content = response.choices[0].message.content
+        try:
+            result = json.loads(content)
+            if 'aliases' not in result:
+                result['aliases'] = [topic.lower().strip()]
+            print(f"✅ Generation successful - {len(result.get('content', []))} slides")
+            return result
+        except json.JSONDecodeError:
+            json_match = re.search(r'\{.*\}', content, re.DOTALL)
+            if json_match:
+                result = json.loads(json_match.group())
+                if 'aliases' not in result:
+                    result['aliases'] = [topic.lower().strip()]
+                return result
+            raise ValueError("Could not parse JSON from response")
+    except Exception as e:
+        print(f"❌ Generation failed: {type(e).__name__}: {str(e)}")
+        raise
+@retry(
+    stop=stop_after_attempt(2),
+    wait=wait_exponential(multiplier=1, min=3, max=10)
+)
+def correct_technical_content(generated_json):
+    """
+    Stage 2: Correct with Gemini 2.5 Flash (TEXT ONLY).
+    EXACT PROMPT from technical_gcp_image_pipeline-1.ipynb
+    """
+    print(f"\n🔄 Correcting technical content with Gemini 2.5 Flash")
+    try:
+        gemini_model = genai.GenerativeModel("gemini-2.5-flash")
+        correction_prompt = f"""You are an expert technical editor for universal technology training materials.
+Review the following slide presentation and improve it:
+{json.dumps(generated_json, indent=2)}
+Your tasks:
+1. Ensure slide titles are clear, concise (max 6 words) and engaging.
+2. Verify that slide_content is universally applicable.
+3. Check that content flows logically, is technically accurate.
+4. For image_descriptions: Make them specific, actionable, and suitable for technical diagram generation.
+5. Review and enhance URLs - add 2-3 additional high-quality URLs if missing.
+6. Keep all word counts natural and readable.
+CRITICAL INSTRUCTION:
+- The field "aliases" must remain EXACTLY as provided (do not change it).
+- Keep "image_description" fields exactly as they are.
+- For slides without image_description, set to null.
+- Retain the most educationally valuable 3 slides for images — set the rest to null.
+OUTPUT REQUIREMENT:
+Return ONLY the corrected JSON in the exact same schema as the input.
+Do not include code fences, markdown, or extra commentary.
+"""
+        response = gemini_model.generate_content(correction_prompt)
+        corrected_text = response.text.strip()
+        corrected_text = re.sub(r'^\s*```(?:json)?\s*\n?', '', corrected_text, count=1)
+        corrected_text = re.sub(r'\s*```\s*$', '', corrected_text, count=1)
+        try:
+            result = json.loads(corrected_text)
+            if 'aliases' not in result:
+                result['aliases'] = generated_json.get('aliases', [])
+            print(f"✅ Correction successful")
+            return result
+        except json.JSONDecodeError:
+            json_match = re.search(r'\{.*\}', corrected_text, re.DOTALL)
+            if json_match:
+                result = json.loads(json_match.group())
+                if 'aliases' not in result:
+                    result['aliases'] = generated_json.get('aliases', [])
+                return result
+            print(f"⚠️ Correction parsing failed - returning original")
+            return generated_json
+    except Exception as e:
+        print(f"❌ Correction failed: {type(e).__name__}: {str(e)}")
+        raise
+@retry(
+    stop=stop_after_attempt(2),
+    wait=wait_exponential(multiplier=1, min=3, max=10)
+)
+def refine_technical_content(validated_json):
+    """
+    Stage 3: Final refinement with Perplexity.
+    EXACT PROMPT from technical_gcp_image_pipeline-1.ipynb
+    """
+    print(f"\n🔄 Refining technical content")
+    try:
+        refine_prompt = f"""You are a senior technical content specialist for universal technology training.
+This slide presentation has been validated. Perform the final refinement:
+{json.dumps(validated_json, indent=2)}
+Your tasks:
+1. Ensure image_descriptions are detailed, specific, and suitable for technical diagram generation.
+2. Verify that slide content is universally applicable and consistent.
+3. Confirm that all technical terms are accurate.
+4. Review and refine the URLs:
+   - Select up to 5 of the best URLs only.
+   - Order them by: Authority, Relevance, Learning value, Diversity.
+   - Ensure all chosen URLs are authoritative and current.
+5. Keep all slide content exactly the same length/style.
+6. Maintain perfect JSON structure.
+CRITICAL INSTRUCTION:
+- The field "aliases" must remain EXACTLY as provided.
+- Keep "image_description" fields for image generation.
+OUTPUT REQUIREMENT:
+Return ONLY the refined JSON in the exact same schema as the input.
+"""
+        response = perplexity_client.chat.completions.create(
+            model="sonar-pro",
+            messages=[{"role": "user", "content": refine_prompt}],
+            temperature=0.3,
+            max_tokens=4000,
+            timeout=60,
+        )
+        refined_text = response.choices[0].message.content.strip()
+        refined_text = re.sub(r'^\s*```(?:json)?\s*\n?', '', refined_text, count=1)
+        refined_text = re.sub(r'\s*```\s*$', '', refined_text, count=1)
+        try:
+            result = json.loads(refined_text)
+            if 'aliases' not in result:
+                result['aliases'] = validated_json.get('aliases', [])
+            print(f"✅ Refinement successful")
+            return result
+        except json.JSONDecodeError:
+            json_match = re.search(r'\{.*\}', refined_text, re.DOTALL)
+            if json_match:
+                result = json.loads(json_match.group())
+                if 'aliases' not in result:
+                    result['aliases'] = validated_json.get('aliases', [])
+                return result
+            print(f"⚠️ Refinement failed - returning validated content")
+            return validated_json
+    except Exception as e:
+        print(f"❌ Refinement failed: {type(e).__name__}: {str(e)}")
+        raise
+# ============================================================
+# OPERATIONAL PIPELINE
+# ============================================================
+@retry(
+    stop=stop_after_attempt(3),
+    wait=wait_exponential(multiplier=1, min=4, max=10)
+)
+def generate_operational_content(topic):
+    """
+    Stage 1: Generate operational slides using Perplexity.
+    EXACT PROMPT from operational_gcp_image_pipeline-2.ipynb
+    """
+    print(f"\n🔄 Generating operational content for: {topic}")
+    try:
+        system_prompt = f"""You are a domain expert in business operations, compliance, regulatory frameworks, and enterprise management.
+Task:
+For the topic "{topic}", generate 9 to 10 slides as JSON.
+Instructions:
+- Target intermediate professionals (2+ years experience) seeking actionable, scenario-driven insights.
+- Each slide should have a unique and engaging "slide_title" (maximum 6 words).
+- "slide_content" must be 3-4 sentences (strictly 40-60 words), balancing regulatory requirements with operational business value.
+- Emphasize both regulatory drivers AND business impact: compliance obligations, operational efficiency, risk mitigation, and competitive advantage.
+- For the 3 most important slides ONLY, add "image_description" (strictly 30-40 words) describing meaningful business/operational diagrams.
+- First slide: Overview positioning the topic's regulatory importance and business operational impact.
+- Last slide: "Further Learning & Documentation" with specific next learning topics.
+- Use clear, accessible language without basic dictionary definitions.
+- Focus on practical application, regulatory compliance, and business outcomes.
+- For all other slides, set image_description to null.
+Additional Requirement — ALIASES FIELD:
+- Generate 4-5 lowercase alternative names/synonyms for "{topic}".
+- First alias MUST be the normalized lowercase form of the topic.
+- Include abbreviations and terms that refer to the same concept.
+Output ONLY valid JSON (no code blocks, no markdown):
+{{
+ "topic": "{topic}",
+ "aliases": ["primary lowercase form", "alias2", ...],
+ "content": [
+   {{
+     "slide_title": "...",
+     "slide_content": "...",
+     "image_description": "..." or null
+   }}
+ ],
+ "urls": [
+   {{"title": "...", "url": "https://..."}},
+   ...
+ ]
+}}
+"""
+        response = perplexity_client.chat.completions.create(
+            model="sonar-pro",
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": f"Generate an intermediate-level, practical business operations presentation on: {topic}"}
+            ],
+            temperature=0.5,
+            max_tokens=4000,
+            timeout=60,
+        )
+        content = response.choices[0].message.content
+        try:
+            result = json.loads(content)
+            if 'aliases' not in result:
+                result['aliases'] = [topic.lower().strip()]
+            print(f"✅ Generation successful - {len(result.get('content', []))} slides")
+            return result
+        except json.JSONDecodeError:
+            json_match = re.search(r'\{.*\}', content, re.DOTALL)
+            if json_match:
+                result = json.loads(json_match.group())
+                if 'aliases' not in result:
+                    result['aliases'] = [topic.lower().strip()]
+                return result
+            raise ValueError("Could not parse JSON from response")
+    except Exception as e:
+        print(f"❌ Generation failed: {type(e).__name__}: {str(e)}")
+        raise
+@retry(
+    stop=stop_after_attempt(2),
+    wait=wait_exponential(multiplier=1, min=3, max=10),
+    # Surface the real failure (API error, bad JSON, ...) after the last attempt
+    # instead of tenacity's generic RetryError wrapper.
+    reraise=True,
+)
+def correct_operational_content(generated_json):
+    """
+    Stage 2: Correct with Gemini 2.5 PRO (stronger model for operational).
+    EXACT PROMPT from operational_gcp_image_pipeline-2.ipynb
+    Args:
+        generated_json: dict produced by the generation stage; it is embedded
+            verbatim (as indented JSON) in the correction prompt.
+    Returns:
+        The corrected dict parsed from the model reply. If the reply omits
+        "aliases", the aliases of `generated_json` are copied over. If the reply
+        contains no JSON object at all, `generated_json` is returned unchanged.
+    Raises:
+        Any exception from the Gemini call or from JSON parsing is re-raised
+        once both retry attempts are used up.
+    """
+    print(f"\n🔄 Correcting operational content with Gemini 2.5 PRO")
+    try:
+        gemini_model = genai.GenerativeModel("gemini-2.5-pro")  # STRONGER MODEL FOR OPERATIONAL
+        correction_prompt = f"""You are an expert business operations and compliance editor.
+Review this business operations presentation and improve it:
+{json.dumps(generated_json, indent=2)}
+Your tasks:
+1. Ensure slide titles are clear, concise (max 6 words), and business-focused.
+2. Verify slide_content balances regulatory requirements WITH business operational value (40–60 words).
+3. Strengthen regulatory references: name specific acts, frameworks, or compliance concepts.
+4. For image_descriptions: Make them specific to business processes and regulatory workflows.
+5. Review and improve the URLs - add 2-3 additional high-quality official URLs.
+6. Maintain the intermediate professional tone.
+7. Ensure logical flow: regulatory → operational → actionable insights.
+CRITICAL INSTRUCTION:
+- The field "aliases" must remain EXACTLY as provided.
+- Keep "image_description" fields for image generation.
+- For slides without image_description, set to null.
+- Retain the most important 3 slides for images — set the rest to null.
+OUTPUT REQUIREMENT:
+Return ONLY the corrected JSON in the exact same schema as the input.
+"""
+        response = gemini_model.generate_content(correction_prompt)
+        corrected_text = response.text.strip()
+        # Drop a leading ```/```json fence and a trailing ``` fence, if present.
+        corrected_text = re.sub(r'^\s*```(?:json)?\s*\n?', '', corrected_text, count=1)
+        corrected_text = re.sub(r'\s*```\s*$', '', corrected_text, count=1)
+        try:
+            result = json.loads(corrected_text)
+            parsed_directly = True
+        except json.JSONDecodeError:
+            # The reply may wrap the JSON in prose: fall back to the outermost {...} span.
+            json_match = re.search(r'\{.*\}', corrected_text, re.DOTALL)
+            if not json_match:
+                print(f"⚠️ Correction parsing failed - returning original")
+                return generated_json
+            result = json.loads(json_match.group())
+            parsed_directly = False
+        # Restore the aliases if the model dropped the field.
+        if 'aliases' not in result:
+            result['aliases'] = generated_json.get('aliases', [])
+        if parsed_directly:
+            print(f"✅ Correction successful")
+        return result
+    except Exception as e:
+        print(f"❌ Correction failed: {type(e).__name__}: {str(e)}")
+        raise
+@retry(
+    stop=stop_after_attempt(2),
+    wait=wait_exponential(multiplier=1, min=3, max=10),
+    # Surface the real failure (API error, bad JSON, ...) after the last attempt
+    # instead of tenacity's generic RetryError wrapper.
+    reraise=True,
+)
+def refine_operational_content(validated_json):
+    """
+    Stage 3: Final refinement with Perplexity.
+    EXACT PROMPT from operational_gcp_image_pipeline-2.ipynb
+    Args:
+        validated_json: dict coming out of URL validation; it is embedded
+            verbatim (as indented JSON) in the refinement prompt.
+    Returns:
+        The refined dict parsed from the model reply. If the reply omits
+        "aliases", the aliases of `validated_json` are copied over. If the reply
+        contains no JSON object at all, `validated_json` is returned unchanged.
+    Raises:
+        Any exception from the Perplexity call or from JSON parsing is re-raised
+        once both retry attempts are used up.
+    """
+    print(f"\n🔄 Refining operational content")
+    try:
+        refine_prompt = f"""You are a senior business operations content specialist.
+This business operations presentation has been validated. Perform the final refinement:
+{json.dumps(validated_json, indent=2)}
+Your tasks:
+1. Ensure image descriptions are specific to business workflows, compliance processes, and decision-making.
+2. Verify slide content emphasizes actionable business value, regulatory relevance, and measurable outcomes.
+3. Confirm terminology is accurate, consistent, and understandable to intermediate business professionals.
+4. Review and refine the URLs:
+   - Select up to 5 of the best URLs only.
+   - Order by: Authority (regulatory bodies first), Relevance, Learning value, Diversity.
+   - Ensure all URLs are authoritative, recent, and relevant.
+5. Keep all slide content exactly the same.
+6. Maintain perfect JSON structure.
+CRITICAL INSTRUCTION:
+- The field "aliases" must remain EXACTLY as provided.
+- Keep "image_description" fields for image generation.
+OUTPUT REQUIREMENT:
+Return ONLY the refined JSON in the exact same schema as the input.
+"""
+        response = perplexity_client.chat.completions.create(
+            model="sonar-pro",
+            messages=[{"role": "user", "content": refine_prompt}],
+            temperature=0.3,
+            max_tokens=4000,
+            timeout=60,
+        )
+        refined_text = response.choices[0].message.content.strip()
+        # Drop a leading ```/```json fence and a trailing ``` fence, if present.
+        refined_text = re.sub(r'^\s*```(?:json)?\s*\n?', '', refined_text, count=1)
+        refined_text = re.sub(r'\s*```\s*$', '', refined_text, count=1)
+        try:
+            result = json.loads(refined_text)
+            parsed_directly = True
+        except json.JSONDecodeError:
+            # The reply may wrap the JSON in prose: fall back to the outermost {...} span.
+            json_match = re.search(r'\{.*\}', refined_text, re.DOTALL)
+            if not json_match:
+                print(f"⚠️ Refinement failed - returning validated content")
+                return validated_json
+            result = json.loads(json_match.group())
+            parsed_directly = False
+        # Restore the aliases if the model dropped the field.
+        if 'aliases' not in result:
+            result['aliases'] = validated_json.get('aliases', [])
+        if parsed_directly:
+            print(f"✅ Refinement successful")
+        return result
+    except Exception as e:
+        print(f"❌ Refinement failed: {type(e).__name__}: {str(e)}")
+        raise
+# Module-level statement: runs once when this module is imported, as a load marker on stdout.
+print("✓ All pipeline functions loaded (Perplexity + Gemini 2.5 Flash/Pro for text)")

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,725 @@
-import altair as alt
-import numpy as np
-import pandas as pd
 import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

 import streamlit as st
+import json
+import os
+from datetime import datetime
+from dotenv import load_dotenv
+# Import custom functions
+from pipelines_functions import (
+    generate_technical_content, correct_technical_content, refine_technical_content,
+    generate_operational_content, correct_operational_content, refine_operational_content
+)
+from utils_functions import (
+    validate_and_sanitize_topic, check_cache, save_to_cache, validate_and_select_urls,
+    get_collections, PipelineMetrics
+)
+from image_generation_functions import process_images_for_pipeline
+load_dotenv()
+# ============================================================
+# PAGE CONFIGURATION
+# ============================================================
+# Page-level settings: title and icon, a wide content layout, and the sidebar
+# collapsed on first load.
+st.set_page_config(
+    page_title="LearnOnTheGo",
+    page_icon="🎓",
+    layout="wide",
+    initial_sidebar_state="collapsed"
+)
+# ============================================================
+# HELPER FUNCTIONS
+# ============================================================
+def sanitize_for_html(raw_text: str) -> str:
+    """Escape HTML special characters for safe embedding.
+    Non-string input is converted with str() first. Besides &, < and >, both
+    quote characters are escaped, so the result is also safe inside a quoted
+    HTML attribute value.
+    """
+    # Local import: this module does not import `html` at the top of the file.
+    from html import escape
+    if not isinstance(raw_text, str):
+        raw_text = str(raw_text)
+    return escape(raw_text, quote=True)
+def detect_code_content(text: str) -> bool:
+    """Detect if content looks like code (has HTML tags, brackets, etc).
+    Markup fragments and braces are matched anywhere in the text. The keywords
+    "function" and "import" must appear as whole words, so prose such as
+    "important" or "functional" is not mistaken for code. Non-string input is
+    converted with str() first, mirroring sanitize_for_html.
+    """
+    # Local import: this module does not import `re` at the top of the file.
+    import re
+    if not isinstance(text, str):
+        text = str(text)
+    markup_markers = ('<div', '<html', 'class=', 'style=', '{', '}')
+    if any(marker in text for marker in markup_markers):
+        return True
+    # Whole-word keywords; "def" keeps its trailing space as in the original marker.
+    return re.search(r'\b(?:function|import)\b|\bdef ', text) is not None
+# ============================================================
+# CUSTOM CSS - FINAL VERSION WITH SEPARATED PROGRESS BAR
+# ============================================================
+st.markdown("""
+<style>
+    /* Root color palette */
+    :root {
+        --primary-blue: #2563eb;
+        --accent-teal: #0891b2;
+        --light-blue: #eff6ff;
+        --light-teal: #e0f2fe;
+        --text-dark: #1e293b;
+        --text-light: #64748b;
+        --border-color: #bae6fd;
+        --shadow: 0 8px 24px rgba(37, 99, 235, 0.12);
+        --shadow-lg: 0 20px 50px rgba(37, 99, 235, 0.25);
+    }
+    /* Overall app styling */
+    .stApp {
+        background: linear-gradient(135deg, #dbeafe 0%, #cffafe 100%);
+    }
+    /* Hide default Streamlit elements */
+    #MainMenu {visibility: hidden;}
+    footer {visibility: hidden;}
+    header {visibility: hidden;}
+    /* Main container */
+    .block-container {
+        max-width: 1400px;
+        padding-top: 2rem;
+        padding-bottom: 2rem;
+    }
+    /* Header styling */
+    .header-container {
+        background: linear-gradient(110deg, var(--primary-blue) 0%, var(--accent-teal) 100%);
+        padding: 60px 30px;
+        border-radius: 28px;
+        text-align: center;
+        margin-bottom: 40px;
+        box-shadow: var(--shadow-lg);
+    }
+    .header-container h1 {
+        color: white;
+        font-size: 62px;
+        margin: 0;
+        font-weight: 900;
+        letter-spacing: 3px;
+        text-shadow: 0 4px 12px rgba(0, 0, 0, 0.2);
+    }
+    /* Search box styling */
+    .search-container {
+        background: linear-gradient(135deg, #ffffff 0%, #f8fafc 100%);
+        padding: 35px;
+        border-radius: 24px;
+        margin-bottom: 30px;
+        border: 3px solid var(--border-color);
+        box-shadow: var(--shadow);
+    }
+    /* Text input styling */
+    .stTextInput > div > div > input {
+        border: 3px solid var(--accent-teal) !important;
+        border-radius: 14px !important;
+        padding: 16px 20px !important;
+        font-size: 17px !important;
+        background-color: white !important;
+        transition: all 0.3s ease !important;
+    }
+    .stTextInput > div > div > input:focus {
+        border-color: var(--primary-blue) !important;
+        box-shadow: 0 0 0 4px rgba(37, 99, 235, 0.15) !important;
+        outline: none !important;
+    }
+    .stTextInput > div > div > input::placeholder {
+        color: rgba(100, 116, 139, 0.6) !important;
+        font-weight: 500 !important;
+    }
+    /* Radio container */
+    .stRadio > div[role="radiogroup"] {
+        display: flex !important;
+        gap: 0 !important;
+        background: #e0e7ff !important;
+        border-radius: 14px !important;
+        padding: 4px !important;
+        border: 3px solid var(--border-color) !important;
+        width: fit-content !important;
+        margin: 0 auto !important;
+    }
+    /* Individual radio labels */
+    .stRadio > div[role="radiogroup"] > label {
+        background: transparent !important;
+        border: none !important;
+        padding: 12px 32px !important;
+        border-radius: 10px !important;
+        cursor: pointer !important;
+        transition: all 0.3s ease !important;
+        font-weight: 700 !important;
+        color: var(--text-light) !important;
+        font-size: 15px !important;
+        text-align: center !important;
+        min-width: 140px !important;
+        margin: 0 !important;
+        flex: 1 !important;
+    }
+    .stRadio > div[role="radiogroup"] > label:hover {
+        background: rgba(255, 255, 255, 0.5) !important;
+    }
+    .stRadio > div[role="radiogroup"] > label[data-checked="true"],
+    .stRadio > div[role="radiogroup"] > label:has(input:checked),
+    .stRadio > div[role="radiogroup"] > label[aria-checked="true"] {
+        background: linear-gradient(110deg, var(--primary-blue) 0%, var(--accent-teal) 100%) !important;
+        color: white !important;
+        box-shadow: 0 4px 12px rgba(37, 99, 235, 0.35) !important;
+    }
+    .stRadio input[type="radio"] {
+        display: none !important;
+    }
+    /* Button styling */
+    .stButton > button {
+        background: linear-gradient(100deg, var(--primary-blue) 0%, var(--accent-teal) 100%) !important;
+        color: white !important;
+        font-weight: 800 !important;
+        padding: 18px 40px !important;
+        border-radius: 14px !important;
+        border: none !important;
+        transition: all 0.3s ease !important;
+        box-shadow: 0 6px 20px rgba(37, 99, 235, 0.3) !important;
+        font-size: 17px !important;
+        letter-spacing: 0.5px !important;
+    }
+    .stButton > button:hover {
+        background: linear-gradient(100deg, #1d4ed8 0%, #0e7490 100%) !important;
+        transform: translateY(-3px) !important;
+        box-shadow: 0 10px 30px rgba(37, 99, 235, 0.4) !important;
+    }
+    /* SLIDE BOX - WITHOUT PROGRESS BAR */
+    .slide-box-wrapper {
+        background: white;
+        border-radius: 28px;
+        border: 3px solid var(--border-color);
+        box-shadow: 0 20px 60px rgba(37, 99, 235, 0.18);
+        padding: 50px 45px;
+        margin: 40px auto;
+        max-width: 1000px;
+        animation: slideIn 0.5s ease-out;
+    }
+    @keyframes slideIn {
+        from {
+            opacity: 0;
+            transform: translateY(30px);
+        }
+        to {
+            opacity: 1;
+            transform: translateY(0);
+        }
+    }
+    /* Slide content */
+    .slide-box-wrapper > * {
+        display: block !important;
+        width: 100% !important;
+        box-sizing: border-box !important;
+    }
+    /* Slide title */
+    .slide-box-wrapper h2 {
+        font-size: 42px;
+        font-weight: 900;
+        color: var(--primary-blue);
+        margin: 0 0 20px 0 !important;
+        letter-spacing: 1px;
+        line-height: 1.3;
+        text-shadow: 0 2px 8px rgba(37, 99, 235, 0.15);
+        word-wrap: break-word;
+        overflow-wrap: break-word;
+        text-align: center;
+    }
+    /* Slide text and paragraphs */
+    .slide-box-wrapper p {
+        font-size: 20px;
+        color: var(--text-dark);
+        line-height: 2.2;
+        margin: 0 0 24px 0 !important;
+        font-weight: 500;
+        text-align: left;
+        padding: 0 20px;
+        box-sizing: border-box;
+    }
+    /* Code block styling */
+    .slide-box-wrapper pre {
+        background: #f8fafc;
+        border-radius: 12px;
+        padding: 20px;
+        font-family: 'Consolas', 'Monaco', 'Courier New', monospace;
+        font-size: 14px;
+        color: #0f172a;
+        overflow-x: auto;
+        border: 2px solid #e0e7ff;
+        margin: 16px 20px !important;
+        text-align: left;
+        line-height: 1.6;
+        white-space: pre-wrap;
+        word-wrap: break-word;
+    }
+    /* Images inside slide box */
+    .slide-box-wrapper img {
+        max-width: 90% !important;
+        height: auto !important;
+        border-radius: 20px !important;
+        box-shadow: 0 8px 24px rgba(37, 99, 235, 0.2) !important;
+        display: block !important;
+        margin: 24px auto !important;
+    }
+    /* Learning Resources */
+    .resources-section {
+        margin-top: 32px;
+        padding-top: 28px;
+        border-top: 3px solid var(--border-color);
+        text-align: left;
+    }
+    .resources-section h4 {
+        color: var(--primary-blue);
+        font-size: 22px;
+        margin: 0 0 18px 0 !important;
+        font-weight: 800;
+        text-align: center;
+    }
+    .resources-section a {
+        color: var(--accent-teal);
+        text-decoration: none;
+        font-weight: 600;
+        transition: all 0.3s ease;
+        display: block;
+        padding: 10px 15px;
+        font-size: 16px;
+        border-radius: 8px;
+        margin-bottom: 8px;
+    }
+    .resources-section a:hover {
+        color: white;
+        background: var(--primary-blue);
+        padding-left: 20px;
+        box-shadow: 0 4px 12px rgba(37, 99, 235, 0.2);
+    }
+    /* PROGRESS CONTAINER - MOVED OUTSIDE BOX */
+    .progress-container {
+        display: flex;
+        align-items: center;
+        justify-content: center;
+        gap: 20px;
+        margin: 30px auto;
+        padding: 20px 0;
+        max-width: 1000px;
+        width: 100%;
+    }
+    .progress-bar {
+        flex: 1;
+        max-width: 700px;
+        height: 10px;
+        background: #e0e7ff;
+        border-radius: 12px;
+        overflow: hidden;
+        box-shadow: inset 0 2px 4px rgba(37, 99, 235, 0.1);
+    }
+    .progress-fill {
+        height: 100%;
+        background: linear-gradient(90deg, var(--primary-blue) 0%, var(--accent-teal) 100%);
+        transition: width 0.4s ease;
+        box-shadow: 0 0 10px rgba(37, 99, 235, 0.4);
+    }
+    /* Slide counter badge */
+    .slide-counter-badge {
+        background: linear-gradient(110deg, var(--primary-blue) 0%, var(--accent-teal) 100%);
+        color: white;
+        padding: 10px 20px;
+        border-radius: 24px;
+        font-size: 16px;
+        font-weight: 800;
+        min-width: 90px;
+        text-align: center;
+        box-shadow: 0 4px 12px rgba(37, 99, 235, 0.3);
+        white-space: nowrap;
+    }
+    /* Footer */
+    .footer-bar {
+        margin-top: 60px;
+        text-align: center;
+        color: var(--text-dark);
+        font-size: 16px;
+        letter-spacing: 0.5px;
+        padding: 32px 0;
+        border-top: 3px solid var(--border-color);
+        background: white;
+        border-radius: 20px;
+        box-shadow: 0 4px 16px rgba(37, 99, 235, 0.08);
+    }
+    .footer-bar p {
+        margin: 10px 0;
+        font-weight: 600;
+    }
+    .footer-bar p:last-child {
+        font-size: 14px;
+        color: var(--text-light);
+        font-weight: 500;
+    }
+    /* Alert messages */
+    .stSuccess {
+        border-radius: 14px !important;
+        border-left: 5px solid #10b981 !important;
+        background-color: #ecfdf5 !important;
+        padding: 16px !important;
+        font-weight: 600 !important;
+    }
+    .stInfo {
+        border-radius: 14px !important;
+        border-left: 5px solid var(--primary-blue) !important;
+        background-color: #f0f9ff !important;
+        padding: 16px !important;
+        font-weight: 600 !important;
+    }
+    .stError {
+        border-radius: 14px !important;
+        border-left: 5px solid #ef4444 !important;
+        background-color: #fef2f2 !important;
+        padding: 16px !important;
+        font-weight: 600 !important;
+    }
+    /* Mobile responsive */
+    @media (max-width: 768px) {
+        .header-container h1 {
+            font-size: 40px;
+        }
+        .slide-box-wrapper {
+            padding: 30px 20px;
+        }
+        .slide-box-wrapper h2 {
+            font-size: 30px;
+        }
+        .slide-box-wrapper p {
+            font-size: 17px;
+            line-height: 1.9;
+            padding: 0 10px;
+        }
+        .progress-container {
+            flex-direction: row;
+            gap: 15px;
+            margin: 20px auto;
+            padding: 15px 0;
+        }
+        .progress-bar {
+            width: 100%;
+            max-width: none;
+        }
+    }
+</style>
+""", unsafe_allow_html=True)
+# ============================================================
+# SESSION STATE INITIALIZATION
+# ============================================================
+# Seed each session key with its default; keys that already exist keep their value.
+for _state_key, _initial_value in (
+    ("current_slide", 0),
+    ("slides_data", None),
+    ("search_query", ""),
+    ("mode", "technical"),
+    ("is_loading", False),
+    ("error_message", None),
+    ("metrics", None),
+):
+    if _state_key not in st.session_state:
+        st.session_state[_state_key] = _initial_value
+# ============================================================
+# PIPELINE FUNCTION
+# ============================================================
+def run_pipeline(query, mode):
+    """Execute the 5-stage pipeline with metrics tracking.
+    Stages: Generate -> Correct -> Validate URLs -> Refine -> Generate Images,
+    preceded by a cache lookup that short-circuits the run on a hit.
+    Args:
+        query: raw topic text; it is passed through validate_and_sanitize_topic.
+        mode: "technical" selects the technical stage functions and collection;
+            any other value selects the operational ones.
+    Returns:
+        (success, message): success is False when any stage raised. On success
+        the slides are stored in st.session_state.slides_data and the slide
+        index is reset to 0.
+    """
+    metrics = None
+    try:
+        metrics = PipelineMetrics(query, mode)
+        query = validate_and_sanitize_topic(query)
+        technical_col, operational_col, db = get_collections()
+        collection = operational_col if mode == "operational" else technical_col
+        # Cache check
+        metrics.start_stage("Cache Check")
+        cached_content, is_cached = check_cache(query, collection)
+        if is_cached:
+            metrics.set_cache_hit("mongodb")
+        metrics.end_stage("Cache Check")
+        if is_cached:
+            st.session_state.slides_data = cached_content
+            st.session_state.current_slide = 0
+            metrics.end()
+            metrics.save_metrics()
+            return True, "✅ Retrieved from cache (instant!)"
+        # Both modes run the same stage sequence; only the stage functions differ.
+        if mode == "technical":
+            generate = generate_technical_content
+            correct = correct_technical_content
+            refine = refine_technical_content
+            image_mode = "technical"
+        else:
+            generate = generate_operational_content
+            correct = correct_operational_content
+            refine = refine_operational_content
+            image_mode = "operational"
+        st.session_state.is_loading = True
+        with st.spinner(f"🔄 Generating {mode} content with images (5 stages)..."):
+            metrics.start_stage("Generate")
+            generated = generate(query)
+            metrics.end_stage("Generate", f"{len(generated.get('content', []))} slides")
+            metrics.start_stage("Correct")
+            corrected = correct(generated)
+            metrics.end_stage("Correct", "Content improved")
+            metrics.start_stage("Validate URLs")
+            validated, _ = validate_and_select_urls(corrected)
+            metrics.end_stage("Validate URLs", f"{len(validated.get('urls', []))} URLs validated")
+            metrics.start_stage("Refine")
+            refined = refine(validated)
+            metrics.end_stage("Refine", "Content refined")
+            metrics.start_stage("Generate Images")
+            final_result = process_images_for_pipeline(refined, mode=image_mode)
+            metrics.end_stage("Generate Images", "Images generated")
+            save_to_cache(query, final_result, collection)
+            st.session_state.slides_data = final_result
+            st.session_state.current_slide = 0
+            st.session_state.is_loading = False
+        pipeline_metrics = metrics.end()
+        metrics.save_metrics()
+        st.session_state.metrics = pipeline_metrics
+        total_time = pipeline_metrics.get('total_duration_seconds', 0)
+        return True, f"✅ Generated {len(final_result.get('content', []))} slides in {total_time:.1f}s!"
+    except Exception as e:
+        st.session_state.is_loading = False
+        st.session_state.error_message = str(e)
+        # Record the failure so failed runs show up in the stored metrics too.
+        # save_metrics() handles its own errors and cannot mask the original one.
+        if metrics is not None:
+            metrics.set_error(str(e))
+            metrics.end()
+            metrics.save_metrics()
+        return False, f"❌ Error: {str(e)}"
+# ============================================================
+# DISPLAY SLIDE FUNCTION - PROGRESS BAR OUTSIDE BOX
+# ============================================================
+def display_slide(slide_index):
+    """Display current slide with progress bar OUTSIDE the white box.
+    Renders, in order: the slide box (title, body, optional image, and on the
+    last slide the learning resources), the progress bar with the slide
+    counter, and the previous/next buttons. Returns without rendering when no
+    slides are stored or `slide_index` is past the end.
+    Args:
+        slide_index: zero-based index into st.session_state.slides_data['content'].
+    """
+    # Local import: this module does not import `html` at the top of the file.
+    from html import escape as escape_attr
+    if not st.session_state.slides_data:
+        return
+    slides = st.session_state.slides_data.get('content', [])
+    if not slides or slide_index >= len(slides):
+        return
+    slide = slides[slide_index]
+    total_slides = len(slides)
+    progress_percent = ((slide_index + 1) / total_slides) * 100
+    # Build the slide box HTML (WITHOUT progress bar)
+    title = sanitize_for_html(slide.get("slide_title", ""))
+    # A JSON null arrives as None; treat it as empty text instead of crashing below.
+    raw_content = slide.get("slide_content") or ""
+    # Determine content type
+    if detect_code_content(raw_content):
+        sanitized_content = f"<pre>{sanitize_for_html(raw_content)}</pre>"
+    else:
+        sanitized_content = f"<p>{sanitize_for_html(raw_content)}</p>"
+    # Start building the slide HTML (NO progress bar inside)
+    slide_html = f"""
+    <div class="slide-box-wrapper">
+        <h2>{title}</h2>
+        {sanitized_content}
+    """
+    # Add image if available: the pipeline leaves an http(s) URL in image_description
+    img_url = slide.get('image_description')
+    if isinstance(img_url, str) and img_url.startswith('http'):
+        # Escape the URL so quotes or angle brackets cannot break out of the attribute.
+        safe_img_url = escape_attr(img_url, quote=True)
+        slide_html += f'<img src="{safe_img_url}" alt="Slide image" style="max-width: 90%; height: auto; display: block; margin: 24px auto; border-radius: 20px; box-shadow: 0 8px 24px rgba(37, 99, 235, 0.2);">'
+    # Add learning resources (last slide only)
+    if slide_index == total_slides - 1:
+        urls = st.session_state.slides_data.get('urls', [])
+        if urls:
+            slide_html += '<div class="resources-section"><h4>📚 Learning Resources</h4>'
+            for i, url_obj in enumerate(urls, 1):
+                # Tolerate a plain URL string where a {"title", "url"} dict is expected.
+                if isinstance(url_obj, str):
+                    url_obj = {'title': url_obj, 'url': url_obj}
+                url_title = sanitize_for_html(url_obj.get('title', 'Documentation'))
+                url = escape_attr(str(url_obj.get('url', '#')), quote=True)
+                # rel="noopener noreferrer": the new tab gets no handle on this page.
+                slide_html += f'<a href="{url}" target="_blank" rel="noopener noreferrer">{i}. {url_title}</a>'
+            slide_html += '</div>'
+    # Close the slide box (NO progress bar)
+    slide_html += '</div>'
+    # Render the slide box
+    st.markdown(slide_html, unsafe_allow_html=True)
+    # RENDER PROGRESS BAR OUTSIDE THE BOX
+    progress_html = f"""
+    <div class="progress-container">
+        <div class="progress-bar">
+            <div class="progress-fill" style="width: {progress_percent}%"></div>
+        </div>
+        <div class="slide-counter-badge">{slide_index + 1} / {total_slides}</div>
+    </div>
+    """
+    st.markdown(progress_html, unsafe_allow_html=True)
+    # Navigation buttons below; each click moves the stored index and reruns the script
+    st.markdown('<br>', unsafe_allow_html=True)
+    col_left, col_center, col_right = st.columns([1, 8, 1])
+    with col_left:
+        if slide_index > 0:
+            if st.button("⬅", key="prev_btn", help="Previous slide", use_container_width=True):
+                st.session_state.current_slide -= 1
+                st.rerun()
+    with col_right:
+        if slide_index < total_slides - 1:
+            if st.button("➡", key="next_btn", help="Next slide", use_container_width=True):
+                st.session_state.current_slide += 1
+                st.rerun()
+# ============================================================
+# PAGE LAYOUT
+# ============================================================
+# Header
+# Static title banner; the header-container class is styled by the CSS block above.
+st.markdown(
+    '<div class="header-container"><h1>🎓 LearnOnTheGo</h1></div>',
+    unsafe_allow_html=True
+)
+# Search container
+# Opening tag only: the matching </div> is emitted after the two columns below.
+st.markdown('<div class="search-container">', unsafe_allow_html=True)
+col1, col2 = st.columns([3, 1])
+with col1:
+    # Topic text box, pre-filled with the query kept in session state.
+    search_query = st.text_input(
+        "Search",
+        value=st.session_state.search_query,
+        placeholder="e.g., Python, Machine Learning, Cloud Computing...",
+        key="search_input",
+        label_visibility="collapsed"
+    )
+    # Copy the typed value back into session state for later reruns.
+    st.session_state.search_query = search_query
+with col2:
+    # Technical/Operational toggle; the preselected option mirrors the stored mode.
+    mode = st.radio(
+        "Mode",
+        options=["Technical", "Operational"],
+        index=0 if st.session_state.mode == "technical" else 1,
+        key="mode_radio",
+        horizontal=True,
+        label_visibility="collapsed"
+    )
+    # Stored lower-cased ("technical" / "operational"), the form run_pipeline compares against.
+    st.session_state.mode = mode.lower()
+st.markdown('</div>', unsafe_allow_html=True)
+# Generate button
+# Placed in the middle of three columns (1:2:1) so it sits centred on the page.
+col1, col2, col3 = st.columns([1, 2, 1])
+with col2:
+    search_button = st.button("🔍 Generate Slides", key="search_btn", use_container_width=True)
+# Error handling
+# Show (once) any error message left in session state by an earlier run.
+if st.session_state.error_message:
+    st.error(st.session_state.error_message)
+    st.session_state.error_message = None
+# Execute pipeline
+# Runs only on the rerun triggered by the button, and only for a non-empty query.
+if search_button and st.session_state.search_query:
+    success, message = run_pipeline(st.session_state.search_query, st.session_state.mode)
+    if success:
+        st.success(message)
+    else:
+        st.error(message)
+        # run_pipeline also stores the error text in session state. It has just
+        # been displayed, so clear it; otherwise the same error would reappear
+        # on the next rerun, even above a later successful result.
+        st.session_state.error_message = None
+# Display slides
+if st.session_state.slides_data:
+    st.markdown("---")
+    # Guard against a stale index (e.g. a new deck with fewer slides): restart at the first slide.
+    if st.session_state.current_slide >= len(st.session_state.slides_data.get('content', [])):
+        st.session_state.current_slide = 0
+    display_slide(st.session_state.current_slide)
+else:
+    # Nothing generated yet: show the getting-started hint instead.
+    st.info("👆 Enter a topic and click 'Generate Slides' to get started!")
+# Footer
+# Static credits block; the footer-bar class is styled by the CSS block above.
+st.markdown(
+    """<div class="footer-bar">
+    <p><strong>LearnOnTheGo</strong> • Powered by AI • Built with Streamlit</p>
+    <p>5-Stage Pipeline: Generate → Correct → Validate → Refine → Generate Images</p>
+    <p>Gemini 2.5 Flash (text) • Gemini 2.5 Flash Image (images) • Perplexity Sonar Pro</p>
+</div>""",
+    unsafe_allow_html=True
+)
+# Console marker written to stdout as the last statement of the script.
+print("✅ LearnOnTheGo - Progress bar moved outside box - Fixed!")

src/utils_functions.py ADDED Viewed

	@@ -0,0 +1,346 @@

+import os
+import json
+import pickle
+import hashlib
+import httpx
+from datetime import datetime, timezone
+from pymongo import MongoClient
+from tenacity import retry, stop_after_attempt, wait_exponential
+from dotenv import load_dotenv
+load_dotenv()
+# ============================================================
+# PIPELINE METRICS CLASS (Complete tracking system)
+# ============================================================
+class PipelineMetrics:
+    """
+    Complete metrics tracking for pipeline execution.
+    Tracks timing, stages, cache hits, and saves to MongoDB.
+    Usage: wrap each stage in start_stage()/end_stage(), then call end() to
+    build the summary dict and save_metrics() to persist it.
+    """
+    def __init__(self, topic, mode):
+        """Initialize metrics tracker for one run of `mode` on `topic`."""
+        self.topic = topic
+        self.mode = mode
+        self.start_time = datetime.now(timezone.utc)
+        # "<mode>_<epoch seconds>", derived from the same instant as start_time
+        self.run_id = f"{mode}_{int(self.start_time.timestamp())}"
+        self.stages = {}
+        self.current_stage = None
+        self.current_stage_start = None
+        # Start time per stage name, so end_stage() times the stage it is asked about
+        self._stage_starts = {}
+        self.cache_hit = False
+        self.cache_type = None
+        self.error_occurred = False
+        self.error_message = None
+        # Summary dict; stays None until end() is called
+        self.metrics = None
+    def start_stage(self, stage_name):
+        """Start tracking a stage"""
+        started_at = datetime.now(timezone.utc)
+        self.current_stage = stage_name
+        self.current_stage_start = started_at
+        self._stage_starts[stage_name] = started_at
+        print(f"   📊 [METRICS] Starting: {stage_name}")
+    def end_stage(self, stage_name, output_summary=None):
+        """End tracking a stage.
+        Records the duration since the matching start_stage() call. A stage
+        that was never started is ignored, so no stale start time is reused.
+        """
+        started_at = self._stage_starts.pop(stage_name, None)
+        if started_at is None:
+            return
+        finished_at = datetime.now(timezone.utc)
+        duration = (finished_at - started_at).total_seconds()
+        self.stages[stage_name] = {
+            "duration_seconds": duration,
+            "timestamp": finished_at,
+            "output_summary": output_summary
+        }
+        if self.current_stage == stage_name:
+            self.current_stage = None
+            self.current_stage_start = None
+        print(f"   ✓ Stage '{stage_name}' completed in {duration:.2f}s")
+    def set_cache_hit(self, cache_type="mongodb"):
+        """Record cache hit"""
+        self.cache_hit = True
+        self.cache_type = cache_type
+        print(f"   💾 Cache hit: {cache_type}")
+    def set_error(self, error_message):
+        """Record error"""
+        self.error_occurred = True
+        self.error_message = error_message
+        print(f"   ❌ Error: {error_message}")
+    def end(self):
+        """End pipeline tracking and return the summary dict (also kept in self.metrics)."""
+        completed_at = datetime.now(timezone.utc)
+        total_duration = (completed_at - self.start_time).total_seconds()
+        self.metrics = {
+            "run_id": self.run_id,
+            "topic": self.topic,
+            "mode": self.mode,
+            "started_at": self.start_time,
+            "completed_at": completed_at,
+            "total_duration_seconds": total_duration,
+            "stages": self.stages,
+            "cache_hit": self.cache_hit,
+            "cache_type": self.cache_type,
+            "error_occurred": self.error_occurred,
+            "error_message": self.error_message
+        }
+        print(f"\n   📊 Pipeline Complete: {total_duration:.2f}s total")
+        return self.metrics
+    def save_metrics(self):
+        """Save metrics to MongoDB.
+        Best-effort: returns True on success and False when MONGO_URI is unset,
+        end() has not been called yet, or any database error occurs. Never raises.
+        """
+        client = None
+        try:
+            mongo_uri = os.getenv("MONGO_URI")
+            if not mongo_uri:
+                print("   ⚠️ MONGO_URI not set - skipping metrics save")
+                return False
+            if self.metrics is None:
+                print("   ⚠️ Could not save metrics: end() has not been called")
+                return False
+            client = MongoClient(mongo_uri, serverSelectionTimeoutMS=5000)
+            db = client["learnToGo"]
+            # Collections based on mode
+            if self.mode == "technical":
+                metrics_col = db["pipelinemetrics"]
+                stages_col = db["stageoutputs"]
+            else:
+                metrics_col = db["operational_pipeline_metrics"]
+                stages_col = db["operational_stage_outputs"]
+            # Save a copy: insert_one() adds an "_id" key to the dict it is given,
+            # which would otherwise leak into self.metrics (and break a second save).
+            metrics_col.insert_one(dict(self.metrics))
+            # Save stage details
+            for stage_name, stage_data in self.stages.items():
+                stage_doc = {
+                    "run_id": self.run_id,
+                    "topic": self.topic,
+                    "mode": self.mode,
+                    "stage_name": stage_name,
+                    "stage_data": stage_data
+                }
+                stages_col.insert_one(stage_doc)
+            print(f"   ✓ Metrics saved to MongoDB")
+            return True
+        except Exception as e:
+            print(f"   ⚠️ Could not save metrics: {e}")
+            return False
+        finally:
+            # Release the client opened for this save; one was leaked per call before.
+            if client is not None:
+                client.close()
+# ============================================================
+# MONGODB CONNECTION & COLLECTIONS
+# ============================================================
+@retry(
+    stop=stop_after_attempt(3),
+    wait=wait_exponential(multiplier=1, min=2, max=10)
+)
+def get_mongo_client():
+    """Get MongoDB client from environment variables"""
+    mongo_uri = os.getenv("MONGO_URI")
+    if not mongo_uri:
+        raise ValueError("MONGO_URI not set in .env")
+    return MongoClient(mongo_uri, serverSelectionTimeoutMS=5000)
+def get_collections():
+    """Get MongoDB collections for Technical and Operational keywords"""
+    client = get_mongo_client()
+    db = client["learnToGo"]
+    technical_collection = db["Keywords"]
+    operational_collection = db["OperationalKeywords"]
+    # Create indexes
+    technical_collection.create_index("aliases")
+    operational_collection.create_index("aliases")
+    return technical_collection, operational_collection, db
+# ============================================================
+# URL CACHING (Pickle-based - FIXED with proper dict structure)
+# ============================================================
+URL_CACHE_FILE = "/tmp/url_validation_cache.pkl"
+def load_url_cache():
+    """Load URL validation cache from pickle file"""
+    try:
+        if os.path.exists(URL_CACHE_FILE):
+            with open(URL_CACHE_FILE, 'rb') as f:
+                cache = pickle.load(f)
+            print(f"✓ Loaded URL cache with {len(cache)} entries")
+            return cache
+    except Exception as e:
+        print(f"⚠️ Could not load URL cache: {e}")
+    return {}
+def save_url_cache(cache):
+    """Save URL validation cache to pickle file"""
+    try:
+        with open(URL_CACHE_FILE, 'wb') as f:
+            pickle.dump(cache, f)
+        print(f"✓ Saved URL cache with {len(cache)} entries")
+        return True
+    except Exception as e:
+        print(f"⚠️ Could not save URL cache: {e}")
+        return False
+def get_url_hash(url):
+    """Generate MD5 hash for URL as cache key"""
+    return hashlib.md5(url.encode()).hexdigest()
+@retry(
+    stop=stop_after_attempt(2),
+    wait=wait_exponential(multiplier=1, min=2, max=5)
+)
+def validate_url_cached(url, timeout=5):
+    """Check if URL is valid with cache check - FIXED to return dict"""
+    url_hash = get_url_hash(url)
+    # Load cache
+    url_cache = load_url_cache()
+    # Check cache
+    if url_hash in url_cache:
+        print(f"   💾 URL cache hit: {url[:50]}...")
+        return url_cache[url_hash]['valid']  # ← Returns boolean from dict
+    # Validate URL
+    try:
+        response = httpx.head(url, timeout=timeout, follow_redirects=True)
+        is_valid = response.status_code in [200, 301, 302, 303, 307, 308]
+    except:
+        try:
+            response = httpx.get(url, timeout=timeout, follow_redirects=True)
+            is_valid = response.status_code == 200
+        except:
+            is_valid = False
+    # Save to cache as DICT with valid, checked_at, url
+    url_cache[url_hash] = {
+        'valid': is_valid,
+        'checked_at': datetime.now(timezone.utc).isoformat(),
+        'url': url
+    }
+    save_url_cache(url_cache)
+    print(f"   ✓ URL validated: {url[:50]}... = {is_valid}")
+    return is_valid
+# ============================================================
+# CACHE OPERATIONS
+# ============================================================
+@retry(
+    stop=stop_after_attempt(3),
+    wait=wait_exponential(multiplier=1, min=2, max=10)
+)
+def check_cache(topic, collection):
+    """
+    Check MongoDB cache using normalized keyword - NO LLM call!
+    Includes retry logic for connection failures.
+    """
+    try:
+        normalized = topic.lower().strip()
+        print(f"🔍 Checking cache for: {normalized}")
+        cached = collection.find_one({"aliases": normalized})
+        if cached:
+            print(f"✅ CACHE HIT! Found topic: {cached['topic']}")
+            return cached['content'], True
+        else:
+            print(f"❌ CACHE MISS - Will run full pipeline")
+            return None, False
+    except Exception as e:
+        print(f"❌ Cache lookup error: {e}")
+        raise
+@retry(
+    stop=stop_after_attempt(3),
+    wait=wait_exponential(multiplier=1, min=2, max=10)
+)
+def save_to_cache(topic, content, collection):
+    """
+    Save generated slides to MongoDB.
+    Includes retry logic for connection failures.
+    """
+    try:
+        aliases = content.get('aliases', [topic.lower().strip()])
+        document = {
+            "topic": content.get('topic', topic),
+            "aliases": aliases,
+            "createdAt": datetime.now(timezone.utc),
+            "content": content
+        }
+        result = collection.insert_one(document)
+        print(f"✅ Saved to MongoDB - Document ID: {result.inserted_id}")
+        return result.inserted_id
+    except Exception as e:
+        print(f"❌ Cache save error: {e}")
+        raise
+# ============================================================
+# URL VALIDATION & SELECTION
+# ============================================================
+def validate_and_select_urls(corrected_json):
+    """
+    Validate ALL URLs and select best ones.
+    Uses cached validation to avoid repeated HTTP requests.
+    """
+    urls = corrected_json.get("urls", [])
+    print(f"Validating {len(urls)} URLs with caching...")
+    valid_urls = []
+    validation_results = []
+    for url_obj in urls:
+        url = url_obj.get("url")
+        if url:
+            is_valid = validate_url_cached(url)
+            validation_results.append({
+                "url": url,
+                "title": url_obj.get("title"),
+                "valid": is_valid
+            })
+            if is_valid:
+                valid_urls.append(url_obj)
+    # Keep only best 5 URLs
+    valid_urls = valid_urls[:5]
+    print(f"✓ Kept {len(valid_urls)} valid URLs")
+    corrected_json["urls"] = valid_urls
+    return corrected_json, validation_results
+# ============================================================
+# INPUT VALIDATION (50 char limit for both technical and operational)
+# ============================================================
+@retry(
+    stop=stop_after_attempt(3),
+    wait=wait_exponential(multiplier=1, min=1, max=3)
+)
+def validate_and_sanitize_topic(topic):
+    """
+    Validate and sanitize user input before pipeline.
+    Prevents errors and invalid topics.
+    FIXED: Both technical and operational now have 50 char limit
+    """
+    if not topic or not topic.strip():
+        raise ValueError("❌ Topic cannot be empty.")
+    topic = topic.strip()
+    if len(topic) < 1:
+        raise ValueError("❌ Topic must be at least 1 character long.")
+    if len(topic) > 50:
+        raise ValueError("❌ Topic cannot exceed 50 characters.")
+    print(f"✅ Input validated: '{topic}'")
+    return topic
+print("✓ All utility functions ready with metrics, URL caching, and retry logic")