Spaces:
Sleeping
Sleeping
| import os | |
| import json | |
| import re | |
| from openai import OpenAI | |
| import google.generativeai as genai | |
| from tenacity import retry, stop_after_attempt, wait_exponential | |
| from dotenv import load_dotenv | |
# Load variables from a local .env file before the keys are read below.
load_dotenv()

# ============================================================
# API INITIALIZATION
# ============================================================
# Both keys are looked up in the process environment; a missing variable
# yields None.
PERPLEXITY_API_KEY = os.environ.get("PERPLEXITY_API_KEY")
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

# Fail at import time when a key is missing or empty, so no pipeline stage
# can run with an unconfigured client.
if not PERPLEXITY_API_KEY:
    raise ValueError("β PERPLEXITY_API_KEY not set in .env")

if not GEMINI_API_KEY:
    raise ValueError("β GEMINI_API_KEY not set in .env")

# The OpenAI client is pointed at the Perplexity endpoint via base_url.
perplexity_client = OpenAI(
    base_url="https://api.perplexity.ai",
    api_key=PERPLEXITY_API_KEY,
)

# Register the Gemini key with the google.generativeai module.
genai.configure(api_key=GEMINI_API_KEY)
| # ============================================================ | |
| # TECHNICAL PIPELINE | |
| # ============================================================ | |
def generate_technical_content(topic):
    """
    Stage 1: Generate technical slides using Perplexity.
    EXACT PROMPT from technical_gcp_image_pipeline-1.ipynb

    Asks the "sonar-pro" model for a JSON slide deck about ``topic`` and
    returns the parsed reply. When the reply is not pure JSON, the widest
    ``{...}`` span found in it is parsed instead. A reply without an
    "aliases" key gets one holding the lowercased, stripped topic.

    Raises:
        ValueError: the reply contains no ``{...}`` span to parse.
        Exception: any other failure (API error, invalid JSON inside the
            extracted span, ...) is printed and re-raised unchanged.
    """
    print(f"\nπ Generating technical content for: {topic}")
    try:
        system_prompt = f"""You are a domain expert in technology and IT infrastructure with deep knowledge across all technology domains.
Task:
For the topic "{topic}", generate 9 to 10 slides as JSON.
Instructions:
- Write universally applicable content that any technology professional can understand and use.
- Each slide should have an engaging and concise "slide_title" (maximum 6 words).
- "slide_content" must be 3-4 sentences (strictly 40-60 words) with technical depth and practical relevance.
- For the 3 most critical slides ONLY, add "image_description" (strictly 30-40 words) describing specific technical diagrams.
- First slide: Overview explaining why this technology matters universally.
- Last slide: "Further Learning & Documentation" with placeholder for 5 curated URLs.
- Use clear, accessible language. Avoid industry-specific jargon.
- For all other slides, set image_description to null.
Additional Requirement β ALIASES FIELD:
- Generate 6-7 lowercase alternative names/synonyms for "{topic}".
- First alias MUST be the normalized lowercase form of the topic.
- Include abbreviations and common variations.
Output ONLY valid JSON (no code blocks, no markdown):
{{
"topic": "{topic}",
"aliases": ["primary lowercase form", "alias2", "alias3", ...],
"content": [
{{
"slide_title": "...",
"slide_content": "...",
"image_description": "..." or null
}}
],
"urls": [
{{"title": "...", "url": "https://..."}},
...
]
}}
"""
        user_message = f"Generate a universally applicable technical presentation on {topic}"
        chat_messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message},
        ]
        response = perplexity_client.chat.completions.create(
            model="sonar-pro",
            messages=chat_messages,
            temperature=0.5,
            max_tokens=4000,
            timeout=60,
        )
        raw_reply = response.choices[0].message.content

        try:
            deck = json.loads(raw_reply)
        except json.JSONDecodeError:
            # The reply was not pure JSON: take the span from the first
            # "{" to the last "}" and parse that instead.
            embedded = re.search(r'\{.*\}', raw_reply, re.DOTALL)
            if embedded is None:
                raise ValueError("Could not parse JSON from response")
            deck = json.loads(embedded.group())
            if 'aliases' not in deck:
                deck['aliases'] = [topic.lower().strip()]
            return deck
        else:
            # Clean parse: back-fill aliases if absent and report the
            # number of slides received.
            if 'aliases' not in deck:
                deck['aliases'] = [topic.lower().strip()]
            slide_count = len(deck.get('content', []))
            print(f"β Generation successful - {slide_count} slides")
            return deck
    except Exception as exc:
        print(f"β Generation failed: {type(exc).__name__}: {exc}")
        raise
def correct_technical_content(generated_json):
    """
    Stage 2: Correct with Gemini 2.5 Flash (TEXT ONLY).
    EXACT PROMPT from technical_gcp_image_pipeline-1.ipynb

    Embeds the stage-1 deck in an editing prompt, sends it to the
    "gemini-2.5-flash" model and parses the reply as JSON.

    Args:
        generated_json: the deck to correct. It is serialised with
            ``json.dumps`` and its ``.get('aliases', [])`` is used as the
            alias fallback, so a dict is expected.

    Returns:
        The corrected deck as a dict, with "aliases" copied from the input
        when the reply omits it. If the reply cannot be parsed into a JSON
        object, ``generated_json`` is returned unchanged.

    Raises:
        Exception: errors from the model call itself are printed and
            re-raised unchanged.
    """
    print(f"\nπ Correcting technical content with Gemini 2.5 Flash")
    try:
        gemini_model = genai.GenerativeModel("gemini-2.5-flash")
        correction_prompt = f"""You are an expert technical editor for universal technology training materials.
Review the following slide presentation and improve it:
{json.dumps(generated_json, indent=2)}
Your tasks:
1. Ensure slide titles are clear, concise (max 6 words) and engaging.
2. Verify that slide_content is universally applicable.
3. Check that content flows logically, is technically accurate.
4. For image_descriptions: Make them specific, actionable, and suitable for technical diagram generation.
5. Review and enhance URLs - add 2-3 additional high-quality URLs if missing.
6. Keep all word counts natural and readable.
CRITICAL INSTRUCTION:
- The field "aliases" must remain EXACTLY as provided (do not change it).
- Keep "image_description" fields exactly as they are.
- For slides without image_description, set to null.
- Retain the most educationally valuable 3 slides for images β set the rest to null.
OUTPUT REQUIREMENT:
Return ONLY the corrected JSON in the exact same schema as the input.
Do not include code fences, markdown, or extra commentary.
"""
        response = gemini_model.generate_content(correction_prompt)
        corrected_text = response.text.strip()
        # Drop a leading ```/```json fence and a trailing ``` fence, if any.
        corrected_text = re.sub(r'^\s*```(?:json)?\s*\n?', '', corrected_text, count=1)
        corrected_text = re.sub(r'\s*```\s*$', '', corrected_text, count=1)

        result = None
        try:
            result = json.loads(corrected_text)
        except json.JSONDecodeError:
            # Second attempt: parse the span from the first "{" to the
            # last "}". That span can itself be invalid JSON, so its
            # failure must also lead to the fallback below instead of
            # escaping as an exception.
            json_match = re.search(r'\{.*\}', corrected_text, re.DOTALL)
            if json_match:
                try:
                    result = json.loads(json_match.group())
                except json.JSONDecodeError:
                    result = None

        # Anything other than a JSON object (nothing parsed, or a list /
        # string / number) cannot carry the deck schema: keep the input.
        if not isinstance(result, dict):
            print(f"β οΈ Correction parsing failed - returning original")
            return generated_json

        if 'aliases' not in result:
            result['aliases'] = generated_json.get('aliases', [])
        print(f"β Correction successful")
        return result
    except Exception as e:
        print(f"β Correction failed: {type(e).__name__}: {str(e)}")
        raise
def refine_technical_content(validated_json):
    """
    Stage 3: Final refinement with Perplexity.
    EXACT PROMPT from technical_gcp_image_pipeline-1.ipynb

    Embeds the stage-2 deck in a refinement prompt, sends it to the
    "sonar-pro" model as a single user message and parses the reply as
    JSON.

    Args:
        validated_json: the deck to refine. It is serialised with
            ``json.dumps`` and its ``.get('aliases', [])`` is used as the
            alias fallback, so a dict is expected.

    Returns:
        The refined deck as a dict, with "aliases" copied from the input
        when the reply omits it. If the reply cannot be parsed into a JSON
        object, ``validated_json`` is returned unchanged.

    Raises:
        Exception: errors from the API call itself are printed and
            re-raised unchanged.
    """
    print(f"\nπ Refining technical content")
    try:
        refine_prompt = f"""You are a senior technical content specialist for universal technology training.
This slide presentation has been validated. Perform the final refinement:
{json.dumps(validated_json, indent=2)}
Your tasks:
1. Ensure image_descriptions are detailed, specific, and suitable for technical diagram generation.
2. Verify that slide content is universally applicable and consistent.
3. Confirm that all technical terms are accurate.
4. Review and refine the URLs:
- Select up to 5 of the best URLs only.
- Order them by: Authority, Relevance, Learning value, Diversity.
- Ensure all chosen URLs are authoritative and current.
5. Keep all slide content exactly the same length/style.
6. Maintain perfect JSON structure.
CRITICAL INSTRUCTION:
- The field "aliases" must remain EXACTLY as provided.
- Keep "image_description" fields for image generation.
OUTPUT REQUIREMENT:
Return ONLY the refined JSON in the exact same schema as the input.
"""
        response = perplexity_client.chat.completions.create(
            model="sonar-pro",
            messages=[{"role": "user", "content": refine_prompt}],
            temperature=0.3,
            max_tokens=4000,
            timeout=60,
        )
        refined_text = response.choices[0].message.content.strip()
        # Drop a leading ```/```json fence and a trailing ``` fence, if any.
        refined_text = re.sub(r'^\s*```(?:json)?\s*\n?', '', refined_text, count=1)
        refined_text = re.sub(r'\s*```\s*$', '', refined_text, count=1)

        result = None
        try:
            result = json.loads(refined_text)
        except json.JSONDecodeError:
            # Second attempt: parse the span from the first "{" to the
            # last "}". That span can itself be invalid JSON, so its
            # failure must also lead to the fallback below instead of
            # escaping as an exception.
            json_match = re.search(r'\{.*\}', refined_text, re.DOTALL)
            if json_match:
                try:
                    result = json.loads(json_match.group())
                except json.JSONDecodeError:
                    result = None

        # Anything other than a JSON object (nothing parsed, or a list /
        # string / number) cannot carry the deck schema: keep the input.
        if not isinstance(result, dict):
            print(f"β οΈ Refinement failed - returning validated content")
            return validated_json

        if 'aliases' not in result:
            result['aliases'] = validated_json.get('aliases', [])
        print(f"β Refinement successful")
        return result
    except Exception as e:
        print(f"β Refinement failed: {type(e).__name__}: {str(e)}")
        raise
| # ============================================================ | |
| # OPERATIONAL PIPELINE | |
| # ============================================================ | |
def generate_operational_content(topic):
    """
    Stage 1: Generate operational slides using Perplexity.
    EXACT PROMPT from operational_gcp_image_pipeline-2.ipynb

    Asks the "sonar-pro" model for a JSON slide deck about ``topic`` and
    returns the parsed reply. When the reply is not pure JSON, the widest
    ``{...}`` span found in it is parsed instead. A reply without an
    "aliases" key gets one holding the lowercased, stripped topic.

    Raises:
        ValueError: the reply contains no ``{...}`` span to parse.
        Exception: any other failure (API error, invalid JSON inside the
            extracted span, ...) is printed and re-raised unchanged.
    """
    print(f"\nπ Generating operational content for: {topic}")
    try:
        system_prompt = f"""You are a domain expert in business operations, compliance, regulatory frameworks, and enterprise management.
Task:
For the topic "{topic}", generate 9 to 10 slides as JSON.
Instructions:
- Target intermediate professionals (2+ years experience) seeking actionable, scenario-driven insights.
- Each slide should have a unique and engaging "slide_title" (maximum 6 words).
- "slide_content" must be 3-4 sentences (strictly 40-60 words), balancing regulatory requirements with operational business value.
- Emphasize both regulatory drivers AND business impact: compliance obligations, operational efficiency, risk mitigation, and competitive advantage.
- For the 3 most important slides ONLY, add "image_description" (strictly 30-40 words) describing meaningful business/operational diagrams.
- First slide: Overview positioning the topic's regulatory importance and business operational impact.
- Last slide: "Further Learning & Documentation" with specific next learning topics.
- Use clear, accessible language without basic dictionary definitions.
- Focus on practical application, regulatory compliance, and business outcomes.
- For all other slides, set image_description to null.
Additional Requirement β ALIASES FIELD:
- Generate 4-5 lowercase alternative names/synonyms for "{topic}".
- First alias MUST be the normalized lowercase form of the topic.
- Include abbreviations and terms that refer to the same concept.
Output ONLY valid JSON (no code blocks, no markdown):
{{
"topic": "{topic}",
"aliases": ["primary lowercase form", "alias2", ...],
"content": [
{{
"slide_title": "...",
"slide_content": "...",
"image_description": "..." or null
}}
],
"urls": [
{{"title": "...", "url": "https://..."}},
...
]
}}
"""
        user_message = f"Generate an intermediate-level, practical business operations presentation on: {topic}"
        chat_messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message},
        ]
        response = perplexity_client.chat.completions.create(
            model="sonar-pro",
            messages=chat_messages,
            temperature=0.5,
            max_tokens=4000,
            timeout=60,
        )
        raw_reply = response.choices[0].message.content

        try:
            deck = json.loads(raw_reply)
        except json.JSONDecodeError:
            # The reply was not pure JSON: take the span from the first
            # "{" to the last "}" and parse that instead.
            embedded = re.search(r'\{.*\}', raw_reply, re.DOTALL)
            if embedded is None:
                raise ValueError("Could not parse JSON from response")
            deck = json.loads(embedded.group())
            if 'aliases' not in deck:
                deck['aliases'] = [topic.lower().strip()]
            return deck
        else:
            # Clean parse: back-fill aliases if absent and report the
            # number of slides received.
            if 'aliases' not in deck:
                deck['aliases'] = [topic.lower().strip()]
            slide_count = len(deck.get('content', []))
            print(f"β Generation successful - {slide_count} slides")
            return deck
    except Exception as exc:
        print(f"β Generation failed: {type(exc).__name__}: {exc}")
        raise
def correct_operational_content(generated_json):
    """
    Stage 2: Correct with Gemini 2.5 PRO (stronger model for operational).
    EXACT PROMPT from operational_gcp_image_pipeline-2.ipynb

    Embeds the stage-1 deck in an editing prompt, sends it to the
    "gemini-2.5-pro" model and parses the reply as JSON.

    Args:
        generated_json: the deck to correct. It is serialised with
            ``json.dumps`` and its ``.get('aliases', [])`` is used as the
            alias fallback, so a dict is expected.

    Returns:
        The corrected deck as a dict, with "aliases" copied from the input
        when the reply omits it. If the reply cannot be parsed into a JSON
        object, ``generated_json`` is returned unchanged.

    Raises:
        Exception: errors from the model call itself are printed and
            re-raised unchanged.
    """
    print(f"\nπ Correcting operational content with Gemini 2.5 PRO")
    try:
        gemini_model = genai.GenerativeModel("gemini-2.5-pro")  # STRONGER MODEL FOR OPERATIONAL
        correction_prompt = f"""You are an expert business operations and compliance editor.
Review this business operations presentation and improve it:
{json.dumps(generated_json, indent=2)}
Your tasks:
1. Ensure slide titles are clear, concise (max 6 words), and business-focused.
2. Verify slide_content balances regulatory requirements WITH business operational value (40β60 words).
3. Strengthen regulatory references: name specific acts, frameworks, or compliance concepts.
4. For image_descriptions: Make them specific to business processes and regulatory workflows.
5. Review and improve the URLs - add 2-3 additional high-quality official URLs.
6. Maintain the intermediate professional tone.
7. Ensure logical flow: regulatory β operational β actionable insights.
CRITICAL INSTRUCTION:
- The field "aliases" must remain EXACTLY as provided.
- Keep "image_description" fields for image generation.
- For slides without image_description, set to null.
- Retain the most important 3 slides for images β set the rest to null.
OUTPUT REQUIREMENT:
Return ONLY the corrected JSON in the exact same schema as the input.
"""
        response = gemini_model.generate_content(correction_prompt)
        corrected_text = response.text.strip()
        # Drop a leading ```/```json fence and a trailing ``` fence, if any.
        corrected_text = re.sub(r'^\s*```(?:json)?\s*\n?', '', corrected_text, count=1)
        corrected_text = re.sub(r'\s*```\s*$', '', corrected_text, count=1)

        result = None
        try:
            result = json.loads(corrected_text)
        except json.JSONDecodeError:
            # Second attempt: parse the span from the first "{" to the
            # last "}". That span can itself be invalid JSON, so its
            # failure must also lead to the fallback below instead of
            # escaping as an exception.
            json_match = re.search(r'\{.*\}', corrected_text, re.DOTALL)
            if json_match:
                try:
                    result = json.loads(json_match.group())
                except json.JSONDecodeError:
                    result = None

        # Anything other than a JSON object (nothing parsed, or a list /
        # string / number) cannot carry the deck schema: keep the input.
        if not isinstance(result, dict):
            print(f"β οΈ Correction parsing failed - returning original")
            return generated_json

        if 'aliases' not in result:
            result['aliases'] = generated_json.get('aliases', [])
        print(f"β Correction successful")
        return result
    except Exception as e:
        print(f"β Correction failed: {type(e).__name__}: {str(e)}")
        raise
def refine_operational_content(validated_json):
    """
    Stage 3: Final refinement with Perplexity.
    EXACT PROMPT from operational_gcp_image_pipeline-2.ipynb

    Embeds the stage-2 deck in a refinement prompt, sends it to the
    "sonar-pro" model as a single user message and parses the reply as
    JSON.

    Args:
        validated_json: the deck to refine. It is serialised with
            ``json.dumps`` and its ``.get('aliases', [])`` is used as the
            alias fallback, so a dict is expected.

    Returns:
        The refined deck as a dict, with "aliases" copied from the input
        when the reply omits it. If the reply cannot be parsed into a JSON
        object, ``validated_json`` is returned unchanged.

    Raises:
        Exception: errors from the API call itself are printed and
            re-raised unchanged.
    """
    print(f"\nπ Refining operational content")
    try:
        refine_prompt = f"""You are a senior business operations content specialist.
This business operations presentation has been validated. Perform the final refinement:
{json.dumps(validated_json, indent=2)}
Your tasks:
1. Ensure image descriptions are specific to business workflows, compliance processes, and decision-making.
2. Verify slide content emphasizes actionable business value, regulatory relevance, and measurable outcomes.
3. Confirm terminology is accurate, consistent, and understandable to intermediate business professionals.
4. Review and refine the URLs:
- Select up to 5 of the best URLs only.
- Order by: Authority (regulatory bodies first), Relevance, Learning value, Diversity.
- Ensure all URLs are authoritative, recent, and relevant.
5. Keep all slide content exactly the same.
6. Maintain perfect JSON structure.
CRITICAL INSTRUCTION:
- The field "aliases" must remain EXACTLY as provided.
- Keep "image_description" fields for image generation.
OUTPUT REQUIREMENT:
Return ONLY the refined JSON in the exact same schema as the input.
"""
        response = perplexity_client.chat.completions.create(
            model="sonar-pro",
            messages=[{"role": "user", "content": refine_prompt}],
            temperature=0.3,
            max_tokens=4000,
            timeout=60,
        )
        refined_text = response.choices[0].message.content.strip()
        # Drop a leading ```/```json fence and a trailing ``` fence, if any.
        refined_text = re.sub(r'^\s*```(?:json)?\s*\n?', '', refined_text, count=1)
        refined_text = re.sub(r'\s*```\s*$', '', refined_text, count=1)

        result = None
        try:
            result = json.loads(refined_text)
        except json.JSONDecodeError:
            # Second attempt: parse the span from the first "{" to the
            # last "}". That span can itself be invalid JSON, so its
            # failure must also lead to the fallback below instead of
            # escaping as an exception.
            json_match = re.search(r'\{.*\}', refined_text, re.DOTALL)
            if json_match:
                try:
                    result = json.loads(json_match.group())
                except json.JSONDecodeError:
                    result = None

        # Anything other than a JSON object (nothing parsed, or a list /
        # string / number) cannot carry the deck schema: keep the input.
        if not isinstance(result, dict):
            print(f"β οΈ Refinement failed - returning validated content")
            return validated_json

        if 'aliases' not in result:
            result['aliases'] = validated_json.get('aliases', [])
        print(f"β Refinement successful")
        return result
    except Exception as e:
        print(f"β Refinement failed: {type(e).__name__}: {str(e)}")
        raise
# Printed once, when the module is imported, after every definition above
# has been executed.
print(
    "β All pipeline functions loaded (Perplexity + Gemini 2.5 Flash/Pro for text)"
)