Spaces:

MLBench
/

getscenes

Sleeping

App Files Files Community

saim1309 committed on Feb 16

Commit

41b57f8

verified ·

1 Parent(s): e730fc6

Upload 5 files

Browse files

Files changed (5) hide show

app.py +976 -0
config.py +68 -0
database.py +168 -0
scraper.py +347 -0
utils.py +128 -0

app.py ADDED Viewed

	@@ -0,0 +1,976 @@

+import gradio as gr
+import openai
+import json
+import re
+import os
+from datetime import datetime, timedelta
+import uuid
+from typing import Dict
+from config import (
+    OPENAI_API_KEY, DB_PATH, EMBED_MODEL,
+    GEN_MODEL, FAST_MODEL,
+    EMOTIONAL_KEYWORDS, ACTION_KEYWORDS, POLICY_KEYWORDS,
+    EMAIL_ONLY_KEYWORDS, DETAIL_SYNONYMS, PERSONA_INSTRUCTION
+)
+from utils import (
+    get_embedding, cosine_similarity, find_top_k_matches,
+    classify_intent, should_include_email, classify_user_type
+)
+from scraper import scrape_workshops_from_squarespace
+from database import (
+    fetch_all_embeddings,
+    fetch_row_by_id,
+    fetch_all_faq_embeddings,
+    get_session_state,
+    update_session_state,
+    log_question
+)
+# ============================================================================
+# CONFIGURATION
+# ============================================================================
+if not OPENAI_API_KEY:
+    raise ValueError("OPENAI_API_KEY not found in .env file")
+openai.api_key = OPENAI_API_KEY
+# Store session ID for the conversation
+session_id = str(uuid.uuid4())
+# Cache for workshop data and embeddings
+workshop_cache = {
+    'data': [],
+    'embeddings': [],
+    'last_updated': None,
+    'cache_duration': timedelta(hours=24)
+}
+# Load Truth Sheet (Structured Knowledge)
+STRUCTURED_KNOWLEDGE = {}
+try:
+    knowledge_path = os.path.join(os.path.dirname(__file__), "structured_knowledge.json")
+    with open(knowledge_path, "r") as f:
+        STRUCTURED_KNOWLEDGE = json.load(f)
+except Exception as e:
+    print(f"Error loading structured_knowledge.json: {e}")
+def get_structured_knowledge_snippet(preference=None):
+    """Formats structured knowledge into a text snippet for the prompt, filtering by preference if provided."""
+    if not STRUCTURED_KNOWLEDGE:
+        return ""
+    snippet = "--- STRUCTURED TRUTH SHEET (VERIFIED KNOWLEDGE) ---\n"
+    # Links
+    free_online = STRUCTURED_KNOWLEDGE.get('free_online_class', {}).get('link', '')
+    if not preference or preference.lower() == 'online':
+        snippet += f"Free Online Class: {free_online}\n"
+    kids = STRUCTURED_KNOWLEDGE.get('kids_classes', {})
+    if not preference or preference.lower() == 'online':
+        snippet += f"Kids Classes (Online): {kids.get('online_link', '')}\n"
+    if not preference or preference.lower() == 'instudio':
+        snippet += f"Kids Classes (Atlanta): {kids.get('atlanta_link', '')}\n"
+    summit = STRUCTURED_KNOWLEDGE.get('summit', {})
+    snippet += f"Summit: {summit.get('link', '')} - {summit.get('description', '')}\n"
+    # Instructors
+    instructors = STRUCTURED_KNOWLEDGE.get('instructors', [])
+    if instructors:
+        snippet += "Instructors & Roles (STRICT):\n"
+        for inst in instructors:
+            not_roles = inst.get('not_roles', [])
+            not_str = f" [NOT: {', '.join(not_roles)}]" if not_roles else ""
+            snippet += f"- {inst['name']}: {inst['role']}{not_str}\n"
+    # Paths
+    paths = STRUCTURED_KNOWLEDGE.get('paths', {})
+    if not preference or preference.lower() == 'online':
+        snippet += f"Online Path: {paths.get('online', '')}\n"
+    if not preference or preference.lower() == 'instudio':
+        snippet += f"Atlanta Path: {paths.get('atlanta', '')}\n"
+    snippet += "--------------------------------------------------\n"
+    return snippet
+# ============================================================================
+# HELPER FUNCTIONS
+# ============================================================================
+def calculate_workshop_confidence(w: Dict) -> float:
+    """Calculate confidence score of retrieved workshop data"""
+    score = 0.0
+    if w.get('title'): score += 0.3
+    if w.get('instructor_name'): score += 0.3
+    if w.get('date'): score += 0.2
+    if w.get('time'): score += 0.1
+    if w.get('source_url'): score += 0.1
+    return round(score, 2)
+# ============================================================================
+# WORKSHOP FUNCTIONS
+# ============================================================================
+def get_current_workshops():
+    """Get current workshops with caching"""
+    global workshop_cache
+    now = datetime.now()
+    # Check if cache is still valid
+    if (workshop_cache['last_updated'] and
+        now - workshop_cache['last_updated'] < workshop_cache['cache_duration'] and
+        workshop_cache['data']):
+        print("Using cached workshop data")
+        return workshop_cache['data'], workshop_cache['embeddings']
+    print("Fetching fresh workshop data...")
+    # Use robust Squarespace scraping system
+    online_workshops = scrape_workshops_from_squarespace("https://www.getscenestudios.com/online")
+    instudio_workshops = scrape_workshops_from_squarespace("https://www.getscenestudios.com/instudio")
+    all_workshops = online_workshops + instudio_workshops
+    # Data Integrity: Validate and score workshops
+    valid_workshops = []
+    total_score = 0
+    for w in all_workshops:
+        conf = calculate_workshop_confidence(w)
+        if conf >= 0.8:
+            valid_workshops.append(w)
+            total_score += conf
+        else:
+            print(f"⚠️ Rejecting weak record (Confidence: {conf}): {w.get('title', 'Unknown')}", flush=True)
+    avg_conf = total_score / len(valid_workshops) if valid_workshops else 0
+    print(f"📊 DATA INTEGRITY: Found {len(all_workshops)} total, {len(valid_workshops)} valid (Confidence >= 0.8)", flush=True)
+    print(f"📈 Retrieval Confidence: {avg_conf:.2f} (Average)", flush=True)
+    all_workshops = valid_workshops
+    if not all_workshops:
+        if workshop_cache['data']:
+            print("Scraping failed, using cached data")
+            return workshop_cache['data'], workshop_cache['embeddings']
+        else:
+            print("No workshop data available")
+            return [], []
+    # Generate embeddings for workshops
+    workshop_embeddings = []
+    for workshop in all_workshops:
+        try:
+            embedding = get_embedding(workshop['full_text'])
+            workshop_embeddings.append(embedding)
+        except Exception as e:
+            print(f"Error generating embedding for workshop: {e}")
+            workshop_embeddings.append([0] * 1536)
+    # Update cache
+    workshop_cache['data'] = all_workshops
+    workshop_cache['embeddings'] = workshop_embeddings
+    workshop_cache['last_updated'] = now
+    print(f"Cached {len(all_workshops)} workshops")
+    return all_workshops, workshop_embeddings
+def find_top_workshops(user_embedding, k=3):
+    """Find top matching workshops using real-time data"""
+    workshops, workshop_embeddings = get_current_workshops()
+    if not workshops:
+        return []
+    scored = []
+    for i, (workshop, emb) in enumerate(zip(workshops, workshop_embeddings)):
+        try:
+            score = cosine_similarity(user_embedding, emb)
+            scored.append((score, i, workshop['full_text'], workshop))
+        except Exception as e:
+            print(f"Error calculating similarity: {e}")
+            continue
+    scored.sort(reverse=True)
+    return scored[:k]
+# ============================================================================
+# PROMPT BUILDING FUNCTIONS
+# ============================================================================
+def generate_enriched_links(row):
+    base_url = row.get("youtube_url")
+    guest_name = row.get("guest_name", "")
+    highlights = json.loads(row.get("highlight_json", "[]"))
+    summary = highlights[0]["summary"] if highlights else ""
+    # Truncate summary to first sentence only
+    if summary:
+        first_sentence = summary.split('.')[0] + '.'
+        if len(first_sentence) > 120:
+            short_summary = first_sentence[:117] + "..."
+        else:
+            short_summary = first_sentence
+    else:
+        short_summary = "Industry insights for actors"
+    markdown = f"🎧 [Watch {guest_name}'s episode here]({base_url}) - {short_summary}"
+    return [markdown]
+def build_enhanced_prompt(user_question, context_results, top_workshops, user_preference=None, user_type='unknown', enriched_podcast_links=None, wants_details=False, current_topic=None, mode="Mode B", is_low_confidence=False):
+    """Builds the system prompt with strict formatting rules."""
+    # Dynamic Links from Structured Knowledge
+    free_class_url = STRUCTURED_KNOWLEDGE.get('free_online_class', {}).get('link', "https://www.getscenestudios.com/online")
+    if user_preference and user_preference.lower() == 'instudio':
+        free_class_url = STRUCTURED_KNOWLEDGE.get('paths', {}).get('atlanta', "https://www.getscenestudios.com/instudio")
+    atlanta_link = STRUCTURED_KNOWLEDGE.get('paths', {}).get('atlanta', "https://www.getscenestudios.com/instudio")
+    online_link = STRUCTURED_KNOWLEDGE.get('paths', {}).get('online', "https://www.getscenestudios.com/online")
+    truth_sheet_snippet = get_structured_knowledge_snippet(preference=user_preference)
+    # Placeholder removed to ensure strict usage of retrieved data
+    single_podcast = ""
+    # helper for clean links
+    def format_workshop(w):
+        # Strict validation: Title, Instructor, Date must be present and non-empty
+        if not w.get('title') or not w.get('instructor_name') or not w.get('date'):
+            return None
+        # Strict formatting: [{Title}]({Link}) with {Instructor} ({Format}) on {Date}
+        link = "https://www.getscenestudios.com/instudio" if "/instudio" in w.get('source_url', '') else "https://www.getscenestudios.com/online"
+        # User Preference Filtering
+        w_type = "Online" if "online" in w.get('source_url', '') else "In-Studio"
+        if user_preference:
+            if user_preference.lower() != w_type.lower():
+                return None
+        # Calculate confidence
+        confidence = calculate_workshop_confidence(w)
+        if confidence < 0.70:
+            return None
+        return f"- [{w['title']}]({link}) with {w['instructor_name']} ({w_type}) on {w['date']} at {w.get('time', '')}"
+    # Prepare workshop list (Top 3 max)
+    workshop_lines = []
+    if top_workshops:
+        for _, _, _, w_data in top_workshops[:5]:  # Check top 5, take top 3 valid
+            formatted = format_workshop(w_data)
+            if formatted:
+                workshop_lines.append(formatted)
+    workshop_text = ""
+    if workshop_lines:
+        workshop_text = "\n".join(workshop_lines[:3])
+    else:
+        # Improved fallback to avoid generic/placeholder-like feeling
+        label = f"{user_preference.capitalize()} " if user_preference else ""
+        link = online_link if user_preference == 'online' else atlanta_link if user_preference == 'instudio' else online_link
+        # Mandatory Hyperlink Enforcement
+        workshop_text = f"We are constantly updating our schedule! You can view and [register for upcoming {label}workshops here]({link})."
+    # Handle missing podcast data strictly
+    if not enriched_podcast_links:
+        single_podcast = "Our latest industry insights are available on YouTube: https://www.youtube.com/@GetSceneStudios"
+    else:
+        single_podcast = enriched_podcast_links[0]
+    # --- EMOTIONAL / SUPPORT MODE CHECK ---
+    is_emotional = detect_response_type(user_question) == "support"
+    if is_emotional:
+        prompt = f"""{PERSONA_INSTRUCTION}
+You are acting in SUPPORT MODE.
+CRITICAL INSTRUCTIONS:
+1. ACKNOWLEDGE their feelings first (e.g., "I hear how frustrating it is to feel stuck...").
+2. Provide SUPPORTIVE language (2-3 sentences max).
+3. Offer EXACTLY ONE gentle follow-up resource: either the podcast OR the free class.
+4. DO NOT suggest paid workshops or upsell in this response.
+5. KEEP IT BRIEF (≤150 words).
+USER'S QUESTION: {user_question}
+REQUIRED RESPONSE FORMAT:
+[Your empathetic, supportive acknowledgment]
+Here's a free resource that might help you move forward:
+[Pick ONE: {single_podcast} OR Free Class at {free_class_url}]
+Questions? Contact info@getscenestudios.com"""
+        return prompt
+    # --- STANDARD LOGIC FOR CONTEXT SNIPPET ---
+    question_lower = user_question.lower()
+    context_snippet = ""
+    # Priority 1: Direct Keywords in current question
+    detected_topic = None
+    if any(word in question_lower for word in ['agent', 'representation', 'rep', 'manager']):
+        detected_topic = 'agent'
+    elif any(word in question_lower for word in ['beginner', 'new', 'start', 'beginning']):
+        detected_topic = 'beginner'
+    elif any(word in question_lower for word in ['callback', 'audition', 'tape', 'self-tape', 'booking']):
+        detected_topic = 'audition'
+    elif any(word in question_lower for word in ['mentorship', 'coaching']):
+        detected_topic = 'mentorship'
+    elif any(word in question_lower for word in ['price', 'cost', 'how much']):
+        detected_topic = 'pricing'
+    elif any(word in question_lower for word in ['class', 'workshop', 'training', 'learn']):
+        detected_topic = 'classes'
+    elif any(word in question_lower for word in ['membership', 'gsp', 'plus']):
+        detected_topic = 'membership'
+    # Priority 2: Fallback to session context if current question is ambiguous
+    if not detected_topic and current_topic:
+        topic_map = {
+            'agent_seeking': 'agent',
+            'beginner': 'beginner',
+            'audition_help': 'audition',
+            'mentorship': 'mentorship',
+            'pricing': 'pricing',
+            'classes': 'classes',
+            'membership': 'membership'
+        }
+        detected_topic = topic_map.get(current_topic)
+    # Assign snippet based on topic
+    if detected_topic == 'agent':
+        context_snippet = "Get Scene Studios has helped 1000+ actors land representation. Total Agent Prep offers live practice with working agents (age 16+, limited to 12 actors)."
+    elif detected_topic == 'beginner':
+        context_snippet = "Get Scene Studios specializes in getting actors audition-ready fast with camera technique and professional self-tape skills."
+    elif detected_topic == 'audition':
+        context_snippet = "Get Scene offers Crush the Callback (Zoom simulation) and Perfect Submission (self-tape mastery) for actors refining their technique."
+    elif detected_topic == 'mentorship':
+        context_snippet = "Working Actor Mentorship is a 6-month program ($3,000) with structured feedback and industry access."
+    elif detected_topic == 'pricing':
+        context_snippet = "Get Scene Studios pricing varies by program. Most workshops cap at 12-14 actors for personalized feedback."
+    elif detected_topic == 'classes':
+         link = online_link if user_preference == 'online' else atlanta_link
+         context_snippet = f"Get Scene Studios offers world-class {user_preference or ''} acting workshops. Our sessions focus on camera technique and industry readiness. Full details at {link}."
+    elif detected_topic == 'membership':
+         context_snippet = "Get Scene Plus (GSP) is our membership program that provides ongoing access to industry pros and audition insights."
+    elif 'summit' in question_lower:
+         context_snippet = "The Get Scene Summit is a premier special event featuring massive line-ups of agents, managers, and casting directors. It is NOT a recursive workshop."
+    else:
+        context_snippet = "Get Scene Studios (founded by Jesse Malinowski) offers training for TV/film actors at all levels."
+    preference_instruction = ""
+    if not user_preference:
+        preference_instruction = """
+IMPORTANT: We need to know if the user prefers "Online" or "In-Studio" workshops.
+If their question is broad (e.g., "starting acting", "kids classes", "workshops", "training", "classes") and they haven't specified a format, you MUST START your response with this exact question: "Are you looking for online training or in-studio in Atlanta?"
+NO PREFIXES, NO "WARM" TRANSITIONS, NO PARAPHRASING.
+FEW-SHOT EXAMPLES:
+User: "I want to start acting"
+Response: "Are you looking for online training or in-studio in Atlanta? That's a fantastic decision! With Get Scene Studios..."
+User: "Do you have kids classes?"
+Response: "Are you looking for online training or in-studio in Atlanta? Absolutely, we offer kids classes in both formats..."
+"""
+    else:
+         preference_instruction = f"""
+USER PREFERENCE KNOWN: {user_preference.upper()}
+1. DO NOT ask "Online or In-Studio" again.
+2. Ensure your recommendations align with {user_preference.upper()} where possible.
+"""
+    BUSINESS_RULES_INSTRUCTION = f"""
+TOP-PRIORITY BUSINESS RULES (NO EXCEPTIONS):
+1. **NO AUDITING**: Workshops can NEVER be audited. Do not reason about this. Tell the user "We do not allow auditing for our workshops" and immediately redirect them to the Free Online Class.
+2. **FREE CLASS FIRST**: The Free Online Class is the MANDATORY first step for ALL new users. If a user is "starting out", "new to acting", or asking "how to begin", you MUST route them to the Free Online Class link below as their primary next step.
+3. **NO IMMEDIATE PAID RECOMMENDATIONS**: For new or unclear users, do NOT recommend specific paid workshops yet. Focus entirely on the Free Online Class as the entry point.
+4. **KIDS CLASSES**: We offer kids classes both Online and in Atlanta (In-Studio).
+5. **SUMMIT**: The Summit is a special event offering, NOT a regular workshop.
+{"6. **STRICT LINK FILTERING**: User prefers " + user_preference.upper() + ". You MUST ONLY provide links for " + user_preference.upper() + " training. OMIT any " + ("In-Studio" if user_preference.lower() == 'online' else "Online") + " links entirely." if user_preference else ""}
+7. **ROLE INTEGRITY (STRICT)**:
+   - **THE TRUTH SHEET IS THE ABSOLUTE AND FINAL AUTHORITY.** It overrides ANY information found in podcast descriptions, workshop titles, or suggested by the user.
+   - ONLY use the roles explicitly defined in the Truth Sheet.
+   - **NEVER infer a role** from the context of a workshop or podcast.
+   - If someone is teaching a class, do NOT assume they are an "Instructor" unless the Truth Sheet says so.
+   - If someone is labeled as an "Agent", do NOT call them an "Instructor" or "Mentor" unless explicitly listed as such in the TRUTH SHEET.
+   - Pay attention to the "[NOT: ...]" list for each person in the Truth Sheet. For example, if someone is listed as "[NOT: Instructor]", NEVER call them an instructor, even if they are described as one in a podcast or workshop description.
+   - **NEVER** guess or invent a role for anyone.
+"""
+    # Brevity & Cognitive Load: Direct instructions based on user intent
+    detail_instruction = "Answer the user's question briefly (2-3 sentences max, ≤150 words total)."
+    if wants_details:
+        target = f" regarding {detected_topic or 'the current recommendations'}"
+        detail_instruction = f"Provide a detailed and thorough explanation for the user's request{target}. Focus on being helpful and providing deep value as a mentor."
+    # Email contact line is conditional
+    email_contact = ""
+    if should_include_email(user_question):
+        email_contact = "\n \nQuestions? Contact info@getscenestudios.com"
+    # Context inclusion
+    retrieved_info = ""
+    if context_results:
+        retrieved_info = f"\nRELEVANT INFORMATION FROM KNOWLEDGE BASE:\n{context_results}\n"
+    is_beginner = (detected_topic == 'beginner')
+    beginner_enforcement = ""
+    if is_beginner:
+        beginner_enforcement = """
+CRITICAL: The user is a BEGINNER. You MUST prioritize the Free Online Class above all else.
+1. Do NOT recommend specific paid workshops in your numbered list.
+2. Instead, provide the Free Online Class as your primary recommendation.
+3. Your numbered list should be:
+   1. Free Online Class (The mandatory first step)
+   2. The Get Scene Podcast (For industry mindset)
+   3. [Choose a very general resource or a 1:1 consultation if available, but NOT a specific workshop]
+"""
+    user_type_instruction = ""
+    if user_type == 'new_actor':
+        user_type_instruction = "USER TYPE: NEW ACTOR. Focus heavily on foundation, the Free Online Class, and beginner-friendly mindset. Avoid advanced industry jargon."
+    elif user_type == 'experienced_actor':
+        user_type_instruction = "USER TYPE: EXPERIENCED ACTOR. Focus on advanced technique, Agent Prep, mentorship, and industry networking. Use professional terminology."
+    elif user_type == 'parent':
+        user_type_instruction = "USER TYPE: PARENT. Focus on kids/teen programs, safety, youth training paths, and parent-specific concerns."
+    elif user_type == 'current_student':
+        user_type_instruction = "USER TYPE: EXISTING STUDENT. Focus on GSP membership benefits, advanced mentorships (WAM), and specialized recurring workshops."
+    if mode == "Mode A":
+        # Recommendation Mode: Existing checklist applies
+        prompt = f"""{PERSONA_INSTRUCTION}
+{truth_sheet_snippet}
+{BUSINESS_RULES_INSTRUCTION}
+{user_type_instruction}
+{beginner_enforcement}
+{context_snippet}{retrieved_info}
+CRITICAL INSTRUCTIONS (RECOMMENDATION MODE):
+- {detail_instruction}
+- Use natural, human transitions between your answer and the recommendations.
+- For each recommendation, add a tiny bit of "mentor advice" on why it helps.
+- Use ONLY the provided links - do not invent recommendations.
+- **MANDATORY: Use direct hyperlinks.** For ANY mention of signing up, classes, kids programs, the Summit, or the free class, you MUST include the direct [Title](Link) format.
+- **NEVER say "check our website"** or "visit the link below". Embed the link directly into the relevant part of your mentor advice.
+- Focus on clean, readable formatting.{preference_instruction}
+CRITICAL ROLE GUARD (FINAL AUTHORITY):
+- Corey Lawson: Instructor/Actor [NOT an Agent]
+- Jacob Lawson: Agent/Owner [NOT an Instructor]
+- Jesse Malinowski: Founder/Mentor [NOT an Agent]
+- Alex White: Agent [NOT an Instructor/Mentor]
+- THE TRUTH SHEET IS THE ABSOLUTE AUTHORITY. It overrides ALL other info.
+- NEVER call Corey Lawson an agent. They are brothers with different roles.
+USER'S QUESTION: {user_question}
+REQUIRED RESPONSE FORMAT (STRICT):
+[Helpful, mentor-like answer]
+Here's your path forward:
+1. Free Online Class (Mandatory First Step): {free_class_url}
+2. Recommended Podcast Episode (For Industry Mindset):
+{single_podcast}
+3. Recommended Workshop/Next Step:
+{workshop_text}{email_contact}
+CRITICAL: YOU MUST USE THE ABOVE "1. 2. 3." STRUCTURE EXACTLY. DO NOT RENAME THE STEPS. DO NOT SKIP THE PODCAST.
+"""
+    else:
+        # Front Desk Mode: More conversational, direct answers, recommendations are optional but encouraged
+        prompt = f"""{PERSONA_INSTRUCTION}
+{truth_sheet_snippet}
+{BUSINESS_RULES_INSTRUCTION}
+{context_snippet}{retrieved_info}
+CRITICAL INSTRUCTIONS (FRONT DESK MODE):
+- You are acting as the warm and helpful Front Desk Mentor.
+- **MANDATORY: Ask a routing question AT THE BEGINNING** of your response (e.g., "Are you looking to start your journey or refine existing skills?").
+- Answer the user's question directly using the provided information but keep it punchy—**no essays**.
+- **MANDATORY: Provide direct hyperlinks** for ANY mention of registration, classes, kids programs, the Summit, or more information. Use EXACTLY these links as relevant:
+    - Free Online Class: [{free_class_url}]({free_class_url})
+    - Recommended for you: {single_podcast}
+    - Upcoming Workshops: {workshop_text}
+    - Southeast Actor Summit: [Southeast Actor Summit Registration](https://www.getscenestudios.com/southeast-actor-summit)
+- **NEVER say "go to the website"** or "check our site". Always provide the specific hyperlink directly in your answer.
+- **NEVER guess** or invent information. If it's not in the context, guide the user to clarify.
+- **MANDATORY: Guide the user to the next step** at the end of your response (e.g., "A great next step for you would be to sign up for our free class").
+- {detail_instruction}
+- Focus on being a helpful guide.{preference_instruction}
+{"MANDATORY: We don't have a high-confidence match for this specific question. Provide the CLOSEST possible link from our verified knowledge above for their general query." if is_low_confidence else ""}
+CRITICAL ROLE GUARD (FINAL AUTHORITY):
+- Corey Lawson: Instructor/Actor [NOT an Agent]
+- Jacob Lawson: Agent/Owner [NOT an Instructor]
+- Jesse Malinowski: Founder/Mentor [NOT an Agent]
+- Alex White: Agent [NOT an Instructor/Mentor]
+- THE TRUTH SHEET IS THE ABSOLUTE AUTHORITY. It overrides ALL other info.
+- NEVER call Corey Lawson an agent. They are brothers with different roles.
+USER'S QUESTION: {user_question}
+REQUIRED RESPONSE FORMAT:
+[Routing Question]
+[Helpful, punchy response with links]
+[Next step guidance]{email_contact}"""
+    return prompt
+# ============================================================================
+# DETECTION FUNCTIONS
+# ============================================================================
+def detect_question_category(question):
+    """Categorize user questions for better context injection"""
+    question_lower = question.lower()
+    categories = {
+        'agent_seeking': ['agent', 'representation', 'rep', 'manager', 'get an agent'],
+        'beginner': ['beginner', 'new', 'start', 'beginning', 'first time', 'never acted'],
+        'audition_help': ['audition', 'callback', 'tape', 'self-tape', 'submission'],
+        'mentorship': ['mentorship', 'coaching', 'intensive', 'mentor', 'one-on-one'],
+        'pricing': ['price', 'cost', 'pricing', '$', 'money', 'payment', 'fee'],
+        'classes': ['class', 'workshop', 'training', 'course', 'learn'],
+        'membership': ['membership', 'join', 'member', 'gsp', 'plus'],
+        'technical': ['self-tape', 'equipment', 'lighting', 'editing', 'camera']
+    }
+    detected = []
+    for category, keywords in categories.items():
+        if any(keyword in question_lower for keyword in keywords):
+            detected.append(category)
+    return detected
+def detect_response_type(question):
+    """Detect if question is emotional/support vs action/results oriented"""
+    question_lower = question.lower()
+    emotional_count = sum(1 for word in EMOTIONAL_KEYWORDS if word in question_lower)
+    action_count = sum(1 for word in ACTION_KEYWORDS if word in question_lower)
+    if emotional_count > 0 and emotional_count >= action_count:
+        return "support"
+    return "standard"
+def detect_policy_issue(question):
+    """Detect if question violates hard policy rules (refunds, attendance, etc.) using word boundaries"""
+    question_lower = question.lower()
+    for word in POLICY_KEYWORDS:
+        # Use regex word boundaries to prevent substring matches (e.g., 'submission' matching 'miss')
+        pattern = rf'\b{re.escape(word)}\b'
+        if re.search(pattern, question_lower):
+            return True
+    return False
+def detect_preference(question):
+    """Detect if user is stating a preference"""
+    q_lower = question.lower()
+    if 'online' in q_lower and 'studio' not in q_lower:
+        return 'online'
+    if ('studio' in q_lower or 'person' in q_lower or 'atlanta' in q_lower) and 'online' not in q_lower:
+        return 'instudio'
+    return None
+def get_contextual_business_info(categories):
+    """Return relevant business information based on detected question categories"""
+    context_map = {
+        'agent_seeking': {
+            'programs': ['Total Agent Prep', 'Working Actor Mentorship'],
+            'key_info': 'Live pitch practice with real agents, Actors Access optimization',
+            'journey': 'Total Agent Prep → GSP → Mentorship for sustained progress'
+        },
+        'beginner': {
+            'programs': ['Free Classes', 'Get Scene 360', 'Get Scene Plus'],
+            'key_info': 'Start with holistic foundation, build consistency',
+            'journey': 'Free class → Get Scene 360 → GSP membership'
+        },
+        'audition_help': {
+            'programs': ['Perfect Submission', 'Crush the Callback', 'Audition Insight'],
+            'key_info': 'Self-tape mastery, callback simulation, pro feedback',
+            'journey': 'Perfect Submission → GSP for ongoing Audition Insight'
+        },
+        'mentorship': {
+            'programs': ['Working Actor Mentorship'],
+            'key_info': '6-month intensive with structured feedback and accountability',
+            'journey': 'Ready for commitment → WAM → Advanced workshops'
+        }
+    }
+    relevant_info = {}
+    for category in categories:
+        if category in context_map:
+            relevant_info[category] = context_map[category]
+    return relevant_info
+# ============================================================================
+# MAIN CHATBOT LOGIC
+# ============================================================================
+def update_knowledge_from_question(session_id: str, question: str):
+    """Extract attributes and update knowledge dictionary"""
+    updates = {}
+    # Extract Format
+    pref = detect_preference(question)
+    if pref:
+        updates['format'] = pref
+    # Extract Topic
+    cats = detect_question_category(question)
+    if cats:
+        # Prioritize specific topics over generic ones
+        priority_topics = ['agent_seeking', 'beginner', 'audition_help', 'mentorship', 'pricing']
+        for topic in priority_topics:
+            if topic in cats:
+                updates['topic'] = topic
+                break
+        if 'topic' not in updates and cats:
+             updates['topic'] = cats[0]
+    if updates:
+        update_session_state(session_id, knowledge_update=updates, increment_count=False)
+        return updates
+    return {}
+def process_question(question: str, current_session_id: str):
+    """Main function to process user questions - replaces Flask /ask endpoint"""
+    try:
+        if not question:
+            return "Question is required"
+        # 0. INTENT CLASSIFICATION
+        activated_mode = "Mode B"  # Default safe value
+        # 1. HARD POLICY CHECK (Internal critical issues only)
+        if detect_policy_issue(question) and should_include_email(question):
+            log_question(
+                question=question,
+                session_id=current_session_id,
+                category="policy_violation",
+                answer="Please email info@getscenestudios.com.",
+                detected_mode="Mode B",
+                routing_question=None,
+                rule_triggered="policy_email_only",
+                link_provided=False
+            )
+            return "Please email info@getscenestudios.com."
+        # 2. Handle Session & Knowledge State
+        update_knowledge_from_question(current_session_id, question)
+        session_state = get_session_state(current_session_id)
+        try:
+            knowledge = json.loads(session_state.get('knowledge_context', '{}'))
+        except:
+            knowledge = {}
+        user_type = knowledge.get('user_type', 'unknown')
+        # Update User Type if unknown or enough turn count
+        if user_type == 'unknown' or session_state.get('msg_count', 0) % 3 == 0:
+            new_user_type = classify_user_type(question)
+            if new_user_type != 'unknown':
+                user_type = new_user_type
+                knowledge['user_type'] = user_type
+                update_session_state(current_session_id, knowledge_update=knowledge, increment_count=False)
+        user_preference = knowledge.get('format')
+        current_topic = knowledge.get('topic')
+        if not user_preference:
+            user_preference = session_state.get('preference')
+        update_session_state(current_session_id, increment_count=True)
+        # 3. ROUTING: Use classification LLM to decide Mode A or Mode B
+        activated_mode = classify_intent(question)
+        last_mode = knowledge.get('last_mode')
+        if session_state.get('clarification_count', 0) > 0 and last_mode:
+            if len(question.split()) < 5 or any(k in question.lower() for k in ['yes', 'no', 'sure', 'not sure', 'dont know']):
+                activated_mode = last_mode
+        # Store mode for next turn's potentially sticky logic
+        knowledge['last_mode'] = activated_mode
+        print(f"DEBUG: [{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Activated Mode for session {current_session_id}: {activated_mode}")
+        update_session_state(current_session_id, knowledge_update=knowledge, increment_count=False)
+        # 4. SEMANTIC SEARCH: Create embedding of user question
+        user_embedding = get_embedding(question)
+        # Check FAQ embeddings first
+        faq_data = fetch_all_faq_embeddings()
+        top_faqs = []
+        for entry_id, question_text, answer_text, emb in faq_data:
+            score = cosine_similarity(user_embedding, emb)
+            top_faqs.append((score, entry_id, question_text, answer_text))
+            top_faqs.sort(reverse=True)
+        faq_threshold = 0.85
+        ambiguous_threshold = 0.65
+        is_low_confidence = False  # Default safe initialization
+        context_results = None
+        if top_faqs and top_faqs[0][0] >= faq_threshold:
+            best_score, faq_id, question_text, answer_text = top_faqs[0]
+            print(f"DEBUG: Processing FAQ match through LLM and Truth Sheet rules...")
+            context_results = answer_text
+        elif activated_mode == "Mode A":
+            # Mode A: Any score < 0.85 triggers Clarification -> Email
+            clarification_count = session_state.get('clarification_count', 0)
+            if clarification_count == 0:
+                 update_session_state(current_session_id, increment_clarification=True, increment_count=False)
+                 return "I want to make sure I give you the best advice. Are you looking for classes in [Atlanta](https://www.getscenestudios.com/instudio), [Online](https://www.getscenestudios.com/online), or something else like getting an agent? You can also start right now with our [Free Online Class](https://www.getscenestudios.com/online)!"
+            else:
+                 update_session_state(current_session_id, reset_clarification=True)
+                 return "I'm still not quite sure, and I want to make sure you get the right answer! Please email our team at info@getscenestudios.com and we'll help you directly. In the meantime, you can explore or [register for our Online Path](https://www.getscenestudios.com/online) or [In-Studio classes in Atlanta](https://www.getscenestudios.com/instudio)."
+        elif top_faqs and top_faqs[0][0] >= ambiguous_threshold:
+            # Mode B: Ambiguous Score (0.65 - 0.85) -> Ask "Did you mean?"
+            update_session_state(current_session_id, increment_clarification=True, increment_count=False)
+            best_match_q = top_faqs[0][2]
+            return f"Did you mean: {best_match_q}?"
+        else:
+            # 5. HALLUCINATION GUARD: Check if query is acting-related before blocking
+            categories = detect_question_category(question)
+            has_session_context = (current_topic is not None) or (user_preference is not None)
+            FOLLOWUP_KEYWORDS = ['yes', 'no', 'sure', 'okay', 'thanks', 'thank you', 'please', 'go ahead', 'continue', 'more']
+            ACTING_KEYWORDS = ['class', 'workshop', 'coaching', 'studio', 'acting', 'online', 'person', 'atlanta', 'training', 'prefer', 'preference', 'format', 'recommendation', 'online class', 'online workshop','instudio class','instudio workshop', 'actor', 'scene', 'audition', 'theatre', 'film', 'tv', 'commercial', 'agent', 'rep', 'manager', 'instructor', 'role', 'auditing', 'audit', 'representation', 'summit', 'sign up', 'sign-up', 'register', 'enroll', 'schedule', 'cancel', 'reschedule', 'how do i']
+            is_acting_related = (
+                len(categories) > 0 or
+                detect_response_type(question) == "support" or
+                any(re.search(rf'\b{re.escape(k)}\b', question.lower()) for k in ACTION_KEYWORDS) or
+                any(re.search(rf'\b{re.escape(k)}\b', question.lower()) for k in DETAIL_SYNONYMS) or
+                any(re.search(rf'\b{re.escape(k)}\b', question.lower()) for k in ACTING_KEYWORDS) or
+                (has_session_context and any(re.search(rf'\b{re.escape(k)}\b', question.lower().strip('.!')) for k in FOLLOWUP_KEYWORDS)) or
+                (session_state.get('clarification_count', 0) > 0 and len(question.split()) < 5)  # Only allow short answers to bypass
+            )
+            if not is_acting_related:
+                return "I'm not exactly sure about that. Could you clarify your question?"
+        # Flag for Mode B Low Confidence
+        is_low_confidence = (activated_mode == "Mode B" and not context_results)
+        # 6. LLM PATH: No high-confidence FAQ match, or Mode B FAQ formatting
+        update_session_state(current_session_id, reset_clarification=True, increment_count=False)
+        # RAG: Fetch relevant workshops and podcasts
+        podcast_data = fetch_all_embeddings("podcast_episodes")
+        top_workshops = find_top_workshops(user_embedding, k=3)
+        top_podcasts = find_top_k_matches(user_embedding, podcast_data, k=3)
+        enriched_podcast_links = []
+        for _, podcast_id, _ in top_podcasts:
+            row = fetch_row_by_id("podcast_episodes", podcast_id)
+            enriched_podcast_links.extend(generate_enriched_links(row))
+        if not enriched_podcast_links:
+            fallback = fetch_row_by_id("podcast_episodes", podcast_data[0][0])
+            enriched_podcast_links = generate_enriched_links(fallback)
+        # Brevity & Detail Detection
+        wants_details = any(syn in question.lower() for syn in DETAIL_SYNONYMS)
+        # Use enhanced prompt building
+        final_prompt = build_enhanced_prompt(
+            question,
+            context_results,
+            top_workshops,
+            user_preference=user_preference,
+            user_type=user_type,
+            enriched_podcast_links=enriched_podcast_links,
+            wants_details=wants_details,
+            current_topic=current_topic,
+            mode=activated_mode,
+            is_low_confidence=is_low_confidence
+        )
+        # Invoke LLM
+        print(f"DEBUG FINAL PROMPT:\n{final_prompt}\n--- END PROMPT ---")
+        response = openai.chat.completions.create(
+            model=GEN_MODEL,
+            messages=[
+                {"role": "system", "content": final_prompt},
+                {"role": "user", "content": question}
+            ]
+        )
+        answer_text = response.choices[0].message.content.strip()
+        # 7. ROUTING QUESTION ENFORCEMENT (Python-level Fallback)
+        routing_q = "Are you looking for online training or in-studio in Atlanta?"
+        broad_triggers = ['start acting', 'beginner', 'new actor', 'kids class', 'workshops', 'training', 'classes']
+        is_broad = any(t in question.lower() for t in broad_triggers)
+        if is_broad and not user_preference:
+            if not answer_text.lower().startswith(routing_q.lower()):
+                if routing_q.lower() in answer_text.lower():
+                    answer_text = re.sub(rf'{re.escape(routing_q)}[?!.]*', '', answer_text, flags=re.IGNORECASE).strip()
+                answer_text = f"{routing_q} {answer_text}"
+        # Detect if routing question was asked
+        routing_q_asked = routing_q if (is_broad and not user_preference and routing_q in answer_text) else None
+        # Detect if links were provided
+        has_links = bool(re.search(r'\[.*?\]\(http', answer_text))
+        # Log question with comprehensive metadata
+        log_question(
+            question=question,
+            session_id=current_session_id,
+            category="llm_generated",
+            answer=answer_text,
+            detected_mode=activated_mode,
+            routing_question=routing_q_asked,
+            rule_triggered=None,
+            link_provided=has_links
+        )
+        return answer_text
+    except Exception as e:
+        import traceback
+        print(f"❌ CRITICAL ERROR in process_question: {e}")
+        traceback.print_exc()
+        return f"I apologize, but I encountered an error processing your question. Please try again or email info@getscenestudios.com for assistance."
+# ============================================================================
+# GRADIO INTERFACE
+# ============================================================================
+def chat_with_bot(message, history):
+    """
+    Process message directly without Flask API
+    Args:
+        message: User's current message
+        history: Chat history (list of message dictionaries)
+    Returns:
+        Updated history with new exchange
+    """
+    global session_id
+    if not message.strip():
+        return history
+    try:
+        # Process question directly
+        bot_reply = process_question(message, session_id)
+    except Exception as e:
+        bot_reply = f"❌ Error: {str(e)}"
+    # Append to history in Gradio 6.0 format
+    history.append({"role": "user", "content": message})
+    history.append({"role": "assistant", "content": bot_reply})
+    return history
+def reset_session():
+    """Reset session ID for new conversation"""
+    global session_id
+    session_id = str(uuid.uuid4())
+    return [] #, f"🔄 New session started: {session_id[:8]}..."
+# Create Gradio interface
+with gr.Blocks(title="Get Scene Studios Chatbot") as demo:
+    gr.Markdown(
+        """
+        # 🎬 Get Scene Studios AI Chatbot
+        Ask questions about acting classes, workshops and more!
+        """
+    )
+    # Chatbot interface
+    chatbot = gr.Chatbot(
+        label="Conversation",
+        height=500
+    )
+    # Input area
+    with gr.Row():
+        msg = gr.Textbox(
+            label="Your Message",
+            lines=2,
+            scale=4
+        )
+        submit_btn = gr.Button("Send 📤", scale=1, variant="primary")
+    # Action buttons
+    with gr.Row():
+        clear_btn = gr.Button("Clear Chat 🗑️", scale=1)
+        reset_btn = gr.Button("New Session 🔄", scale=1)
+    # Event handlers
+    submit_btn.click(
+        fn=chat_with_bot,
+        inputs=[msg, chatbot],
+        outputs=[chatbot]
+    ).then(
+        fn=lambda: "",
+        inputs=None,
+        outputs=[msg]
+    )
+    msg.submit(
+        fn=chat_with_bot,
+        inputs=[msg, chatbot],
+        outputs=[chatbot]
+    ).then(
+        fn=lambda: "",
+        inputs=None,
+        outputs=[msg]
+    )
+    clear_btn.click(
+        fn=lambda: [],
+        inputs=None,
+        outputs=[chatbot]
+    )
+    reset_btn.click(
+        fn=reset_session,
+        inputs=None,
+        outputs=[chatbot]
+    )
+# Launch the app
+if __name__ == "__main__":
+    print("\n" + "="*60)
+    print("🎬 Get Scene Studios Chatbot")
+    print("="*60)
+    print("\n✅ No Flask API needed - all processing is done directly!")
+    print("🌐 Gradio interface will open in your browser")
+    print("="*60 + "\n")
+    demo.launch()

config.py ADDED Viewed

	@@ -0,0 +1,68 @@

+import os
+from datetime import timedelta
+from dotenv import load_dotenv
+load_dotenv()
+# API Keys
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+# Database
+DB_PATH = "getscene_ai.sqlite"
+# Models
+EMBED_MODEL = "text-embedding-3-small"
+GEN_MODEL = "gpt-4o"
+FAST_MODEL = "gpt-4o-mini"
+# Caching
+CACHE_DURATION = timedelta(hours=24)
+# Keyword Lists
+EMOTIONAL_KEYWORDS = [
+    'stuck', 'frustrated', 'discouraged', 'overwhelmed', 'scared',
+    'nervous', 'anxious', 'worried', 'fear', 'doubt', 'confidence',
+    'insecure', 'lost', 'confused', 'struggling', 'hard time',
+    'giving up', 'burnout', 'rejection', 'failed', 'can\'t',
+    'feeling', 'feel', 'emotional', 'depressed', 'sad', 'unmotivated',
+    'hopeless', 'stressed', 'pressure', 'imposter'
+]
+ACTION_KEYWORDS = [
+    'get an agent', 'find agent', 'need agent', 'want agent', 'sign with agent',
+    'more auditions', 'book', 'booking', 'callbacks', 'improve',
+    'better', 'self-tape', 'materials', 'headshots', 'reel',
+    'network', 'connections', 'industry', 'career', 'strategy',
+    'agent prep', 'total agent prep', 'workshop', 'class', 'training',
+    'results', 'success', 'grow', 'advance', 'level up'
+]
+POLICY_KEYWORDS = [
+    'refund', 'refunds', 'money back',
+    'attend', 'attendance', 'miss', 'missed', 'missing', 'absent',
+    'late', 'lateness', 'tardy',
+    'reschedule', 'change date', 'move class',
+    'credit', 'credits',
+    'cancel', 'cancellation', 'canceling',
+    'policy', 'policies'
+]
+EMAIL_ONLY_KEYWORDS = [
+    'payment', 'pay', 'billing', 'charge', 'refund', 'money back',
+    'attend', 'attendance', 'miss', 'missed', 'late', 'reschedule',
+    'account', 'login', 'password', 'sign in', 'membership'
+]
+DETAIL_SYNONYMS = [
+    'detail', 'details', 'explain', 'elaborate', 'tell me more',
+    'more info', 'describe', 'thorough', 'comprehensive'
+]
+PERSONA_INSTRUCTION = """
+You are a warm, encouraging mentor at Get Scene Studios. Your goal is to help actors navigate their careers with confidence.
+- User Context: The user is already on getscenestudios.com. Behave as if you are a guide right there with them.
+- Negative Constraint: NEVER use the phrase "Visit the website" or "Check our site". Instead, use "You can see here..." or "Click this link below..." or similar language that implies current presence.
+- Sound natural and human, not scripted or robotic. Use conversational transitions like "I'd suggest starting with..." or "A great way to approach this is..."
+- Be encouraging but practical. Acknowledge that the acting journey is a marathon, not a sprint.
+- Help the user THINK: Instead of just giving an answer, add a brief "mentorship flourish" that explains the value of a recommendation (e.g., "This workshop is great because it gets you comfortable with the pressure of a real callback.")
+"""

database.py ADDED Viewed

	@@ -0,0 +1,168 @@

+import sqlite3
+import json
+from contextlib import contextmanager
+from typing import List, Dict, Any, Tuple
+from config import DB_PATH
+@contextmanager
+def get_db_connection():
+    """Context manager for database connections."""
+    conn = sqlite3.connect(DB_PATH)
+    conn.row_factory = sqlite3.Row
+    try:
+        yield conn
+    finally:
+        conn.close()
+def fetch_all_embeddings(table: str) -> List[Tuple[int, str, List[float]]]:
+    """Fetch all embeddings from a table."""
+    with get_db_connection() as conn:
+        cur = conn.cursor()
+        cur.execute(f"SELECT id, full_text, embedding FROM {table}")
+        rows = cur.fetchall()
+    parsed = []
+    for row in rows:
+        try:
+            parsed.append((row['id'], row['full_text'], json.loads(row['embedding'])))
+        except (json.JSONDecodeError, TypeError):
+            continue
+    return parsed
+def fetch_row_by_id(table: str, row_id: int) -> Dict[str, Any]:
+    """Fetch a single row by ID."""
+    with get_db_connection() as conn:
+        cur = conn.cursor()
+        cur.execute(f"SELECT * FROM {table} WHERE id = ?", (row_id,))
+        row = cur.fetchone()
+        return dict(row) if row else {}
+def fetch_all_faq_embeddings() -> List[Tuple[int, str, str, List[float]]]:
+    """Fetch all FAQ embeddings."""
+    with get_db_connection() as conn:
+        cur = conn.cursor()
+        cur.execute("SELECT id, question, answer, embedding FROM faq_entries")
+        rows = cur.fetchall()
+    parsed = []
+    for row in rows:
+        try:
+            parsed.append((row['id'], row['question'], row['answer'], json.loads(row['embedding'])))
+        except (json.JSONDecodeError, TypeError):
+            continue
+    return parsed
+def log_question(
+    question: str,
+    session_id: str = None,
+    category: str = None,
+    answer: str = None,
+    detected_mode: str = None,
+    routing_question: str = None,
+    rule_triggered: str = None,
+    link_provided: bool = False
+):
+    """Log a user question to the database with comprehensive observability metadata.
+    Args:
+        question: The user's question
+        session_id: Session identifier
+        category: Question category (e.g., 'faq_match', 'llm_generated', 'policy_violation')
+        answer: The bot's response
+        detected_mode: Operating mode ('Mode A' or 'Mode B')
+        routing_question: The routing question asked (if any)
+        rule_triggered: Business rule that was triggered (e.g., 'audit_rule', 'free_class_first')
+        link_provided: Whether a direct link was included in the response
+    """
+    with get_db_connection() as conn:
+        cur = conn.cursor()
+        try:
+            cur.execute("""
+                INSERT INTO question_logs (
+                    session_id, question, category, answer,
+                    detected_mode, routing_question, rule_triggered, link_provided
+                )
+                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+            """, (
+                session_id, question, category, answer,
+                detected_mode, routing_question, rule_triggered,
+                1 if link_provided else 0
+            ))
+        except sqlite3.OperationalError as e:
+            # Fallback for older schema versions (shouldn't happen after migration)
+            print(f"⚠️  Logging error: {e}. Falling back to basic logging.")
+            cur.execute("INSERT INTO question_logs (question) VALUES (?)", (question,))
+        conn.commit()
+def get_session_state(session_id: str) -> Dict[str, Any]:
+    """Get session state from DB"""
+    with get_db_connection() as conn:
+        cur = conn.cursor()
+        cur.execute("SELECT * FROM user_sessions WHERE session_id = ?", (session_id,))
+        row = cur.fetchone()
+        if row:
+            return dict(row)
+    return {"preference": None, "msg_count": 0, "clarification_count": 0, "knowledge_context": "{}"}
+def update_session_state(session_id: str, preference: str = None, increment_count: bool = True, increment_clarification: bool = False, reset_clarification: bool = False, knowledge_update: Dict = None):
+    """Update session state with Knowledge Dictionary support"""
+    with get_db_connection() as conn:
+        cur = conn.cursor()
+        # Check if exists
+        cur.execute("SELECT preference, msg_count, clarification_count, knowledge_context FROM user_sessions WHERE session_id = ?", (session_id,))
+        row = cur.fetchone()
+        current_knowledge = {}
+        if row:
+            curr_pref, curr_count, curr_clarification, curr_knowledge_json = row
+            try:
+                current_knowledge = json.loads(curr_knowledge_json)
+            except:
+                current_knowledge = {}
+            new_pref = preference if preference else curr_pref
+            new_count = curr_count + 1 if increment_count else curr_count
+            # 10-Message Memory Rule: Reset if we hit the limit
+            if new_count > 10:
+                print(f"🔄 Session {session_id} reached 10 messages. Resetting memory context.")
+                new_count = 1
+                new_pref = None
+                current_knowledge = {}
+                new_clarification = 0
+            else:
+                new_clarification = curr_clarification
+                if reset_clarification:
+                    new_clarification = 0
+                elif increment_clarification:
+                    new_clarification = curr_clarification + 1
+            # Merge knowledge updates
+            if knowledge_update:
+                current_knowledge.update(knowledge_update)
+            new_knowledge_json = json.dumps(current_knowledge)
+            cur.execute("""
+                UPDATE user_sessions
+                SET preference = ?, msg_count = ?, clarification_count = ?, knowledge_context = ?, last_updated = CURRENT_TIMESTAMP
+                WHERE session_id = ?
+            """, (new_pref, new_count, new_clarification, new_knowledge_json, session_id))
+        else:
+            new_pref = preference
+            new_count = 1 if increment_count else 0
+            new_clarification = 1 if increment_clarification else 0
+            if knowledge_update:
+                current_knowledge.update(knowledge_update)
+            new_knowledge_json = json.dumps(current_knowledge)
+            cur.execute("""
+                INSERT INTO user_sessions (session_id, preference, msg_count, clarification_count, knowledge_context)
+                VALUES (?, ?, ?, ?, ?)
+            """, (session_id, new_pref, new_count, new_clarification, new_knowledge_json))
+        conn.commit()

scraper.py ADDED Viewed

	@@ -0,0 +1,347 @@

+import requests
+import json
+import re
+from bs4 import BeautifulSoup
+from typing import List, Dict, Any, Tuple
+from utils import clean_time
+def scrape_workshops_from_squarespace(url: str) -> List[Dict[str, str]]:
+    """
+    Extract workshops using our robust Squarespace JSON + HTML parsing system
+    """
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+    }
+    try:
+        # First try the Squarespace JSON API
+        json_url = f"{url}?format=json"
+        print(f"🔍 Trying Squarespace JSON API: {json_url}")
+        response = requests.get(json_url, headers=headers, timeout=10)
+        if response.status_code == 200:
+            try:
+                json_data = response.json()
+                workshops = extract_workshops_from_json(json_data, json_url)
+                if workshops:
+                    print(f"✅ Extracted {len(workshops)} workshops from JSON API")
+                    return workshops
+                else:
+                    print("❌ No workshops found in JSON, falling back to HTML")
+            except json.JSONDecodeError:
+                print("❌ Invalid JSON response, falling back to HTML")
+        # Fallback to HTML scraping if JSON fails
+        print(f"📄 Falling back to HTML scraping for {url}")
+        response = requests.get(url, headers=headers, timeout=10)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.content, 'html.parser')
+        workshops = parse_workshops_from_html(soup, url)
+        if workshops:
+            print(f"✅ Extracted {len(workshops)} workshops from HTML parsing")
+            return workshops
+        else:
+            print("❌ No workshops found in HTML")
+            return []
+    except Exception as e:
+        print(f"❌ Error scraping workshops from {url}: {e}")
+        return []
+def extract_workshops_from_json(data: Any, source_url: str) -> List[Dict[str, str]]:
+    """Extract workshop information from Squarespace JSON data"""
+    workshops = []
+    # Check if there's mainContent HTML to parse
+    if isinstance(data, dict) and 'mainContent' in data:
+        main_content_html = data['mainContent']
+        if isinstance(main_content_html, str):
+            print(f"🎯 Found mainContent HTML! Length: {len(main_content_html)} characters")
+            soup = BeautifulSoup(main_content_html, 'html.parser')
+            workshops = parse_workshops_from_html(soup, source_url)
+            if workshops:
+                return workshops
+    return workshops
+def parse_workshops_from_html(soup, source_url: str) -> List[Dict[str, str]]:
+    """Enhanced HTML parsing specifically for workshop content"""
+    workshops = []
+    workshop_texts = set()
+    print(f"🔍 ENHANCED HTML PARSING:")
+    # Method 1: Find individual workshop containers
+    potential_containers = soup.find_all(['div', 'section', 'article'],
+                                       attrs={'class': re.compile(r'(item|card|product|workshop|class)', re.I)})
+    print(f"   Found {len(potential_containers)} potential workshop containers")
+    for container in potential_containers:
+        workshop_text = container.get_text(strip=True)
+        if len(workshop_text) < 30 or workshop_text in workshop_texts:
+            continue
+        if any(keyword in workshop_text.lower() for keyword in ['with', 'casting', 'director', 'agent', 'perfect submission', 'crush the callback', 'get scene']):
+            workshop = extract_single_workshop_from_text(workshop_text, source_url)
+            if workshop and not is_duplicate_workshop(workshop, workshops):
+                workshops.append(workshop)
+                workshop_texts.add(workshop_text)
+    # Method 2: Pattern-based extraction from full text
+    all_text = soup.get_text()
+    workshop_patterns = [
+        # Pattern 1: "Workshop Title with Professional Title Name on Date @ Time"
+        r'((?:The\s+)?(?:Perfect\s+Submission|Crush\s+the\s+Callback|Get\s+Scene\s+360?))\s+with\s+((?:Casting\s+Director|DDO\s+Agent|Manager|Director|Producer|Agent|Acting\s+Coach|Talent\s+Agent|Executive\s+Casting\s+Producer)\s+[A-Za-z\s]+?)\s+on\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',
+        # Pattern 2: "Professional Title Name, Workshop Title on Date @ Time"
+        r'((?:Atlanta\s+Models\s+&\s+Talent\s+President|Talent\s+Agent|Casting\s+Director|Manager|Director|Producer|Agent)\s+[A-Za-z\s]+?),\s+((?:The\s+)?(?:Perfect\s+Submission|Crush\s+the\s+Callback|Get\s+Scene\s+360?))\s+on\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',
+        # Pattern 3: "Casting Director Name, Date @ Time"
+        r'(Casting\s+Director)\s+([A-Za-z\s\-]+?),\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*(?:at\s+)?([0-9:]+\s*(?:AM|PM))?',
+    ]
+    for i, pattern in enumerate(workshop_patterns):
+        matches = re.findall(pattern, all_text, re.IGNORECASE)
+        for match in matches:
+            workshop = parse_refined_workshop_match(match, i+1, source_url)
+            if workshop and not is_duplicate_workshop(workshop, workshops):
+                workshops.append(workshop)
+    print(f"🎯 TOTAL UNIQUE WORKSHOPS FOUND: {len(workshops)}")
+    return workshops
+def extract_single_workshop_from_text(text: str, source_url: str) -> Dict[str, str]:
+    """Extract workshop info from a single text block"""
+    # Clean up the text
+    text = re.sub(r'\$[0-9,]+\.00', '', text)
+    text = re.sub(r'Featured|Sold Out', '', text, flags=re.IGNORECASE)
+    text = re.sub(r'\s+', ' ', text).strip()
+    text = re.sub(r'\n+', ' ', text)
+    patterns = [
+        # Pattern A: "Title with Professional Name on Date @ Time"
+        r'((?:The\s+)?(?:Perfect\s+Submission|Crush\s+the\s+Callback|Get\s+Scene\s+360?))\s+with\s+((?:Casting\s+Director|CD|DDO\s+Agent|Manager|Director|Producer|Agent|Acting\s+Coach|Talent\s+Agent|Executive\s+Casting\s+Producer|Atlanta\s+Models\s+&\s+Talent\s+President)\s+[A-Za-z\s\-]+?)\s+on\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',
+        # Pattern B: "Professional Name, Title on Date @ Time"
+        r'((?:Atlanta\s+Models\s+&\s+Talent\s+President|Talent\s+Agent|Casting\s+Director|Casting\s+Associate|Manager|Director|Producer|Agent|Executive\s+Casting\s+Producer)\s+[A-Za-z\s\-]+?),\s+((?:The\s+)?(?:Perfect\s+Submission|Crush\s+the\s+Callback|Get\s+Scene\s+360?))\s+on\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',
+        # Pattern C: "Casting Director Name, Date at Time"
+        r'(Casting\s+Director|Casting\s+Associate)\s+([A-Za-z\s\-]+?),\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*(?:at\s+)?([0-9:]+\s*(?:AM|PM))?',
+        # Pattern D: "Company Executive Producer Name on Date"
+        r"([A-Za-z']+\s+(?:Executive\s+Casting\s+Producer|Studios\s+Casting\s+Associate))\s+([A-Za-z\s]+?)\s+(?:on\s+)?(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?",
+        # Pattern E: "Company Agent Name Date" (fixed "on" issue)
+        r'([A-Za-z\s]+)\s+(Agent|Talent)\s+([A-Za-z\s]+?)\s+(?:on\s+)?(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',
+        # Pattern F: "Company, Person, Title on Date"
+        r'([A-Za-z\s]+\s+Talent),\s+([A-Za-z\s\.]+?),\s+((?:The\s+)?(?:Perfect\s+Submission|Crush\s+the\s+Callback|Get\s+Scene\s+360?))\s+on\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',
+        # Pattern G: Flexible fallback
+        r'^([A-Za-z\s&\']{3,25}(?:Director|Agent|Manager|Producer|President|Coach))\s+([A-Za-z\s\-]{3,30}?)\s+(?:on\s+)?(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?$'
+    ]
+    for i, pattern in enumerate(patterns):
+        match = re.search(pattern, text, re.IGNORECASE)
+        if match:
+            return parse_pattern_match(match, i, source_url)
+    return None
+def parse_pattern_match(match, pattern_index: int, source_url: str) -> Dict[str, str]:
+    """Parse a regex match or tuple based on pattern type"""
+    # Use a helper to get group content whether it's a match object or tuple
+    def get_grp(m, idx):
+        val = ""
+        if hasattr(m, 'group'):
+            try:
+                val = m.group(idx)
+            except IndexError:
+                val = ""
+        # If it's a tuple (from findall), idx is 1-based in standard regex terminology
+        # but 0-indexed in the tuple.
+        elif isinstance(m, (tuple, list)):
+            if 0 <= idx-1 < len(m):
+                val = m[idx-1]
+        return val if val is not None else ""
+    # Initialize variables
+    workshop_title = ""
+    instructor_title = ""
+    instructor_name = ""
+    date_str = ""
+    time_str = ""
+    try:
+        if pattern_index == 0:  # Pattern A/1
+            workshop_title = get_grp(match, 1).strip()
+            professional_full = get_grp(match, 2).strip()
+            date_str = get_grp(match, 3).strip()
+            time_str = get_grp(match, 4).strip()
+            if professional_full.startswith('CD '):
+                professional_full = 'Casting Director ' + professional_full[3:]
+            instructor_title, instructor_name = parse_professional_info(professional_full)
+        elif pattern_index == 1:  # Pattern B/2
+            professional_full = get_grp(match, 1).strip()
+            workshop_title = get_grp(match, 2).strip()
+            date_str = get_grp(match, 3).strip()
+            time_str = get_grp(match, 4).strip()
+            instructor_title, instructor_name = parse_professional_info(professional_full)
+        elif pattern_index == 2:  # Pattern C/3
+            instructor_title = get_grp(match, 1).strip()
+            instructor_name = get_grp(match, 2).strip()
+            date_str = get_grp(match, 3).strip()
+            time_str = get_grp(match, 4).strip()
+            workshop_title = "Casting Workshop"
+        elif pattern_index == 3:  # Pattern D
+            instructor_title = get_grp(match, 1).strip()
+            instructor_name = get_grp(match, 2).strip()
+            date_str = get_grp(match, 3).strip()
+            time_str = get_grp(match, 4).strip()
+            workshop_title = "Industry Workshop"
+        elif pattern_index == 4:  # Pattern E
+            company_name = get_grp(match, 1).strip()
+            agent_type = get_grp(match, 2).strip()
+            instructor_name = get_grp(match, 3).strip()
+            date_str = get_grp(match, 4).strip()
+            time_str = get_grp(match, 5).strip()
+            instructor_title = f"{company_name} {agent_type}"
+            workshop_title = "Industry Workshop"
+        elif pattern_index == 5:  # Pattern F
+            company_name = get_grp(match, 1).strip()
+            instructor_name = get_grp(match, 2).strip()
+            workshop_title = get_grp(match, 3).strip()
+            date_str = get_grp(match, 4).strip()
+            time_str = get_grp(match, 5).strip()
+            instructor_title = company_name
+        else:  # Pattern G
+            professional_full = get_grp(match, 1).strip() + " " + get_grp(match, 2).strip()
+            date_str = get_grp(match, 3).strip()
+            time_str = get_grp(match, 4).strip()
+            workshop_title = "Industry Workshop"
+            if len(professional_full) > 50 or '\n' in professional_full:
+                return None
+            instructor_title, instructor_name = parse_professional_info(professional_full)
+        if instructor_name and date_str:
+            # Create full_text for embedding (required by existing Flask API)
+            full_text = f"{workshop_title} with {instructor_title} {instructor_name}"
+            if date_str:
+                full_text += f" on {date_str}"
+            if time_str:
+                full_text += f" at {clean_time(time_str)}"
+            return {
+                'title': workshop_title,
+                'instructor_name': instructor_name,
+                'instructor_title': instructor_title,
+                'date': date_str,
+                'time': clean_time(time_str),
+                'full_text': full_text,  # Required for existing embedding system
+                'source_url': source_url
+            }
+    except Exception as e:
+        print(f"Error parsing pattern match: {e}")
+    return None
+def parse_professional_info(professional_full: str) -> tuple:
+    """Parse professional title and name from full string"""
+    professional_full = re.sub(r'\s+', ' ', professional_full).strip()
+    # Handle specific multi-word titles
+    specific_titles = [
+        'Atlanta Models & Talent President',
+        'Executive Casting Producer',
+        'Casting Director',
+        'Casting Associate',
+        'DDO Agent',
+        'Talent Agent',
+        'Acting Coach'
+    ]
+    for title in specific_titles:
+        if title in professional_full:
+            title_pos = professional_full.find(title)
+            if title_pos == 0:
+                name_part = professional_full[len(title):].strip()
+                return title, name_part
+            else:
+                name_part = professional_full[:title_pos].strip().rstrip(',')
+                return title, name_part
+    # Fallback for single-word titles
+    single_word_titles = ['Manager', 'Director', 'Producer', 'Agent', 'Coach', 'President']
+    words = professional_full.split()
+    for i, word in enumerate(words):
+        if word in single_word_titles:
+            if i > 0 and words[i-1] in ['Casting', 'Talent', 'Executive', 'DDO', 'Acting']:
+                title = f"{words[i-1]} {word}"
+                name_parts = words[:i-1] + words[i+1:]
+            else:
+                title = word
+                name_parts = words[:i] + words[i+1:]
+            name = ' '.join(name_parts).strip()
+            return title, name
+    # Final fallback
+    if len(words) >= 2:
+        return words[0], ' '.join(words[1:])
+    return '', professional_full
+def parse_refined_workshop_match(match, pattern_num: int, source_url: str) -> Dict[str, str]:
+    """Parse a regex match into a clean workshop dictionary"""
+    return parse_pattern_match(match, pattern_num-1, source_url)  # Adjust for 0-based indexing
+def is_duplicate_workshop(new_workshop: Dict, existing_workshops: List[Dict]) -> bool:
+    """Enhanced duplicate detection"""
+    for existing in existing_workshops:
+        if (existing.get('instructor_name', '').strip().lower() == new_workshop.get('instructor_name', '').strip().lower() and
+            existing.get('date', '').strip().lower() == new_workshop.get('date', '').strip().lower()):
+            existing_title = existing.get('title', '').strip().lower()
+            new_title = new_workshop.get('title', '').strip().lower()
+            if (existing_title == new_title or
+                'workshop' in existing_title and 'workshop' in new_title or
+                existing_title in new_title or new_title in existing_title):
+                return True
+    return False
+def calculate_workshop_confidence(w: Dict) -> float:
+    """Calculate confidence score of retrieved workshop data"""
+    score = 0.0
+    if w.get('title'): score += 0.3
+    if w.get('instructor_name'): score += 0.3
+    if w.get('date'): score += 0.2
+    if w.get('time'): score += 0.1
+    if w.get('source_url'): score += 0.1
+    return round(score, 2)

utils.py ADDED Viewed

	@@ -0,0 +1,128 @@

+import openai
+import numpy as np
+import re
+from typing import List, Tuple
+from config import EMBED_MODEL
+def get_embedding(text: str) -> List[float]:
+    """Generate embedding for a given text."""
+    text_strip = text.replace("\n", " ").strip()
+    response = openai.embeddings.create(input=[text_strip], model=EMBED_MODEL)
+    return response.data[0].embedding
+def cosine_similarity(a: List[float], b: List[float]) -> float:
+    """Calculate cosine similarity between two vectors."""
+    a = np.array(a)
+    b = np.array(b)
+    if np.linalg.norm(a) == 0 or np.linalg.norm(b) == 0:
+        return 0.0
+    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
+def clean_time(time_str: str) -> str:
+    """Clean up time string."""
+    if not time_str:
+        return ""
+    time_match = re.search(r'(\d{1,2}):?(\d{0,2})\s*(AM|PM)', time_str, re.IGNORECASE)
+    if time_match:
+        hour = time_match.group(1)
+        minute = time_match.group(2) or "00"
+        ampm = time_match.group(3).upper()
+        return f"{hour}:{minute} {ampm}"
+    return time_str.strip()
+def find_top_k_matches(user_embedding, dataset, k=3):
+    """Find top k matching entries from a dataset."""
+    scored = []
+    for entry_id, text, emb in dataset:
+        score = cosine_similarity(user_embedding, emb)
+        scored.append((score, entry_id, text))
+    scored.sort(reverse=True)
+    return scored[:k]
+def classify_intent(question: str) -> str:
+    """
+    Classify the user's intent into:
+    Mode A: Recommendation Mode (Workshops, Dates, Availability, Recommendations)
+    Mode B: Front Desk Mode (Default - Everything else)
+    """
+    prompt = f"""Classify the following user question into one of two modes:
+1. "Mode A - Recommendation Mode": Use this if the user is asking about workshops, specific dates, what's available this month, asking for recommendations, or career goals (like getting an agent).
+2. "Mode B - Front Desk Mode": Use this for broad introductory questions, kids classes, signing up, summit, instructor roles, auditing, online vs in-studio, general policies, or specific questions about existing classes.
+User Question: "{question}"
+Response must be exactly "Mode A" or "Mode B"."""
+    try:
+        response = openai.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=[{"role": "user", "content": prompt}],
+            temperature=0,
+            max_tokens=5
+        )
+        prediction = response.choices[0].message.content.strip()
+        if "Mode A" in prediction:
+            return "Mode A"
+        return "Mode B"
+    except Exception as e:
+        print(f"Error in intent classification: {e}")
+        return "Mode B"  # Default to Front Desk Mode
+def should_include_email(question: str) -> bool:
+    """
+    Determine if the contact email should be shown based on user intent.
+    Allowed for: Payments, Refunds, Attendance issues, Account problems.
+    """
+    from config import EMAIL_ONLY_KEYWORDS
+    import re
+    question_lower = question.lower()
+    for word in EMAIL_ONLY_KEYWORDS:
+        pattern = rf'\b{re.escape(word)}\b'
+        if re.search(pattern, question_lower):
+            return True
+    return False
+def classify_user_type(question: str, history: List[dict] = None) -> str:
+    """
+    Classify the user type into:
+    - new_actor
+    - experienced_actor
+    - parent
+    - current_student
+    - unknown
+    """
+    history_str = ""
+    if history:
+        history_str = "\nConversation context:\n" + "\n".join([f"{m['role']}: {m['content'][:100]}..." for m in history[-3:]])
+    prompt = f"""Classify the user into exactly one of these categories based on their question and context:
+1. "new_actor": Just starting out, has no experience, or is asking how to begin.
+2. "experienced_actor": Already has credits, mentions agents, looking for advanced workshops, or refers to their career progress.
+3. "parent": Asking on behalf of their child, mentions "my kid", "my son", "my daughter", "teens".
+4. "current_student": Refers to past/current classes at Get Scene, mentions a specific GSP membership, or asks about recurring student workshops.
+5. "unknown": Not enough information yet.
+User Question: "{question}"{history_str}
+Response must be exactly one of: new_actor, experienced_actor, parent, current_student, unknown."""
+    try:
+        response = openai.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=[{"role": "user", "content": prompt}],
+            temperature=0,
+            max_tokens=10
+        )
+        prediction = response.choices[0].message.content.strip().lower()
+        valid_types = ["new_actor", "experienced_actor", "parent", "current_student", "unknown"]
+        for t in valid_types:
+            if t in prediction:
+                return t
+        return "unknown"
+    except Exception as e:
+        print(f"Error in user type classification: {e}")
+        return "unknown"