saim1309 committed on
Commit
41b57f8
·
verified ·
1 Parent(s): e730fc6

Upload 5 files

Browse files
Files changed (5) hide show
  1. app.py +976 -0
  2. config.py +68 -0
  3. database.py +168 -0
  4. scraper.py +347 -0
  5. utils.py +128 -0
app.py ADDED
@@ -0,0 +1,976 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import openai
3
+ import json
4
+ import re
5
+ import os
6
+ from datetime import datetime, timedelta
7
+ import uuid
8
+ from typing import Dict
9
+
10
+ from config import (
11
+ OPENAI_API_KEY, DB_PATH, EMBED_MODEL,
12
+ GEN_MODEL, FAST_MODEL,
13
+ EMOTIONAL_KEYWORDS, ACTION_KEYWORDS, POLICY_KEYWORDS,
14
+ EMAIL_ONLY_KEYWORDS, DETAIL_SYNONYMS, PERSONA_INSTRUCTION
15
+ )
16
+ from utils import (
17
+ get_embedding, cosine_similarity, find_top_k_matches,
18
+ classify_intent, should_include_email, classify_user_type
19
+ )
20
+ from scraper import scrape_workshops_from_squarespace
21
+ from database import (
22
+ fetch_all_embeddings,
23
+ fetch_row_by_id,
24
+ fetch_all_faq_embeddings,
25
+ get_session_state,
26
+ update_session_state,
27
+ log_question
28
+ )
29
+
30
+ # ============================================================================
31
+ # CONFIGURATION
32
+ # ============================================================================
33
+
34
# Fail fast at import time: nothing in this module works without an API key.
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY not found in .env file")

openai.api_key = OPENAI_API_KEY


# Store session ID for the conversation
session_id = str(uuid.uuid4())

# Cache for workshop data and embeddings
workshop_cache = {
    'data': [],
    'embeddings': [],
    'last_updated': None,
    'cache_duration': timedelta(hours=24)
}

# Load Truth Sheet (Structured Knowledge)
# The load is best-effort: on failure the dict stays empty and the error is printed.
STRUCTURED_KNOWLEDGE = {}
try:
    knowledge_path = os.path.join(os.path.dirname(__file__), "structured_knowledge.json")
    # JSON is UTF-8 by specification; do not depend on the platform's locale encoding.
    with open(knowledge_path, "r", encoding="utf-8") as f:
        STRUCTURED_KNOWLEDGE = json.load(f)
except (OSError, ValueError) as e:
    # OSError: missing/unreadable file. ValueError: invalid JSON or undecodable bytes
    # (json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses).
    print(f"Error loading structured_knowledge.json: {e}")
59
+
60
def get_structured_knowledge_snippet(preference=None):
    """Render STRUCTURED_KNOWLEDGE as a text block for inclusion in the prompt.

    When ``preference`` is falsy, both online and in-studio entries are listed.
    Otherwise only entries whose format equals the lower-cased preference
    ('online' or 'instudio') are included. Returns "" when no structured
    knowledge is loaded.
    """
    if not STRUCTURED_KNOWLEDGE:
        return ""

    wanted = preference.lower() if preference else None
    show_online = wanted is None or wanted == 'online'
    show_instudio = wanted is None or wanted == 'instudio'

    parts = ["--- STRUCTURED TRUTH SHEET (VERIFIED KNOWLEDGE) ---\n"]

    # Links
    free_online = STRUCTURED_KNOWLEDGE.get('free_online_class', {}).get('link', '')
    if show_online:
        parts.append(f"Free Online Class: {free_online}\n")

    kids = STRUCTURED_KNOWLEDGE.get('kids_classes', {})
    if show_online:
        parts.append(f"Kids Classes (Online): {kids.get('online_link', '')}\n")
    if show_instudio:
        parts.append(f"Kids Classes (Atlanta): {kids.get('atlanta_link', '')}\n")

    # The summit line is emitted regardless of preference.
    summit = STRUCTURED_KNOWLEDGE.get('summit', {})
    parts.append(f"Summit: {summit.get('link', '')} - {summit.get('description', '')}\n")

    # Instructors
    instructors = STRUCTURED_KNOWLEDGE.get('instructors', [])
    if instructors:
        parts.append("Instructors & Roles (STRICT):\n")
        for person in instructors:
            excluded = person.get('not_roles', [])
            suffix = f" [NOT: {', '.join(excluded)}]" if excluded else ""
            parts.append(f"- {person['name']}: {person['role']}{suffix}\n")

    # Paths
    paths = STRUCTURED_KNOWLEDGE.get('paths', {})
    if show_online:
        parts.append(f"Online Path: {paths.get('online', '')}\n")
    if show_instudio:
        parts.append(f"Atlanta Path: {paths.get('atlanta', '')}\n")

    parts.append("--------------------------------------------------\n")
    return "".join(parts)
99
+
100
+ # ============================================================================
101
+ # HELPER FUNCTIONS
102
+ # ============================================================================
103
+
104
def calculate_workshop_confidence(w: Dict) -> float:
    """Score how complete a scraped workshop record is.

    Each truthy field adds its weight; the total is rounded to two decimals,
    giving a value between 0.0 (nothing present) and 1.0 (all fields present).
    """
    field_weights = (
        ('title', 0.3),
        ('instructor_name', 0.3),
        ('date', 0.2),
        ('time', 0.1),
        ('source_url', 0.1),
    )
    confidence = 0.0
    for field, weight in field_weights:
        if w.get(field):
            confidence += weight
    return round(confidence, 2)
113
+
114
+ # ============================================================================
115
+ # WORKSHOP FUNCTIONS
116
+ # ============================================================================
117
+
118
def get_current_workshops():
    """Return ``(workshops, embeddings)``, served from the module cache when fresh.

    A fresh scrape of the online and in-studio pages is performed when the
    cache is empty or older than its configured duration. Records scoring
    below 0.8 on calculate_workshop_confidence are dropped. If no valid record
    remains, previously cached data is returned when available, otherwise two
    empty lists.
    """
    cache = workshop_cache
    now = datetime.now()

    # Serve from cache while it is populated and within its lifetime.
    last_updated = cache['last_updated']
    if last_updated and now - last_updated < cache['cache_duration'] and cache['data']:
        print("Using cached workshop data")
        return cache['data'], cache['embeddings']

    print("Fetching fresh workshop data...")

    # Use robust Squarespace scraping system
    online_workshops = scrape_workshops_from_squarespace("https://www.getscenestudios.com/online")
    instudio_workshops = scrape_workshops_from_squarespace("https://www.getscenestudios.com/instudio")
    scraped = online_workshops + instudio_workshops

    # Data Integrity: keep only records that are complete enough to present.
    accepted = []
    total_score = 0
    for record in scraped:
        conf = calculate_workshop_confidence(record)
        if conf < 0.8:
            print(f"⚠️ Rejecting weak record (Confidence: {conf}): {record.get('title', 'Unknown')}", flush=True)
            continue
        accepted.append(record)
        total_score += conf

    avg_conf = total_score / len(accepted) if accepted else 0
    print(f"📊 DATA INTEGRITY: Found {len(scraped)} total, {len(accepted)} valid (Confidence >= 0.8)", flush=True)
    print(f"📈 Retrieval Confidence: {avg_conf:.2f} (Average)", flush=True)

    if not accepted:
        if cache['data']:
            print("Scraping failed, using cached data")
            return cache['data'], cache['embeddings']
        print("No workshop data available")
        return [], []

    # Generate embeddings for workshops; a failed embedding becomes a zero
    # vector so that the two lists stay index-aligned.
    vectors = []
    for record in accepted:
        try:
            vectors.append(get_embedding(record['full_text']))
        except Exception as e:
            print(f"Error generating embedding for workshop: {e}")
            vectors.append([0] * 1536)

    # Update cache
    cache['data'] = accepted
    cache['embeddings'] = vectors
    cache['last_updated'] = now

    print(f"Cached {len(accepted)} workshops")
    return accepted, vectors
181
+
182
def find_top_workshops(user_embedding, k=3):
    """Rank the current workshops against ``user_embedding`` and return the top ``k``.

    Each result is a tuple ``(score, index, full_text, workshop)``, ordered from
    highest to lowest score. Workshops whose similarity cannot be computed are
    skipped. Returns an empty list when no workshops are available.
    """
    workshops, workshop_embeddings = get_current_workshops()
    if not workshops:
        return []

    ranked = []
    for idx, (entry, vector) in enumerate(zip(workshops, workshop_embeddings)):
        try:
            similarity = cosine_similarity(user_embedding, vector)
            ranked.append((similarity, idx, entry['full_text'], entry))
        except Exception as e:
            print(f"Error calculating similarity: {e}")

    # Tuples sort by score first; the unique index breaks ties before the
    # comparison can reach the non-orderable workshop dict.
    ranked.sort(reverse=True)
    return ranked[:k]
200
+
201
+ # ============================================================================
202
+ # PROMPT BUILDING FUNCTIONS
203
+ # ============================================================================
204
+
205
def generate_enriched_links(row):
    """Build the markdown link line for a podcast episode row.

    Args:
        row: Mapping with ``youtube_url``, ``guest_name`` and ``highlight_json``
            (a JSON-encoded list of highlight dicts, each with a ``summary``).

    Returns:
        A one-element list holding the markdown string. The description is the
        first sentence of the first highlight's summary, capped at 120
        characters, or a generic fallback when no usable summary exists
        (missing, ``None``, malformed JSON, or no ``summary`` key).
    """
    base_url = row.get("youtube_url")
    guest_name = row.get("guest_name", "")

    # The column may be absent, NULL, or hold invalid JSON; none of those
    # should prevent a link from being produced.
    raw_highlights = row.get("highlight_json") or "[]"
    if isinstance(raw_highlights, (str, bytes, bytearray)):
        try:
            highlights = json.loads(raw_highlights)
        except ValueError:
            highlights = []
    else:
        highlights = raw_highlights

    summary = ""
    if isinstance(highlights, list) and highlights and isinstance(highlights[0], dict):
        summary = highlights[0].get("summary") or ""

    # Truncate summary to first sentence only
    if summary:
        first_sentence = summary.split('.')[0] + '.'
        if len(first_sentence) > 120:
            short_summary = first_sentence[:117] + "..."
        else:
            short_summary = first_sentence
    else:
        short_summary = "Industry insights for actors"

    markdown = f"🎧 [Watch {guest_name}'s episode here]({base_url}) - {short_summary}"
    return [markdown]
224
+
225
def build_enhanced_prompt(user_question, context_results, top_workshops, user_preference=None, user_type='unknown', enriched_podcast_links=None, wants_details=False, current_topic=None, mode="Mode B", is_low_confidence=False):
    """Build the system prompt with strict formatting rules.

    Args:
        user_question: The user's message; embedded verbatim in the prompt.
        context_results: Retrieved knowledge-base text, included when truthy.
        top_workshops: Iterable of ``(score, index, full_text, workshop)``
            tuples; the first five are considered and up to three are listed.
        user_preference: 'online', 'instudio', or None when unknown.
        user_type: 'new_actor', 'experienced_actor', 'parent',
            'current_student', or anything else for no user-type guidance.
        enriched_podcast_links: Markdown podcast links; only the first is used.
        wants_details: When True, ask for a detailed answer instead of a brief one.
        current_topic: Session topic used when the question names no topic itself.
        mode: "Mode A" selects the recommendation prompt; any other value
            selects the front-desk prompt.
        is_low_confidence: Adds a closest-link instruction to the front-desk prompt.

    Returns:
        The prompt string. Questions classified as "support" by
        detect_response_type get a short support-mode prompt instead.
    """

    # Dynamic Links from Structured Knowledge
    free_class_url = STRUCTURED_KNOWLEDGE.get('free_online_class', {}).get('link', "https://www.getscenestudios.com/online")
    if user_preference and user_preference.lower() == 'instudio':
        free_class_url = STRUCTURED_KNOWLEDGE.get('paths', {}).get('atlanta', "https://www.getscenestudios.com/instudio")

    atlanta_link = STRUCTURED_KNOWLEDGE.get('paths', {}).get('atlanta', "https://www.getscenestudios.com/instudio")
    online_link = STRUCTURED_KNOWLEDGE.get('paths', {}).get('online', "https://www.getscenestudios.com/online")

    truth_sheet_snippet = get_structured_knowledge_snippet(preference=user_preference)

    # helper for clean links
    def format_workshop(w):
        """Return one markdown bullet for workshop ``w``, or None when it must not be shown."""
        # Strict validation: Title, Instructor, Date must be present and non-empty
        if not w.get('title') or not w.get('instructor_name') or not w.get('date'):
            return None

        # Strict formatting: [{Title}]({Link}) with {Instructor} ({Format}) on {Date}
        link = "https://www.getscenestudios.com/instudio" if "/instudio" in w.get('source_url', '') else "https://www.getscenestudios.com/online"

        # User Preference Filtering
        w_type = "Online" if "online" in w.get('source_url', '') else "In-Studio"
        if user_preference:
            # The preference value is 'instudio' while the display label is
            # "In-Studio"; strip the hyphen so matching in-studio workshops are
            # not discarded.
            if user_preference.lower() != w_type.lower().replace('-', ''):
                return None

        # Calculate confidence
        confidence = calculate_workshop_confidence(w)
        if confidence < 0.70:
            return None

        return f"- [{w['title']}]({link}) with {w['instructor_name']} ({w_type}) on {w['date']} at {w.get('time', '')}"

    # Prepare workshop list (Top 3 max)
    workshop_lines = []
    if top_workshops:
        for _, _, _, w_data in top_workshops[:5]:  # Check top 5, take top 3 valid
            formatted = format_workshop(w_data)
            if formatted:
                workshop_lines.append(formatted)

    if workshop_lines:
        workshop_text = "\n".join(workshop_lines[:3])
    else:
        # Improved fallback to avoid generic/placeholder-like feeling
        label = f"{user_preference.capitalize()} " if user_preference else ""
        link = atlanta_link if user_preference == 'instudio' else online_link
        # Mandatory Hyperlink Enforcement
        workshop_text = f"We are constantly updating our schedule! You can view and [register for upcoming {label}workshops here]({link})."

    # Handle missing podcast data strictly
    if not enriched_podcast_links:
        single_podcast = "Our latest industry insights are available on YouTube: https://www.youtube.com/@GetSceneStudios"
    else:
        single_podcast = enriched_podcast_links[0]

    # --- EMOTIONAL / SUPPORT MODE CHECK ---
    is_emotional = detect_response_type(user_question) == "support"

    if is_emotional:
        prompt = f"""{PERSONA_INSTRUCTION}

You are acting in SUPPORT MODE.

CRITICAL INSTRUCTIONS:
1. ACKNOWLEDGE their feelings first (e.g., "I hear how frustrating it is to feel stuck...").
2. Provide SUPPORTIVE language (2-3 sentences max).
3. Offer EXACTLY ONE gentle follow-up resource: either the podcast OR the free class.
4. DO NOT suggest paid workshops or upsell in this response.
5. KEEP IT BRIEF (≤150 words).

USER'S QUESTION: {user_question}

REQUIRED RESPONSE FORMAT:
[Your empathetic, supportive acknowledgment]

Here's a free resource that might help you move forward:
[Pick ONE: {single_podcast} OR Free Class at {free_class_url}]

Questions? Contact info@getscenestudios.com"""
        return prompt

    # --- STANDARD LOGIC FOR CONTEXT SNIPPET ---
    question_lower = user_question.lower()

    # Priority 1: Direct Keywords in current question (first matching row wins)
    topic_keywords = (
        ('agent', ['agent', 'representation', 'rep', 'manager']),
        ('beginner', ['beginner', 'new', 'start', 'beginning']),
        ('audition', ['callback', 'audition', 'tape', 'self-tape', 'booking']),
        ('mentorship', ['mentorship', 'coaching']),
        ('pricing', ['price', 'cost', 'how much']),
        ('classes', ['class', 'workshop', 'training', 'learn']),
        ('membership', ['membership', 'gsp', 'plus']),
    )
    detected_topic = next(
        (topic for topic, words in topic_keywords
         if any(word in question_lower for word in words)),
        None,
    )

    # Priority 2: Fallback to session context if current question is ambiguous
    if not detected_topic and current_topic:
        topic_map = {
            'agent_seeking': 'agent',
            'beginner': 'beginner',
            'audition_help': 'audition',
            'mentorship': 'mentorship',
            'pricing': 'pricing',
            'classes': 'classes',
            'membership': 'membership'
        }
        detected_topic = topic_map.get(current_topic)

    # Assign snippet based on topic
    if detected_topic == 'agent':
        context_snippet = "Get Scene Studios has helped 1000+ actors land representation. Total Agent Prep offers live practice with working agents (age 16+, limited to 12 actors)."
    elif detected_topic == 'beginner':
        context_snippet = "Get Scene Studios specializes in getting actors audition-ready fast with camera technique and professional self-tape skills."
    elif detected_topic == 'audition':
        context_snippet = "Get Scene offers Crush the Callback (Zoom simulation) and Perfect Submission (self-tape mastery) for actors refining their technique."
    elif detected_topic == 'mentorship':
        context_snippet = "Working Actor Mentorship is a 6-month program ($3,000) with structured feedback and industry access."
    elif detected_topic == 'pricing':
        context_snippet = "Get Scene Studios pricing varies by program. Most workshops cap at 12-14 actors for personalized feedback."
    elif detected_topic == 'classes':
        link = online_link if user_preference == 'online' else atlanta_link
        context_snippet = f"Get Scene Studios offers world-class {user_preference or ''} acting workshops. Our sessions focus on camera technique and industry readiness. Full details at {link}."
    elif detected_topic == 'membership':
        context_snippet = "Get Scene Plus (GSP) is our membership program that provides ongoing access to industry pros and audition insights."
    elif 'summit' in question_lower:
        # NOTE(review): "recursive" below presumably means "recurring" - confirm before changing the prompt text.
        context_snippet = "The Get Scene Summit is a premier special event featuring massive line-ups of agents, managers, and casting directors. It is NOT a recursive workshop."
    else:
        context_snippet = "Get Scene Studios (founded by Jesse Malinowski) offers training for TV/film actors at all levels."

    if not user_preference:
        preference_instruction = """
IMPORTANT: We need to know if the user prefers "Online" or "In-Studio" workshops.
If their question is broad (e.g., "starting acting", "kids classes", "workshops", "training", "classes") and they haven't specified a format, you MUST START your response with this exact question: "Are you looking for online training or in-studio in Atlanta?"
NO PREFIXES, NO "WARM" TRANSITIONS, NO PARAPHRASING.

FEW-SHOT EXAMPLES:
User: "I want to start acting"
Response: "Are you looking for online training or in-studio in Atlanta? That's a fantastic decision! With Get Scene Studios..."

User: "Do you have kids classes?"
Response: "Are you looking for online training or in-studio in Atlanta? Absolutely, we offer kids classes in both formats..."
"""
    else:
        preference_instruction = f"""
USER PREFERENCE KNOWN: {user_preference.upper()}
1. DO NOT ask "Online or In-Studio" again.
2. Ensure your recommendations align with {user_preference.upper()} where possible.
"""

    # Rule 6 only exists when the preferred format is known; otherwise its line stays empty.
    strict_link_rule = ""
    if user_preference:
        other_format = "In-Studio" if user_preference.lower() == 'online' else "Online"
        strict_link_rule = (
            "6. **STRICT LINK FILTERING**: User prefers " + user_preference.upper()
            + ". You MUST ONLY provide links for " + user_preference.upper()
            + " training. OMIT any " + other_format + " links entirely."
        )

    BUSINESS_RULES_INSTRUCTION = f"""
TOP-PRIORITY BUSINESS RULES (NO EXCEPTIONS):
1. **NO AUDITING**: Workshops can NEVER be audited. Do not reason about this. Tell the user "We do not allow auditing for our workshops" and immediately redirect them to the Free Online Class.
2. **FREE CLASS FIRST**: The Free Online Class is the MANDATORY first step for ALL new users. If a user is "starting out", "new to acting", or asking "how to begin", you MUST route them to the Free Online Class link below as their primary next step.
3. **NO IMMEDIATE PAID RECOMMENDATIONS**: For new or unclear users, do NOT recommend specific paid workshops yet. Focus entirely on the Free Online Class as the entry point.
4. **KIDS CLASSES**: We offer kids classes both Online and in Atlanta (In-Studio).
5. **SUMMIT**: The Summit is a special event offering, NOT a regular workshop.
{strict_link_rule}
7. **ROLE INTEGRITY (STRICT)**:
- **THE TRUTH SHEET IS THE ABSOLUTE AND FINAL AUTHORITY.** It overrides ANY information found in podcast descriptions, workshop titles, or suggested by the user.
- ONLY use the roles explicitly defined in the Truth Sheet.
- **NEVER infer a role** from the context of a workshop or podcast.
- If someone is teaching a class, do NOT assume they are an "Instructor" unless the Truth Sheet says so.
- If someone is labeled as an "Agent", do NOT call them an "Instructor" or "Mentor" unless explicitly listed as such in the TRUTH SHEET.
- Pay attention to the "[NOT: ...]" list for each person in the Truth Sheet. For example, if someone is listed as "[NOT: Instructor]", NEVER call them an instructor, even if they are described as one in a podcast or workshop description.
- **NEVER** guess or invent a role for anyone.
"""
    # Brevity & Cognitive Load: Direct instructions based on user intent
    detail_instruction = "Answer the user's question briefly (2-3 sentences max, ≤150 words total)."
    if wants_details:
        target = f" regarding {detected_topic or 'the current recommendations'}"
        detail_instruction = f"Provide a detailed and thorough explanation for the user's request{target}. Focus on being helpful and providing deep value as a mentor."

    # Email contact line is conditional
    email_contact = ""
    if should_include_email(user_question):
        email_contact = "\n \nQuestions? Contact info@getscenestudios.com"

    # Context inclusion
    retrieved_info = ""
    if context_results:
        retrieved_info = f"\nRELEVANT INFORMATION FROM KNOWLEDGE BASE:\n{context_results}\n"

    is_beginner = (detected_topic == 'beginner')
    beginner_enforcement = ""
    if is_beginner:
        beginner_enforcement = """
CRITICAL: The user is a BEGINNER. You MUST prioritize the Free Online Class above all else.
1. Do NOT recommend specific paid workshops in your numbered list.
2. Instead, provide the Free Online Class as your primary recommendation.
3. Your numbered list should be:
1. Free Online Class (The mandatory first step)
2. The Get Scene Podcast (For industry mindset)
3. [Choose a very general resource or a 1:1 consultation if available, but NOT a specific workshop]
"""

    user_type_instruction = ""
    if user_type == 'new_actor':
        user_type_instruction = "USER TYPE: NEW ACTOR. Focus heavily on foundation, the Free Online Class, and beginner-friendly mindset. Avoid advanced industry jargon."
    elif user_type == 'experienced_actor':
        user_type_instruction = "USER TYPE: EXPERIENCED ACTOR. Focus on advanced technique, Agent Prep, mentorship, and industry networking. Use professional terminology."
    elif user_type == 'parent':
        user_type_instruction = "USER TYPE: PARENT. Focus on kids/teen programs, safety, youth training paths, and parent-specific concerns."
    elif user_type == 'current_student':
        user_type_instruction = "USER TYPE: EXISTING STUDENT. Focus on GSP membership benefits, advanced mentorships (WAM), and specialized recurring workshops."

    # Only the front-desk prompt carries the low-confidence instruction.
    low_confidence_note = ""
    if is_low_confidence:
        low_confidence_note = "MANDATORY: We don't have a high-confidence match for this specific question. Provide the CLOSEST possible link from our verified knowledge above for their general query."

    if mode == "Mode A":
        # Recommendation Mode: Existing checklist applies
        prompt = f"""{PERSONA_INSTRUCTION}

{truth_sheet_snippet}

{BUSINESS_RULES_INSTRUCTION}

{user_type_instruction}

{beginner_enforcement}

{context_snippet}{retrieved_info}

CRITICAL INSTRUCTIONS (RECOMMENDATION MODE):
- {detail_instruction}
- Use natural, human transitions between your answer and the recommendations.
- For each recommendation, add a tiny bit of "mentor advice" on why it helps.
- Use ONLY the provided links - do not invent recommendations.
- **MANDATORY: Use direct hyperlinks.** For ANY mention of signing up, classes, kids programs, the Summit, or the free class, you MUST include the direct [Title](Link) format.
- **NEVER say "check our website"** or "visit the link below". Embed the link directly into the relevant part of your mentor advice.
- Focus on clean, readable formatting.{preference_instruction}

CRITICAL ROLE GUARD (FINAL AUTHORITY):
- Corey Lawson: Instructor/Actor [NOT an Agent]
- Jacob Lawson: Agent/Owner [NOT an Instructor]
- Jesse Malinowski: Founder/Mentor [NOT an Agent]
- Alex White: Agent [NOT an Instructor/Mentor]
- THE TRUTH SHEET IS THE ABSOLUTE AUTHORITY. It overrides ALL other info.
- NEVER call Corey Lawson an agent. They are brothers with different roles.

USER'S QUESTION: {user_question}

REQUIRED RESPONSE FORMAT (STRICT):
[Helpful, mentor-like answer]

Here's your path forward:
1. Free Online Class (Mandatory First Step): {free_class_url}
2. Recommended Podcast Episode (For Industry Mindset):
{single_podcast}
3. Recommended Workshop/Next Step:
{workshop_text}{email_contact}

CRITICAL: YOU MUST USE THE ABOVE "1. 2. 3." STRUCTURE EXACTLY. DO NOT RENAME THE STEPS. DO NOT SKIP THE PODCAST.
"""
    else:
        # Front Desk Mode: More conversational, direct answers, recommendations are optional but encouraged
        prompt = f"""{PERSONA_INSTRUCTION}

{truth_sheet_snippet}

{BUSINESS_RULES_INSTRUCTION}

{context_snippet}{retrieved_info}

CRITICAL INSTRUCTIONS (FRONT DESK MODE):
- You are acting as the warm and helpful Front Desk Mentor.
- **MANDATORY: Ask a routing question AT THE BEGINNING** of your response (e.g., "Are you looking to start your journey or refine existing skills?").
- Answer the user's question directly using the provided information but keep it punchy—**no essays**.
- **MANDATORY: Provide direct hyperlinks** for ANY mention of registration, classes, kids programs, the Summit, or more information. Use EXACTLY these links as relevant:
- Free Online Class: [{free_class_url}]({free_class_url})
- Recommended for you: {single_podcast}
- Upcoming Workshops: {workshop_text}
- Southeast Actor Summit: [Southeast Actor Summit Registration](https://www.getscenestudios.com/southeast-actor-summit)
- **NEVER say "go to the website"** or "check our site". Always provide the specific hyperlink directly in your answer.
- **NEVER guess** or invent information. If it's not in the context, guide the user to clarify.
- **MANDATORY: Guide the user to the next step** at the end of your response (e.g., "A great next step for you would be to sign up for our free class").
- {detail_instruction}
- Focus on being a helpful guide.{preference_instruction}
{low_confidence_note}

CRITICAL ROLE GUARD (FINAL AUTHORITY):
- Corey Lawson: Instructor/Actor [NOT an Agent]
- Jacob Lawson: Agent/Owner [NOT an Instructor]
- Jesse Malinowski: Founder/Mentor [NOT an Agent]
- Alex White: Agent [NOT an Instructor/Mentor]
- THE TRUTH SHEET IS THE ABSOLUTE AUTHORITY. It overrides ALL other info.
- NEVER call Corey Lawson an agent. They are brothers with different roles.

USER'S QUESTION: {user_question}

REQUIRED RESPONSE FORMAT:
[Routing Question]
[Helpful, punchy response with links]
[Next step guidance]{email_contact}"""

    return prompt
532
+
533
+ # ============================================================================
534
+ # DETECTION FUNCTIONS
535
+ # ============================================================================
536
+
537
def detect_question_category(question):
    """Return every category whose keyword list has a substring hit in ``question``.

    Matching is case-insensitive and substring-based. Categories are returned
    in the declaration order of the keyword table; an empty list means no
    keyword matched.
    """
    text = question.lower()

    keyword_table = {
        'agent_seeking': ['agent', 'representation', 'rep', 'manager', 'get an agent'],
        'beginner': ['beginner', 'new', 'start', 'beginning', 'first time', 'never acted'],
        'audition_help': ['audition', 'callback', 'tape', 'self-tape', 'submission'],
        'mentorship': ['mentorship', 'coaching', 'intensive', 'mentor', 'one-on-one'],
        'pricing': ['price', 'cost', 'pricing', '$', 'money', 'payment', 'fee'],
        'classes': ['class', 'workshop', 'training', 'course', 'learn'],
        'membership': ['membership', 'join', 'member', 'gsp', 'plus'],
        'technical': ['self-tape', 'equipment', 'lighting', 'editing', 'camera']
    }

    return [
        name
        for name, terms in keyword_table.items()
        if any(term in text for term in terms)
    ]
558
+
559
def detect_response_type(question):
    """Classify a question as "support" (emotional) or "standard".

    A question is "support" when at least one emotional keyword appears and
    emotional keywords are at least as numerous as action keywords.
    """
    text = question.lower()

    emotional_hits = len([kw for kw in EMOTIONAL_KEYWORDS if kw in text])
    action_hits = len([kw for kw in ACTION_KEYWORDS if kw in text])

    needs_support = emotional_hits > 0 and emotional_hits >= action_hits
    return "support" if needs_support else "standard"
569
+
570
def detect_policy_issue(question):
    """Return True when ``question`` contains any policy keyword as a whole word.

    Word boundaries are used so that a keyword does not fire inside a longer
    word (e.g., 'miss' inside 'submission').
    """
    text = question.lower()
    return any(
        re.search(rf'\b{re.escape(term)}\b', text)
        for term in POLICY_KEYWORDS
    )
579
+
580
def detect_preference(question):
    """Infer a training-format preference from ``question``.

    Returns 'online', 'instudio', or None when no format is mentioned or the
    question mentions both 'online' and 'studio'.
    """
    text = question.lower()
    mentions_online = 'online' in text
    mentions_studio = 'studio' in text

    if mentions_online:
        # "online" together with "studio" is ambiguous, so report nothing.
        return None if mentions_studio else 'online'
    if mentions_studio or 'person' in text or 'atlanta' in text:
        return 'instudio'
    return None
588
+
589
def get_contextual_business_info(categories):
    """Look up program information for each known category in ``categories``.

    Returns a dict keyed by category name; categories without an entry in the
    table are omitted.
    """
    program_table = {
        'agent_seeking': {
            'programs': ['Total Agent Prep', 'Working Actor Mentorship'],
            'key_info': 'Live pitch practice with real agents, Actors Access optimization',
            'journey': 'Total Agent Prep → GSP → Mentorship for sustained progress'
        },
        'beginner': {
            'programs': ['Free Classes', 'Get Scene 360', 'Get Scene Plus'],
            'key_info': 'Start with holistic foundation, build consistency',
            'journey': 'Free class → Get Scene 360 → GSP membership'
        },
        'audition_help': {
            'programs': ['Perfect Submission', 'Crush the Callback', 'Audition Insight'],
            'key_info': 'Self-tape mastery, callback simulation, pro feedback',
            'journey': 'Perfect Submission → GSP for ongoing Audition Insight'
        },
        'mentorship': {
            'programs': ['Working Actor Mentorship'],
            'key_info': '6-month intensive with structured feedback and accountability',
            'journey': 'Ready for commitment → WAM → Advanced workshops'
        }
    }

    return {
        name: program_table[name]
        for name in categories
        if name in program_table
    }
621
+
622
+ # ============================================================================
623
+ # MAIN CHATBOT LOGIC
624
+ # ============================================================================
625
+
626
def update_knowledge_from_question(session_id: str, question: str):
    """Derive format/topic attributes from a question and persist them.

    Args:
        session_id: Session whose knowledge dictionary is updated.
        question: Raw user message.

    Returns:
        The dict of attributes written to the session ('format' and/or
        'topic'), or an empty dict when nothing was detected.
    """
    detected = {}

    # Format (online vs. in-studio)
    format_pref = detect_preference(question)
    if format_pref:
        detected['format'] = format_pref

    # Topic: the first priority topic present wins; otherwise the first category.
    categories = detect_question_category(question)
    if categories:
        ranked_topics = ('agent_seeking', 'beginner', 'audition_help', 'mentorship', 'pricing')
        preferred = next((t for t in ranked_topics if t in categories), None)
        detected['topic'] = preferred if preferred is not None else categories[0]

    if not detected:
        return {}

    # Persist without counting this as a new message.
    update_session_state(session_id, knowledge_update=detected, increment_count=False)
    return detected
651
+
652
def process_question(question: str, current_session_id: str):
    """Main function to process user questions - replaces Flask /ask endpoint.

    Pipeline: policy check -> session/knowledge update -> intent routing
    (Mode A / Mode B) -> FAQ semantic match -> relevance guard -> LLM answer
    with workshop/podcast context -> routing-question enforcement -> logging.

    Args:
        question: Raw user message.
        current_session_id: Identifier of the conversation session.

    Returns:
        The reply text. Any unexpected exception is caught and turned into a
        generic apology message rather than propagated.
    """

    try:
        if not question:
            return "Question is required"

        # 0. INTENT CLASSIFICATION
        activated_mode = "Mode B"  # Default safe value

        # 1. HARD POLICY CHECK (Internal critical issues only)
        if detect_policy_issue(question) and should_include_email(question):
            log_question(
                question=question,
                session_id=current_session_id,
                category="policy_violation",
                answer="Please email info@getscenestudios.com.",
                detected_mode="Mode B",
                routing_question=None,
                rule_triggered="policy_email_only",
                link_provided=False
            )
            return "Please email info@getscenestudios.com."

        # 2. Handle Session & Knowledge State
        update_knowledge_from_question(current_session_id, question)

        session_state = get_session_state(current_session_id)

        # The stored context may be missing (None) or malformed; fall back to
        # an empty dict instead of swallowing every possible exception.
        try:
            knowledge = json.loads(session_state.get('knowledge_context', '{}'))
        except (ValueError, TypeError):
            knowledge = {}
        if not isinstance(knowledge, dict):
            knowledge = {}

        user_type = knowledge.get('user_type', 'unknown')

        # Update User Type if unknown or enough turn count
        if user_type == 'unknown' or session_state.get('msg_count', 0) % 3 == 0:
            new_user_type = classify_user_type(question)
            if new_user_type != 'unknown':
                user_type = new_user_type
                knowledge['user_type'] = user_type
                update_session_state(current_session_id, knowledge_update=knowledge, increment_count=False)

        user_preference = knowledge.get('format')
        current_topic = knowledge.get('topic')

        if not user_preference:
            user_preference = session_state.get('preference')

        update_session_state(current_session_id, increment_count=True)

        # 3. ROUTING: Use classification LLM to decide Mode A or Mode B
        activated_mode = classify_intent(question)
        last_mode = knowledge.get('last_mode')

        # Short replies to a pending clarification stay in the previous mode.
        if session_state.get('clarification_count', 0) > 0 and last_mode:
            if len(question.split()) < 5 or any(k in question.lower() for k in ['yes', 'no', 'sure', 'not sure', 'dont know']):
                activated_mode = last_mode

        # Store mode for next turn's potentially sticky logic
        knowledge['last_mode'] = activated_mode
        print(f"DEBUG: [{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Activated Mode for session {current_session_id}: {activated_mode}")
        update_session_state(current_session_id, knowledge_update=knowledge, increment_count=False)

        # 4. SEMANTIC SEARCH: Create embedding of user question
        user_embedding = get_embedding(question)

        # Check FAQ embeddings first
        faq_data = fetch_all_faq_embeddings()
        top_faqs = []

        for entry_id, question_text, answer_text, emb in faq_data:
            score = cosine_similarity(user_embedding, emb)
            top_faqs.append((score, entry_id, question_text, answer_text))
        top_faqs.sort(reverse=True)

        faq_threshold = 0.85
        ambiguous_threshold = 0.65

        is_low_confidence = False  # Default safe initialization
        context_results = None

        if top_faqs and top_faqs[0][0] >= faq_threshold:
            best_score, faq_id, question_text, answer_text = top_faqs[0]
            print("DEBUG: Processing FAQ match through LLM and Truth Sheet rules...")
            context_results = answer_text

        elif activated_mode == "Mode A":
            # Mode A: Any score < 0.85 triggers Clarification -> Email
            clarification_count = session_state.get('clarification_count', 0)
            if clarification_count == 0:
                update_session_state(current_session_id, increment_clarification=True, increment_count=False)
                return "I want to make sure I give you the best advice. Are you looking for classes in [Atlanta](https://www.getscenestudios.com/instudio), [Online](https://www.getscenestudios.com/online), or something else like getting an agent? You can also start right now with our [Free Online Class](https://www.getscenestudios.com/online)!"
            else:
                update_session_state(current_session_id, reset_clarification=True)
                return "I'm still not quite sure, and I want to make sure you get the right answer! Please email our team at info@getscenestudios.com and we'll help you directly. In the meantime, you can explore or [register for our Online Path](https://www.getscenestudios.com/online) or [In-Studio classes in Atlanta](https://www.getscenestudios.com/instudio)."

        elif top_faqs and top_faqs[0][0] >= ambiguous_threshold:
            # Mode B: Ambiguous Score (0.65 - 0.85) -> Ask "Did you mean?"
            update_session_state(current_session_id, increment_clarification=True, increment_count=False)
            best_match_q = top_faqs[0][2]
            return f"Did you mean: {best_match_q}?"

        else:
            # 5. HALLUCINATION GUARD: Check if query is acting-related before blocking
            categories = detect_question_category(question)

            has_session_context = (current_topic is not None) or (user_preference is not None)

            FOLLOWUP_KEYWORDS = ['yes', 'no', 'sure', 'okay', 'thanks', 'thank you', 'please', 'go ahead', 'continue', 'more']
            ACTING_KEYWORDS = ['class', 'workshop', 'coaching', 'studio', 'acting', 'online', 'person', 'atlanta', 'training', 'prefer', 'preference', 'format', 'recommendation', 'online class', 'online workshop','instudio class','instudio workshop', 'actor', 'scene', 'audition', 'theatre', 'film', 'tv', 'commercial', 'agent', 'rep', 'manager', 'instructor', 'role', 'auditing', 'audit', 'representation', 'summit', 'sign up', 'sign-up', 'register', 'enroll', 'schedule', 'cancel', 'reschedule', 'how do i']

            is_acting_related = (
                len(categories) > 0 or
                detect_response_type(question) == "support" or
                any(re.search(rf'\b{re.escape(k)}\b', question.lower()) for k in ACTION_KEYWORDS) or
                any(re.search(rf'\b{re.escape(k)}\b', question.lower()) for k in DETAIL_SYNONYMS) or
                any(re.search(rf'\b{re.escape(k)}\b', question.lower()) for k in ACTING_KEYWORDS) or
                (has_session_context and any(re.search(rf'\b{re.escape(k)}\b', question.lower().strip('.!')) for k in FOLLOWUP_KEYWORDS)) or
                (session_state.get('clarification_count', 0) > 0 and len(question.split()) < 5)  # Only allow short answers to bypass
            )

            if not is_acting_related:
                return "I'm not exactly sure about that. Could you clarify your question?"

            # Flag for Mode B Low Confidence
            is_low_confidence = (activated_mode == "Mode B" and not context_results)

        # 6. LLM PATH: No high-confidence FAQ match, or Mode B FAQ formatting
        update_session_state(current_session_id, reset_clarification=True, increment_count=False)

        # RAG: Fetch relevant workshops and podcasts
        podcast_data = fetch_all_embeddings("podcast_episodes")
        top_workshops = find_top_workshops(user_embedding, k=3)
        top_podcasts = find_top_k_matches(user_embedding, podcast_data, k=3)

        enriched_podcast_links = []
        for _, podcast_id, _ in top_podcasts:
            row = fetch_row_by_id("podcast_episodes", podcast_id)
            enriched_podcast_links.extend(generate_enriched_links(row))

        # Fall back to the first stored episode, but only if there is one:
        # indexing an empty table would abort the whole request.
        if not enriched_podcast_links and podcast_data:
            fallback = fetch_row_by_id("podcast_episodes", podcast_data[0][0])
            enriched_podcast_links = generate_enriched_links(fallback)

        # Brevity & Detail Detection
        wants_details = any(syn in question.lower() for syn in DETAIL_SYNONYMS)

        # Use enhanced prompt building
        final_prompt = build_enhanced_prompt(
            question,
            context_results,
            top_workshops,
            user_preference=user_preference,
            user_type=user_type,
            enriched_podcast_links=enriched_podcast_links,
            wants_details=wants_details,
            current_topic=current_topic,
            mode=activated_mode,
            is_low_confidence=is_low_confidence
        )

        # Invoke LLM
        print(f"DEBUG FINAL PROMPT:\n{final_prompt}\n--- END PROMPT ---")

        response = openai.chat.completions.create(
            model=GEN_MODEL,
            messages=[
                {"role": "system", "content": final_prompt},
                {"role": "user", "content": question}
            ]
        )

        answer_text = response.choices[0].message.content.strip()

        # 7. ROUTING QUESTION ENFORCEMENT (Python-level Fallback)
        routing_q = "Are you looking for online training or in-studio in Atlanta?"
        broad_triggers = ['start acting', 'beginner', 'new actor', 'kids class', 'workshops', 'training', 'classes']
        is_broad = any(t in question.lower() for t in broad_triggers)

        if is_broad and not user_preference:
            if not answer_text.lower().startswith(routing_q.lower()):
                # Remove a routing question buried mid-answer before prepending it.
                if routing_q.lower() in answer_text.lower():
                    answer_text = re.sub(rf'{re.escape(routing_q)}[?!.]*', '', answer_text, flags=re.IGNORECASE).strip()

                answer_text = f"{routing_q} {answer_text}"

        # Detect if routing question was asked
        routing_q_asked = routing_q if (is_broad and not user_preference and routing_q in answer_text) else None

        # Detect if links were provided
        has_links = bool(re.search(r'\[.*?\]\(http', answer_text))

        # Log question with comprehensive metadata
        log_question(
            question=question,
            session_id=current_session_id,
            category="llm_generated",
            answer=answer_text,
            detected_mode=activated_mode,
            routing_question=routing_q_asked,
            rule_triggered=None,
            link_provided=has_links
        )

        return answer_text

    except Exception as e:
        # Top-level boundary: report the failure and return a friendly message.
        import traceback
        print(f"❌ CRITICAL ERROR in process_question: {e}")
        traceback.print_exc()
        return "I apologize, but I encountered an error processing your question. Please try again or email info@getscenestudios.com for assistance."
865
+
866
+ # ============================================================================
867
+ # GRADIO INTERFACE
868
+ # ============================================================================
869
+
870
def chat_with_bot(message, history):
    """
    Process message directly without Flask API

    Args:
        message: User's current message (may be None or blank)
        history: Chat history (list of message dictionaries); None is
            treated as an empty history

    Returns:
        Updated history with new exchange. Blank messages leave the history
        unchanged.
    """
    # NOTE(review): session_id is a module-level global, so every caller of
    # this function shares the same session - confirm that is intended.
    global session_id

    if history is None:
        history = []

    # Ignore missing or whitespace-only input instead of crashing on None.
    if not message or not message.strip():
        return history

    try:
        # Process question directly
        bot_reply = process_question(message, session_id)
    except Exception as e:
        bot_reply = f"❌ Error: {str(e)}"

    # Append to history in Gradio 6.0 format
    history.append({"role": "user", "content": message})
    history.append({"role": "assistant", "content": bot_reply})
    return history
896
+
897
def reset_session():
    """Begin a new conversation by replacing the global session ID.

    Returns:
        An empty list, used to clear the chat display.
    """
    global session_id
    fresh_id = uuid.uuid4()
    session_id = str(fresh_id)
    return []
902
+
903
+ # Create Gradio interface
904
with gr.Blocks(title="Get Scene Studios Chatbot") as demo:

    # Page header
    gr.Markdown(
        """
        # 🎬 Get Scene Studios AI Chatbot

        Ask questions about acting classes, workshops and more!
        """
    )

    # Conversation display
    chatbot = gr.Chatbot(label="Conversation", height=500)

    # Message entry row
    with gr.Row():
        msg = gr.Textbox(label="Your Message", lines=2, scale=4)
        submit_btn = gr.Button("Send 📤", scale=1, variant="primary")

    # Secondary actions
    with gr.Row():
        clear_btn = gr.Button("Clear Chat 🗑️", scale=1)
        reset_btn = gr.Button("New Session 🔄", scale=1)

    # Sending via the button or via Enter in the textbox does the same thing:
    # run the bot, then empty the textbox.
    for register_send in (submit_btn.click, msg.submit):
        register_send(
            fn=chat_with_bot,
            inputs=[msg, chatbot],
            outputs=[chatbot]
        ).then(
            fn=lambda: "",
            inputs=None,
            outputs=[msg]
        )

    # Clearing only empties the display; resetting also rotates the session ID.
    clear_btn.click(fn=lambda: [], inputs=None, outputs=[chatbot])
    reset_btn.click(fn=reset_session, inputs=None, outputs=[chatbot])
966
+
967
+ # Launch the app
968
if __name__ == "__main__":
    # Startup banner, then hand control to Gradio.
    rule = "=" * 60
    print("\n" + rule)
    print("🎬 Get Scene Studios Chatbot")
    print(rule)
    print("\n✅ No Flask API needed - all processing is done directly!")
    print("🌐 Gradio interface will open in your browser")
    print(rule + "\n")

    demo.launch()
config.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from datetime import timedelta
3
+ from dotenv import load_dotenv
4
+
5
+ load_dotenv()
6
+
7
# --- API keys -------------------------------------------------------------
# Taken from the process environment; None when the variable is not set.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# --- Database -------------------------------------------------------------
DB_PATH = "getscene_ai.sqlite"

# --- Model names ----------------------------------------------------------
EMBED_MODEL = "text-embedding-3-small"
GEN_MODEL = "gpt-4o"
FAST_MODEL = "gpt-4o-mini"

# --- Caching --------------------------------------------------------------
CACHE_DURATION = timedelta(hours=24)

# --- Keyword lists --------------------------------------------------------
EMOTIONAL_KEYWORDS = [
    'stuck', 'frustrated', 'discouraged', 'overwhelmed',
    'scared', 'nervous', 'anxious', 'worried',
    'fear', 'doubt', 'confidence', 'insecure',
    'lost', 'confused', 'struggling', 'hard time',
    'giving up', 'burnout', 'rejection', 'failed',
    "can't", 'feeling', 'feel', 'emotional',
    'depressed', 'sad', 'unmotivated', 'hopeless',
    'stressed', 'pressure', 'imposter',
]

ACTION_KEYWORDS = [
    'get an agent', 'find agent', 'need agent', 'want agent',
    'sign with agent', 'more auditions', 'book', 'booking',
    'callbacks', 'improve', 'better', 'self-tape',
    'materials', 'headshots', 'reel', 'network',
    'connections', 'industry', 'career', 'strategy',
    'agent prep', 'total agent prep', 'workshop', 'class',
    'training', 'results', 'success', 'grow',
    'advance', 'level up',
]

POLICY_KEYWORDS = [
    'refund', 'refunds', 'money back',
    'attend', 'attendance', 'miss', 'missed', 'missing', 'absent',
    'late', 'lateness', 'tardy',
    'reschedule', 'change date', 'move class',
    'credit', 'credits',
    'cancel', 'cancellation', 'canceling',
    'policy', 'policies',
]

EMAIL_ONLY_KEYWORDS = [
    'payment', 'pay', 'billing', 'charge',
    'refund', 'money back', 'attend', 'attendance',
    'miss', 'missed', 'late', 'reschedule',
    'account', 'login', 'password', 'sign in',
    'membership',
]

DETAIL_SYNONYMS = [
    'detail', 'details', 'explain',
    'elaborate', 'tell me more', 'more info',
    'describe', 'thorough', 'comprehensive',
]

# --- Prompt persona -------------------------------------------------------
# Tone and phrasing rules for the chatbot, kept as one text block.
PERSONA_INSTRUCTION = """
You are a warm, encouraging mentor at Get Scene Studios. Your goal is to help actors navigate their careers with confidence.
- User Context: The user is already on getscenestudios.com. Behave as if you are a guide right there with them.
- Negative Constraint: NEVER use the phrase "Visit the website" or "Check our site". Instead, use "You can see here..." or "Click this link below..." or similar language that implies current presence.
- Sound natural and human, not scripted or robotic. Use conversational transitions like "I'd suggest starting with..." or "A great way to approach this is..."
- Be encouraging but practical. Acknowledge that the acting journey is a marathon, not a sprint.
- Help the user THINK: Instead of just giving an answer, add a brief "mentorship flourish" that explains the value of a recommendation (e.g., "This workshop is great because it gets you comfortable with the pressure of a real callback.")
"""
database.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sqlite3
2
+ import json
3
+ from contextlib import contextmanager
4
+ from typing import List, Dict, Any, Tuple
5
+ from config import DB_PATH
6
+
7
@contextmanager
def get_db_connection():
    """Yield a SQLite connection to DB_PATH and always close it afterwards.

    Rows come back as sqlite3.Row. The connection is closed on exit without
    committing, so callers must commit their own writes.
    """
    connection = sqlite3.connect(DB_PATH)
    connection.row_factory = sqlite3.Row
    try:
        yield connection
    finally:
        connection.close()
16
+
17
def fetch_all_embeddings(table: str) -> List[Tuple[int, str, List[float]]]:
    """Load (id, full_text, embedding) triples from the given table.

    Rows whose embedding column is not valid JSON text are skipped.

    Args:
        table: Table name, interpolated directly into the SQL statement.
    """
    with get_db_connection() as conn:
        rows = conn.execute(f"SELECT id, full_text, embedding FROM {table}").fetchall()

    results = []
    for record in rows:
        try:
            vector = json.loads(record['embedding'])
        except (json.JSONDecodeError, TypeError):
            # Missing or malformed embedding: leave this row out.
            continue
        results.append((record['id'], record['full_text'], vector))
    return results
31
+
32
def fetch_row_by_id(table: str, row_id: int) -> Dict[str, Any]:
    """Return the row with the given id as a plain dict, or {} if absent.

    Args:
        table: Table name, interpolated directly into the SQL statement.
        row_id: Value matched against the table's id column.
    """
    with get_db_connection() as conn:
        record = conn.execute(f"SELECT * FROM {table} WHERE id = ?", (row_id,)).fetchone()
        if not record:
            return {}
        return dict(record)
39
+
40
def fetch_all_faq_embeddings() -> List[Tuple[int, str, str, List[float]]]:
    """Load (id, question, answer, embedding) tuples from faq_entries.

    Rows whose embedding column is not valid JSON text are skipped.
    """
    with get_db_connection() as conn:
        rows = conn.execute("SELECT id, question, answer, embedding FROM faq_entries").fetchall()

    results = []
    for record in rows:
        try:
            vector = json.loads(record['embedding'])
        except (json.JSONDecodeError, TypeError):
            # Missing or malformed embedding: leave this FAQ out.
            continue
        results.append((record['id'], record['question'], record['answer'], vector))
    return results
54
+
55
def log_question(
    question: str,
    session_id: str = None,
    category: str = None,
    answer: str = None,
    detected_mode: str = None,
    routing_question: str = None,
    rule_triggered: str = None,
    link_provided: bool = False
):
    """Record one question/answer exchange in the question_logs table.

    If the full insert fails with sqlite3.OperationalError, only the
    question text is stored instead.

    Args:
        question: The user's question
        session_id: Session identifier
        category: Question category (e.g., 'faq_match', 'llm_generated', 'policy_violation')
        answer: The bot's response
        detected_mode: Operating mode ('Mode A' or 'Mode B')
        routing_question: The routing question asked (if any)
        rule_triggered: Business rule that was triggered (e.g., 'audit_rule', 'free_class_first')
        link_provided: Whether a direct link was included in the response
    """
    full_insert = """
        INSERT INTO question_logs (
            session_id, question, category, answer,
            detected_mode, routing_question, rule_triggered, link_provided
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    """
    # The boolean flag is stored as an integer 0/1.
    values = (
        session_id, question, category, answer,
        detected_mode, routing_question, rule_triggered,
        1 if link_provided else 0,
    )

    with get_db_connection() as conn:
        cursor = conn.cursor()
        try:
            cursor.execute(full_insert, values)
        except sqlite3.OperationalError as e:
            # Fallback for older schema versions (shouldn't happen after migration)
            print(f"⚠️ Logging error: {e}. Falling back to basic logging.")
            cursor.execute("INSERT INTO question_logs (question) VALUES (?)", (question,))

        conn.commit()
98
+
99
def get_session_state(session_id: str) -> Dict[str, Any]:
    """Return the stored user_sessions row for a session as a dict.

    When no row exists, a default state is returned: no preference, zero
    counters and an empty JSON knowledge context.
    """
    with get_db_connection() as conn:
        record = conn.execute(
            "SELECT * FROM user_sessions WHERE session_id = ?", (session_id,)
        ).fetchone()
        if not record:
            return {"preference": None, "msg_count": 0, "clarification_count": 0, "knowledge_context": "{}"}
        return dict(record)
108
+
109
def update_session_state(session_id: str, preference: str = None, increment_count: bool = True, increment_clarification: bool = False, reset_clarification: bool = False, knowledge_update: Dict = None):
    """Update session state with Knowledge Dictionary support.

    Creates the user_sessions row if it does not exist, otherwise updates it.

    Args:
        session_id: Session to create or update.
        preference: New preference; a falsy value keeps the stored one.
        increment_count: Add one to the message count.
        increment_clarification: Add one to the clarification count
            (ignored when reset_clarification is set).
        reset_clarification: Set the clarification count back to zero.
        knowledge_update: Keys to merge into the stored knowledge dictionary.
    """
    with get_db_connection() as conn:
        cur = conn.cursor()

        # Check if exists
        cur.execute("SELECT preference, msg_count, clarification_count, knowledge_context FROM user_sessions WHERE session_id = ?", (session_id,))
        row = cur.fetchone()

        current_knowledge = {}
        if row:
            curr_pref, curr_count, curr_clarification, curr_knowledge_json = row
            # A NULL or malformed knowledge_context falls back to an empty dict.
            try:
                current_knowledge = json.loads(curr_knowledge_json)
            except (json.JSONDecodeError, TypeError):
                current_knowledge = {}
            # Valid JSON that is not an object (e.g. "null") cannot be merged into.
            if not isinstance(current_knowledge, dict):
                current_knowledge = {}

            new_pref = preference if preference else curr_pref
            new_count = curr_count + 1 if increment_count else curr_count

            # 10-Message Memory Rule: Reset if we hit the limit
            if new_count > 10:
                print(f"🔄 Session {session_id} reached 10 messages. Resetting memory context.")
                new_count = 1
                new_pref = None
                current_knowledge = {}
                new_clarification = 0
            else:
                new_clarification = curr_clarification
                if reset_clarification:
                    new_clarification = 0
                elif increment_clarification:
                    new_clarification = curr_clarification + 1

            # Merge knowledge updates
            if knowledge_update:
                current_knowledge.update(knowledge_update)

            new_knowledge_json = json.dumps(current_knowledge)

            cur.execute("""
                UPDATE user_sessions
                SET preference = ?, msg_count = ?, clarification_count = ?, knowledge_context = ?, last_updated = CURRENT_TIMESTAMP
                WHERE session_id = ?
            """, (new_pref, new_count, new_clarification, new_knowledge_json, session_id))
        else:
            new_pref = preference
            new_count = 1 if increment_count else 0
            new_clarification = 1 if increment_clarification else 0

            if knowledge_update:
                current_knowledge.update(knowledge_update)
            new_knowledge_json = json.dumps(current_knowledge)

            cur.execute("""
                INSERT INTO user_sessions (session_id, preference, msg_count, clarification_count, knowledge_context)
                VALUES (?, ?, ?, ?, ?)
            """, (session_id, new_pref, new_count, new_clarification, new_knowledge_json))

        conn.commit()
scraper.py ADDED
@@ -0,0 +1,347 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import json
3
+ import re
4
+ from bs4 import BeautifulSoup
5
+ from typing import List, Dict, Any, Tuple
6
+ from utils import clean_time
7
+
8
def scrape_workshops_from_squarespace(url: str) -> List[Dict[str, str]]:
    """
    Extract workshops using our robust Squarespace JSON + HTML parsing system

    The page is first requested with ``?format=json``; if that yields no
    workshops (non-200 status, undecodable body or empty result) the plain
    HTML page is fetched and parsed instead.

    Args:
        url: Page URL to scrape.

    Returns:
        List of workshop dicts; empty when nothing is found or on any error.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # First try the Squarespace JSON API
        json_url = f"{url}?format=json"
        print(f"🔍 Trying Squarespace JSON API: {json_url}")

        response = requests.get(json_url, headers=headers, timeout=10)
        if response.status_code == 200:
            try:
                json_data = response.json()
            except ValueError:
                # Every JSON decode error raised by response.json() is a
                # ValueError subclass, so catching it keeps the HTML fallback
                # reachable whichever JSON backend requests is using.
                print("❌ Invalid JSON response, falling back to HTML")
            else:
                workshops = extract_workshops_from_json(json_data, json_url)
                if workshops:
                    print(f"✅ Extracted {len(workshops)} workshops from JSON API")
                    return workshops
                print("❌ No workshops found in JSON, falling back to HTML")

        # Fallback to HTML scraping if JSON fails
        print(f"📄 Falling back to HTML scraping for {url}")
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        workshops = parse_workshops_from_html(soup, url)

        if workshops:
            print(f"✅ Extracted {len(workshops)} workshops from HTML parsing")
            return workshops

        print("❌ No workshops found in HTML")
        return []

    except Exception as e:
        # Best-effort scraper: report the failure and return no workshops.
        print(f"❌ Error scraping workshops from {url}: {e}")
        return []
52
+
53
def extract_workshops_from_json(data: Any, source_url: str) -> List[Dict[str, str]]:
    """Pull workshops out of a Squarespace JSON payload.

    Only the 'mainContent' HTML string of a dict payload is used; it is
    parsed with the same HTML workshop parser as regular pages. Any other
    payload shape yields an empty list.
    """
    if not isinstance(data, dict) or 'mainContent' not in data:
        return []

    main_content_html = data['mainContent']
    if not isinstance(main_content_html, str):
        return []

    print(f"🎯 Found mainContent HTML! Length: {len(main_content_html)} characters")

    parsed_html = BeautifulSoup(main_content_html, 'html.parser')
    return parse_workshops_from_html(parsed_html, source_url)
70
+
71
def parse_workshops_from_html(soup, source_url: str) -> List[Dict[str, str]]:
    """Collect unique workshops from parsed HTML.

    Two passes are made: first over container elements whose class looks
    workshop-like, then a regex scan over the page's full text. A workshop
    is only added if it is not a duplicate of one already collected.

    Args:
        soup: Parsed document exposing find_all() and get_text().
        source_url: URL the document came from, passed to the extractors.
    """
    found = []
    seen_texts = set()

    print("🔍 ENHANCED HTML PARSING:")

    # Method 1: Find individual workshop containers
    class_hint = re.compile(r'(item|card|product|workshop|class)', re.I)
    candidates = soup.find_all(['div', 'section', 'article'], attrs={'class': class_hint})

    print(f" Found {len(candidates)} potential workshop containers")

    hint_words = ['with', 'casting', 'director', 'agent', 'perfect submission', 'crush the callback', 'get scene']
    for element in candidates:
        block_text = element.get_text(strip=True)

        # Skip blocks that are too short or have already produced a workshop.
        if len(block_text) < 30 or block_text in seen_texts:
            continue

        lowered = block_text.lower()
        if not any(word in lowered for word in hint_words):
            continue

        candidate = extract_single_workshop_from_text(block_text, source_url)
        if candidate and not is_duplicate_workshop(candidate, found):
            found.append(candidate)
            seen_texts.add(block_text)

    # Method 2: Pattern-based extraction from full text
    page_text = soup.get_text()

    full_text_patterns = [
        # Pattern 1: "Workshop Title with Professional Title Name on Date @ Time"
        r'((?:The\s+)?(?:Perfect\s+Submission|Crush\s+the\s+Callback|Get\s+Scene\s+360?))\s+with\s+((?:Casting\s+Director|DDO\s+Agent|Manager|Director|Producer|Agent|Acting\s+Coach|Talent\s+Agent|Executive\s+Casting\s+Producer)\s+[A-Za-z\s]+?)\s+on\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',

        # Pattern 2: "Professional Title Name, Workshop Title on Date @ Time"
        r'((?:Atlanta\s+Models\s+&\s+Talent\s+President|Talent\s+Agent|Casting\s+Director|Manager|Director|Producer|Agent)\s+[A-Za-z\s]+?),\s+((?:The\s+)?(?:Perfect\s+Submission|Crush\s+the\s+Callback|Get\s+Scene\s+360?))\s+on\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',

        # Pattern 3: "Casting Director Name, Date @ Time"
        r'(Casting\s+Director)\s+([A-Za-z\s\-]+?),\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*(?:at\s+)?([0-9:]+\s*(?:AM|PM))?',
    ]

    # Patterns are numbered from 1 when handed to the match parser.
    for pattern_number, regex in enumerate(full_text_patterns, start=1):
        for groups in re.findall(regex, page_text, re.IGNORECASE):
            candidate = parse_refined_workshop_match(groups, pattern_number, source_url)
            if candidate and not is_duplicate_workshop(candidate, found):
                found.append(candidate)

    print(f"🎯 TOTAL UNIQUE WORKSHOPS FOUND: {len(found)}")
    return found
119
+
120
def extract_single_workshop_from_text(text: str, source_url: str) -> Dict[str, str]:
    """Try to parse one workshop out of a single block of text.

    The text is first stripped of prices, "Featured"/"Sold Out" badges and
    extra whitespace, then tested against a list of patterns in order. The
    first pattern that matches is handed to parse_pattern_match together
    with its index.

    Returns:
        The parsed workshop, or None when no pattern matches.
    """
    # Clean up the text
    cleaned = re.sub(r'\$[0-9,]+\.00', '', text)
    cleaned = re.sub(r'Featured|Sold Out', '', cleaned, flags=re.IGNORECASE)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    cleaned = re.sub(r'\n+', ' ', cleaned)

    layout_patterns = [
        # Pattern A: "Title with Professional Name on Date @ Time"
        r'((?:The\s+)?(?:Perfect\s+Submission|Crush\s+the\s+Callback|Get\s+Scene\s+360?))\s+with\s+((?:Casting\s+Director|CD|DDO\s+Agent|Manager|Director|Producer|Agent|Acting\s+Coach|Talent\s+Agent|Executive\s+Casting\s+Producer|Atlanta\s+Models\s+&\s+Talent\s+President)\s+[A-Za-z\s\-]+?)\s+on\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',

        # Pattern B: "Professional Name, Title on Date @ Time"
        r'((?:Atlanta\s+Models\s+&\s+Talent\s+President|Talent\s+Agent|Casting\s+Director|Casting\s+Associate|Manager|Director|Producer|Agent|Executive\s+Casting\s+Producer)\s+[A-Za-z\s\-]+?),\s+((?:The\s+)?(?:Perfect\s+Submission|Crush\s+the\s+Callback|Get\s+Scene\s+360?))\s+on\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',

        # Pattern C: "Casting Director Name, Date at Time"
        r'(Casting\s+Director|Casting\s+Associate)\s+([A-Za-z\s\-]+?),\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*(?:at\s+)?([0-9:]+\s*(?:AM|PM))?',

        # Pattern D: "Company Executive Producer Name on Date"
        r"([A-Za-z']+\s+(?:Executive\s+Casting\s+Producer|Studios\s+Casting\s+Associate))\s+([A-Za-z\s]+?)\s+(?:on\s+)?(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?",

        # Pattern E: "Company Agent Name Date" (fixed "on" issue)
        r'([A-Za-z\s]+)\s+(Agent|Talent)\s+([A-Za-z\s]+?)\s+(?:on\s+)?(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',

        # Pattern F: "Company, Person, Title on Date"
        r'([A-Za-z\s]+\s+Talent),\s+([A-Za-z\s\.]+?),\s+((?:The\s+)?(?:Perfect\s+Submission|Crush\s+the\s+Callback|Get\s+Scene\s+360?))\s+on\s+(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?',

        # Pattern G: Flexible fallback
        r'^([A-Za-z\s&\']{3,25}(?:Director|Agent|Manager|Producer|President|Coach))\s+([A-Za-z\s\-]{3,30}?)\s+(?:on\s+)?(\w+\s+\d+(?:st|nd|rd|th)?)\s*[@\s]*([0-9:]+\s*(?:AM|PM))?$'
    ]

    # First matching layout wins; its position tells the parser how to read the groups.
    for position, regex in enumerate(layout_patterns):
        hit = re.search(regex, cleaned, re.IGNORECASE)
        if hit is not None:
            return parse_pattern_match(hit, position, source_url)

    return None
158
+
159
def parse_pattern_match(match, pattern_index: int, source_url: str) -> Dict[str, str]:
    """Convert one workshop regex match into a normalized workshop record.

    Args:
        match: An ``re.Match`` object, or a tuple/list of captured groups
            (the shape ``re.findall`` produces).
        pattern_index: Zero-based index of the pattern that produced
            ``match``. 0-5 correspond to patterns A-F; any other value is
            treated as the flexible fallback pattern G.
        source_url: URL stored unchanged in the returned record.

    Returns:
        A dict with the keys ``title``, ``instructor_name``,
        ``instructor_title``, ``date``, ``time``, ``full_text`` and
        ``source_url``. ``None`` is returned when no instructor name or date
        was captured, when the fallback pattern captured an implausible
        professional string, or when parsing raised an exception.
    """

    def get_grp(m, idx):
        """Return 1-based group ``idx`` from a match object or a tuple, or ""."""
        val = ""
        if hasattr(m, 'group'):
            try:
                val = m.group(idx)
            except IndexError:
                val = ""
        # Tuples from findall are 0-indexed, regex group numbers are 1-based.
        elif isinstance(m, (tuple, list)):
            if 0 <= idx - 1 < len(m):
                val = m[idx - 1]

        # Optional groups that did not participate come back as None.
        return val if val is not None else ""

    def grp(idx):
        """Return group ``idx`` of ``match`` with surrounding whitespace removed."""
        return get_grp(match, idx).strip()

    workshop_title = ""
    instructor_title = ""
    instructor_name = ""
    date_str = ""
    time_str = ""

    try:
        if pattern_index == 0:  # Pattern A: "<Workshop> with <Title Name> on <Date>"
            workshop_title = grp(1)
            professional_full = grp(2)
            date_str = grp(3)
            time_str = grp(4)

            # Patterns are matched with re.IGNORECASE, so the abbreviation may
            # arrive as "cd"/"Cd" as well as "CD".
            professional_full = re.sub(
                r'^CD\s+', 'Casting Director ', professional_full, flags=re.IGNORECASE
            )

            instructor_title, instructor_name = parse_professional_info(professional_full)

        elif pattern_index == 1:  # Pattern B: "<Title Name>, <Workshop> on <Date>"
            professional_full = grp(1)
            workshop_title = grp(2)
            date_str = grp(3)
            time_str = grp(4)

            instructor_title, instructor_name = parse_professional_info(professional_full)

        elif pattern_index == 2:  # Pattern C: "<Casting title> <Name>, <Date>"
            instructor_title = grp(1)
            instructor_name = grp(2)
            date_str = grp(3)
            time_str = grp(4)
            workshop_title = "Casting Workshop"

        elif pattern_index == 3:  # Pattern D: "<Company title> <Name> on <Date>"
            instructor_title = grp(1)
            instructor_name = grp(2)
            date_str = grp(3)
            time_str = grp(4)
            workshop_title = "Industry Workshop"

        elif pattern_index == 4:  # Pattern E: "<Company> <Agent|Talent> <Name> <Date>"
            company_name = grp(1)
            agent_type = grp(2)
            instructor_name = grp(3)
            date_str = grp(4)
            time_str = grp(5)

            instructor_title = f"{company_name} {agent_type}"
            workshop_title = "Industry Workshop"

        elif pattern_index == 5:  # Pattern F: "<Company>, <Person>, <Workshop> on <Date>"
            instructor_title = grp(1)
            instructor_name = grp(2)
            workshop_title = grp(3)
            date_str = grp(4)
            time_str = grp(5)

        else:  # Pattern G: flexible fallback
            professional_full = grp(1) + " " + grp(2)
            date_str = grp(3)
            time_str = grp(4)
            workshop_title = "Industry Workshop"

            # The fallback is permissive; reject captures that are too long or
            # span several lines, as they are unlikely to be "title + name".
            if len(professional_full) > 50 or '\n' in professional_full:
                return None

            instructor_title, instructor_name = parse_professional_info(professional_full)

        if instructor_name and date_str:
            cleaned_time = clean_time(time_str)

            # full_text is the string the existing Flask API embeds.
            full_text = f"{workshop_title} with {instructor_title} {instructor_name} on {date_str}"
            if time_str:
                full_text += f" at {cleaned_time}"

            return {
                'title': workshop_title,
                'instructor_name': instructor_name,
                'instructor_title': instructor_title,
                'date': date_str,
                'time': cleaned_time,
                'full_text': full_text,
                'source_url': source_url
            }

    except Exception as e:
        # Best effort: one malformed match must not abort the whole scrape.
        print(f"Error parsing pattern match: {e}")

    return None
270
+
271
def parse_professional_info(professional_full: str) -> tuple:
    """Split a "title + name" string into ``(title, name)``.

    Matching is case-insensitive, because callers capture the text with
    ``re.IGNORECASE``; a recognized title is returned in canonical casing.
    Known multi-word titles are tried first, then single-word titles
    (optionally preceded by a qualifier such as "Executive"). Whatever
    remains once the title is removed becomes the name, so text on either
    side of the title is kept. If no title is recognized, the first word is
    used as the title and the rest as the name.

    Args:
        professional_full: Text such as "Casting Director Jane Doe" or
            "Jane Doe, Casting Director".

    Returns:
        A ``(title, name)`` tuple of strings; ``title`` is "" when the input
        has fewer than two words and no recognizable title.
    """
    professional_full = re.sub(r'\s+', ' ', professional_full).strip()

    # Longest / most specific titles first so they win over shorter ones.
    specific_titles = [
        'Atlanta Models & Talent President',
        'Executive Casting Producer',
        'Casting Director',
        'Casting Associate',
        'DDO Agent',
        'Talent Agent',
        'Acting Coach'
    ]

    for title in specific_titles:
        found = re.search(re.escape(title), professional_full, re.IGNORECASE)
        if not found:
            continue
        before = professional_full[:found.start()].strip(' ,')
        after = professional_full[found.end():].strip(' ,')
        # Keep both sides: discarding the trailing text would lose the person's
        # name in "Company Title Name" inputs.
        return title, ' '.join(part for part in (before, after) if part)

    # Fallback for single-word titles (lower-case key -> canonical spelling).
    single_word_titles = {
        t.lower(): t
        for t in ['Manager', 'Director', 'Producer', 'Agent', 'Coach', 'President']
    }
    qualifiers = {
        q.lower(): q
        for q in ['Casting', 'Talent', 'Executive', 'DDO', 'Acting']
    }

    words = professional_full.split()
    for i, word in enumerate(words):
        canonical = single_word_titles.get(word.strip(',').lower())
        if canonical is None:
            continue

        qualifier = qualifiers.get(words[i - 1].strip(',').lower()) if i > 0 else None
        if qualifier is not None:
            title = f"{qualifier} {canonical}"
            name_parts = words[:i - 1] + words[i + 1:]
        else:
            title = canonical
            name_parts = words[:i] + words[i + 1:]

        return title, ' '.join(name_parts).strip(' ,')

    # Final fallback: treat the first word as the title.
    if len(words) >= 2:
        return words[0], ' '.join(words[1:])

    return '', professional_full
319
+
320
def parse_refined_workshop_match(match, pattern_num: int, source_url: str) -> Dict[str, str]:
    """Parse a match identified by a 1-based pattern number.

    Thin adapter over ``parse_pattern_match``, which expects a 0-based
    pattern index; the result of that call is returned unchanged.
    """
    zero_based_index = pattern_num - 1
    return parse_pattern_match(match, zero_based_index, source_url)
323
+
324
def is_duplicate_workshop(new_workshop: Dict, existing_workshops: List[Dict]) -> bool:
    """Return True if ``new_workshop`` duplicates an entry in ``existing_workshops``.

    Two workshops are duplicates when instructor name and date are equal
    (ignoring case and surrounding whitespace) and their titles are
    compatible: equal, both containing the word "workshop", or one contained
    in the other. Because of the containment rule an empty title is
    compatible with any title.

    Missing keys and ``None`` values are treated as empty strings.

    Args:
        new_workshop: Candidate workshop record.
        existing_workshops: Records already collected.

    Returns:
        True when a matching record exists, otherwise False.
    """

    def normalized(workshop, key):
        # "or ''" covers keys that are present but hold None.
        return str(workshop.get(key) or '').strip().lower()

    # The candidate does not change inside the loop, so normalize it once.
    new_name = normalized(new_workshop, 'instructor_name')
    new_date = normalized(new_workshop, 'date')
    new_title = normalized(new_workshop, 'title')

    for existing in existing_workshops:
        if normalized(existing, 'instructor_name') != new_name:
            continue
        if normalized(existing, 'date') != new_date:
            continue

        existing_title = normalized(existing, 'title')
        if (existing_title == new_title
                or ('workshop' in existing_title and 'workshop' in new_title)
                or existing_title in new_title
                or new_title in existing_title):
            return True
    return False
338
+
339
def calculate_workshop_confidence(w: Dict, weights: Dict[str, float] = None) -> float:
    """Score how complete a scraped workshop record is.

    Each field that is present and truthy in ``w`` contributes its weight to
    the score.

    Args:
        w: Workshop record.
        weights: Optional mapping of field name to weight. When omitted, the
            default weights are used: title 0.3, instructor_name 0.3,
            date 0.2, time 0.1, source_url 0.1.

    Returns:
        The sum of the weights of the populated fields, rounded to two
        decimals (1.0 for a fully populated record with default weights).
    """
    if weights is None:
        weights = {
            'title': 0.3,
            'instructor_name': 0.3,
            'date': 0.2,
            'time': 0.1,
            'source_url': 0.1,
        }

    score = 0.0
    for field, weight in weights.items():
        if w.get(field):
            score += weight
    return round(score, 2)
utils.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openai
2
+ import numpy as np
3
+ import re
4
+ from typing import List, Tuple
5
+ from config import EMBED_MODEL
6
+
7
def get_embedding(text: str) -> List[float]:
    """Return the embedding vector for ``text``.

    Newlines are replaced with spaces and surrounding whitespace is removed
    before the text is sent to the OpenAI embeddings endpoint using the
    model named by ``EMBED_MODEL``. The embedding of the first (only) input
    is returned.
    """
    single_line = text.replace("\n", " ").strip()
    result = openai.embeddings.create(input=[single_line], model=EMBED_MODEL)
    first_item = result.data[0]
    return first_item.embedding
12
+
13
def cosine_similarity(a: List[float], b: List[float]) -> float:
    """Return the cosine similarity of two equal-length 1-D vectors.

    Args:
        a: First vector.
        b: Second vector.

    Returns:
        ``dot(a, b) / (|a| * |b|)`` as a built-in ``float``, or 0.0 when
        either vector has zero magnitude (the similarity is undefined there).
    """
    vec_a = np.asarray(a, dtype=float)
    vec_b = np.asarray(b, dtype=float)

    # Compute each norm once and reuse it for the guard and the division.
    norm_a = np.linalg.norm(vec_a)
    norm_b = np.linalg.norm(vec_b)
    if norm_a == 0 or norm_b == 0:
        return 0.0

    # Convert from numpy.float64 so the result matches the annotation.
    return float(np.dot(vec_a, vec_b) / (norm_a * norm_b))
20
+
21
def clean_time(time_str: str) -> str:
    """Normalize a clock time to ``H:MM AM`` / ``H:MM PM``.

    Accepted forms include "7pm", "7:30 pm", "10:15AM", "730 PM" and
    "7:5 PM". Minutes default to "00" and are zero-padded to two digits.

    Args:
        time_str: Raw time text; may be empty or ``None``.

    Returns:
        The normalized time, "" for empty input, or the stripped input
        unchanged when no AM/PM time is found in it.
    """
    if not time_str:
        return ""

    # Minutes are either ":<1-2 digits>" or exactly two digits with no colon.
    # Requiring two digits in the colon-less form lets "730 PM" split as 7 + 30
    # instead of 73 + 0.
    time_match = re.search(
        r'(\d{1,2})(?::(\d{1,2})|(\d{2}))?\s*(AM|PM)',
        time_str,
        re.IGNORECASE,
    )
    if time_match:
        hour = time_match.group(1)
        minute = (time_match.group(2) or time_match.group(3) or "00").zfill(2)
        ampm = time_match.group(4).upper()
        return f"{hour}:{minute} {ampm}"

    return time_str.strip()
34
+
35
def find_top_k_matches(user_embedding, dataset, k=3):
    """Return the ``k`` dataset entries most similar to ``user_embedding``.

    Args:
        user_embedding: Query embedding vector.
        dataset: Iterable of ``(entry_id, text, embedding)`` triples.
        k: Maximum number of results; values of 0 or less yield an empty list.

    Returns:
        A list of ``(score, entry_id, text)`` tuples ordered by descending
        cosine similarity. Entries with equal scores keep their dataset order.
    """
    if k <= 0:
        return []

    scored = [
        (cosine_similarity(user_embedding, emb), entry_id, text)
        for entry_id, text, emb in dataset
    ]
    # Sort on the score only: comparing whole tuples would fall through to
    # entry_id/text on ties and can raise TypeError for unorderable values.
    scored.sort(key=lambda item: item[0], reverse=True)
    return scored[:k]
43
+
44
def classify_intent(question: str) -> str:
    """Classify a user question as "Mode A" or "Mode B".

    "Mode A" is Recommendation Mode (workshops, dates, availability,
    recommendations); "Mode B" is Front Desk Mode and is the default for
    everything else. The decision is delegated to a chat-completion call;
    if that call fails for any reason, the error is printed and "Mode B"
    is returned.
    """
    prompt = f"""Classify the following user question into one of two modes:
1. "Mode A - Recommendation Mode": Use this if the user is asking about workshops, specific dates, what's available this month, asking for recommendations, or career goals (like getting an agent).
2. "Mode B - Front Desk Mode": Use this for broad introductory questions, kids classes, signing up, summit, instructor roles, auditing, online vs in-studio, general policies, or specific questions about existing classes.

User Question: "{question}"

Response must be exactly "Mode A" or "Mode B"."""

    request_messages = [{"role": "user", "content": prompt}]
    try:
        completion = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=request_messages,
            temperature=0,
            max_tokens=5
        )
        answer = completion.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error in intent classification: {e}")
        return "Mode B"  # Default to Front Desk Mode

    # Anything other than an explicit "Mode A" falls back to Front Desk Mode.
    return "Mode A" if "Mode A" in answer else "Mode B"
72
+
73
def should_include_email(question: str) -> bool:
    """Decide whether the contact email should be shown for ``question``.

    The email is allowed for topics such as payments, refunds, attendance
    issues and account problems. The question qualifies when it contains
    any entry of ``config.EMAIL_ONLY_KEYWORDS`` as a whole word or phrase,
    compared case-insensitively.

    Args:
        question: The user's question.

    Returns:
        True if at least one keyword occurs in the question, else False.
    """
    # Imported here, as in the original, rather than at module level.
    from config import EMAIL_ONLY_KEYWORDS

    for word in EMAIL_ONLY_KEYWORDS:
        if not word:
            # An empty keyword would reduce to r'\b\b' and match any text.
            continue
        pattern = rf'\b{re.escape(word)}\b'
        # IGNORECASE makes the match independent of how the keyword is
        # capitalized in the config, not only of the question's casing.
        if re.search(pattern, question, re.IGNORECASE):
            return True

    return False
88
+
89
def classify_user_type(question: str, history: List[dict] = None) -> str:
    """Classify the user as one of a fixed set of audience types.

    Possible results: ``new_actor``, ``experienced_actor``, ``parent``,
    ``current_student`` and ``unknown``.

    Args:
        question: The user's current question.
        history: Optional prior messages as dicts with ``role`` and
            ``content`` keys. Only the last three are used, each truncated to
            100 characters. Entries that are not dicts are skipped; a missing
            role is rendered as "unknown" and missing/``None`` content as
            empty text.

    Returns:
        The first valid type found in the model's reply, or "unknown" when
        the reply contains none of them or the request fails.
    """
    history_str = ""
    if history:
        # Built defensively: this runs before the try block, so a malformed
        # message must not raise.
        recent_lines = [
            f"{m.get('role', 'unknown')}: {str(m.get('content') or '')[:100]}..."
            for m in history[-3:]
            if isinstance(m, dict)
        ]
        history_str = "\nConversation context:\n" + "\n".join(recent_lines)

    prompt = f"""Classify the user into exactly one of these categories based on their question and context:
1. "new_actor": Just starting out, has no experience, or is asking how to begin.
2. "experienced_actor": Already has credits, mentions agents, looking for advanced workshops, or refers to their career progress.
3. "parent": Asking on behalf of their child, mentions "my kid", "my son", "my daughter", "teens".
4. "current_student": Refers to past/current classes at Get Scene, mentions a specific GSP membership, or asks about recurring student workshops.
5. "unknown": Not enough information yet.

User Question: "{question}"{history_str}

Response must be exactly one of: new_actor, experienced_actor, parent, current_student, unknown."""

    try:
        response = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=10
        )
        prediction = response.choices[0].message.content.strip().lower()
        valid_types = ["new_actor", "experienced_actor", "parent", "current_student", "unknown"]
        # Substring match tolerates quotes or punctuation around the label.
        for t in valid_types:
            if t in prediction:
                return t
        return "unknown"
    except Exception as e:
        print(f"Error in user type classification: {e}")
        return "unknown"