""" Voice Analysis API for Salesforce ================================== Endpoints: /analyze - Full analysis (diarization + overlap + voice metrics) Returns JSON that Salesforce can parse. Models used: - pyannote/speaker-diarization-3.1 (who spoke when) - pyannote/overlapped-speech-detection (coaching detection) """ import gradio as gr import os import json import torch from pyannote.audio import Pipeline import numpy as np # ============================================================ # CONFIGURATION # ============================================================ HF_TOKEN = os.environ.get("HF_TOKEN") if not HF_TOKEN: print("WARNING: HF_TOKEN not set. Gated models will fail.") # ============================================================ # LOAD MODELS (runs once at startup) # ============================================================ print("Loading diarization model...") try: diarization_pipeline = Pipeline.from_pretrained( "pyannote/speaker-diarization-3.1", use_auth_token=HF_TOKEN ) print("✅ Diarization model loaded") except Exception as e: print(f"❌ Diarization model failed: {e}") diarization_pipeline = None print("Loading overlap detection model...") try: overlap_pipeline = Pipeline.from_pretrained( "pyannote/overlapped-speech-detection", use_auth_token=HF_TOKEN ) print("✅ Overlap detection model loaded") except Exception as e: print(f"❌ Overlap detection failed: {e}") overlap_pipeline = None # ============================================================ # ANALYSIS FUNCTIONS # ============================================================ def analyze_diarization(audio_path): """ Identifies different speakers and their timestamps. Returns list of segments with speaker labels. """ if diarization_pipeline is None: return {"error": "Diarization model not loaded"} try: diarization = diarization_pipeline(audio_path) segments = [] for turn, _, speaker in diarization.itertracks(yield_label=True): segments.append({ "speaker": speaker, "start": round(turn.start, 2), "end": round(turn.end, 2), "duration": round(turn.end - turn.start, 2) }) # Identify borrower (assumes agent speaks first) speakers = list(set([s["speaker"] for s in segments])) agent_speaker = segments[0]["speaker"] if segments else None borrower_speaker = None for s in speakers: if s != agent_speaker: borrower_speaker = s break return { "segments": segments, "speaker_count": len(speakers), "agent_speaker": agent_speaker, "borrower_speaker": borrower_speaker, "total_segments": len(segments) } except Exception as e: return {"error": str(e)} def analyze_overlap(audio_path): """ Detects overlapping speech (multiple people talking at once). Used for coaching detection. """ if overlap_pipeline is None: return {"error": "Overlap detection model not loaded"} try: overlap = overlap_pipeline(audio_path) overlap_segments = [] for segment, _, label in overlap.itertracks(yield_label=True): overlap_segments.append({ "start": round(segment.start, 2), "end": round(segment.end, 2), "duration": round(segment.end - segment.start, 2) }) total_overlap_duration = sum([s["duration"] for s in overlap_segments]) return { "overlap_segments": overlap_segments, "overlap_count": len(overlap_segments), "total_overlap_duration": round(total_overlap_duration, 2) } except Exception as e: return {"error": str(e)} def detect_coaching(diarization_result, overlap_result): """ Cross-references overlap with borrower segments. Overlap during borrower's speech = potential coaching. """ coaching_flags = [] if "error" in diarization_result or "error" in overlap_result: return { "coaching_detected": False, "error": "Could not analyze - model error" } borrower_speaker = diarization_result.get("borrower_speaker") if not borrower_speaker: return { "coaching_detected": False, "reason": "Could not identify borrower" } # Get borrower segments borrower_segments = [ s for s in diarization_result["segments"] if s["speaker"] == borrower_speaker ] # Get overlap segments overlap_segments = overlap_result.get("overlap_segments", []) # Check if any overlap falls within borrower's speaking time for overlap in overlap_segments: for borrower_seg in borrower_segments: # Check if overlap is during borrower's speech if (overlap["start"] >= borrower_seg["start"] and overlap["start"] <= borrower_seg["end"]): coaching_flags.append({ "overlap_time": f"{overlap['start']}-{overlap['end']}", "during_borrower_segment": f"{borrower_seg['start']}-{borrower_seg['end']}", "duration": overlap["duration"] }) return { "coaching_detected": len(coaching_flags) > 0, "coaching_instances": len(coaching_flags), "coaching_flags": coaching_flags, "borrower_segments_analyzed": len(borrower_segments) } def analyze_voice_metrics(audio_path): """ Basic voice analysis - pause detection, speaking rate. For hesitation indicators. """ try: import librosa # Load audio y, sr = librosa.load(audio_path, sr=16000) duration = len(y) / sr # Simple energy-based silence detection energy = np.abs(y) threshold = np.mean(energy) * 0.1 silence_samples = np.sum(energy < threshold) silence_ratio = silence_samples / len(y) return { "duration_seconds": round(duration, 2), "silence_ratio": round(silence_ratio, 3), "has_long_pauses": silence_ratio > 0.3 } except Exception as e: return {"error": str(e), "duration_seconds": 0, "silence_ratio": 0, "has_long_pauses": False} # ============================================================ # MAIN ANALYSIS FUNCTION # ============================================================ def full_analysis(audio_file): """ Complete audio analysis - called by Gradio/API. Returns JSON with all results. """ if audio_file is None: return json.dumps({"error": "No audio file provided"}, indent=2) results = { "status": "success", "analysis": {} } try: # Run all analyses print(f"Analyzing: {audio_file}") # 1. Diarization print("Running diarization...") diarization_result = analyze_diarization(audio_file) results["analysis"]["diarization"] = diarization_result # 2. Overlap detection print("Running overlap detection...") overlap_result = analyze_overlap(audio_file) results["analysis"]["overlap"] = overlap_result # 3. Coaching detection (cross-reference) print("Analyzing coaching...") coaching_result = detect_coaching(diarization_result, overlap_result) results["analysis"]["coaching"] = coaching_result # 4. Voice metrics print("Analyzing voice metrics...") voice_result = analyze_voice_metrics(audio_file) results["analysis"]["voice_metrics"] = voice_result # 5. Summary results["summary"] = { "speaker_count": diarization_result.get("speaker_count", 0), "coaching_detected": coaching_result.get("coaching_detected", False), "coaching_instances": coaching_result.get("coaching_instances", 0), "has_long_pauses": voice_result.get("has_long_pauses", False), "total_overlap_duration": overlap_result.get("total_overlap_duration", 0) } print("Analysis complete!") except Exception as e: results["status"] = "error" results["error"] = str(e) return json.dumps(results, indent=2) # ============================================================ # GRADIO INTERFACE # ============================================================ demo = gr.Interface( fn=full_analysis, inputs=gr.Audio(type="filepath", label="Upload Audio (MP3, WAV, M4A)"), outputs=gr.JSON(label="Analysis Results"), title="🎙️ Voice Analysis API for Salesforce", description=""" Upload a call recording to analyze: - **Speaker Diarization**: Who spoke when - **Coaching Detection**: Overlapping speech during borrower's responses - **Voice Metrics**: Pause detection, silence ratio Returns JSON that Salesforce can parse via Apex callout. """, examples=[], allow_flagging="never" ) # Launch with API enabled demo.launch()