Spaces:
Runtime error
Runtime error
| """ | |
| Voice Analysis API for Salesforce | |
| ================================== | |
| Endpoints: | |
| /analyze - Full analysis (diarization + overlap + voice metrics) | |
| Returns JSON that Salesforce can parse. | |
| Models used: | |
| - pyannote/speaker-diarization-3.1 (who spoke when) | |
| - pyannote/overlapped-speech-detection (coaching detection) | |
| """ | |
| import gradio as gr | |
| import os | |
| import json | |
| import torch | |
| from pyannote.audio import Pipeline | |
| import numpy as np | |
| # ============================================================ | |
| # CONFIGURATION | |
| # ============================================================ | |
# Hugging Face access token, read from the environment once at import time.
# It is forwarded to the model loaders further down in this file.
HF_TOKEN = os.getenv("HF_TOKEN")

# A missing or empty token is only warned about here; loading is still attempted.
if not HF_TOKEN:
    print("WARNING: HF_TOKEN not set. Gated models will fail.")
| # ============================================================ | |
| # LOAD MODELS (runs once at startup) | |
| # ============================================================ | |
# Both pipelines are loaded once at import time. A failure is logged and the
# corresponding global is left as None so the analysis functions can report
# "model not loaded" instead of crashing.
#
# NOTE(review): pyannote's Pipeline.from_pretrained can return None (rather
# than raise) when a gated checkpoint cannot be downloaded; that case is
# turned into an explicit failure below so it is not logged as "loaded".
print("Loading diarization model...")
try:
    diarization_pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        use_auth_token=HF_TOKEN,
    )
    if diarization_pipeline is None:
        raise RuntimeError(
            "pipeline could not be downloaded (check HF_TOKEN and model access)"
        )
    print("β Diarization model loaded")
except Exception as e:
    print(f"β Diarization model failed: {e}")
    diarization_pipeline = None

print("Loading overlap detection model...")
try:
    overlap_pipeline = Pipeline.from_pretrained(
        "pyannote/overlapped-speech-detection",
        use_auth_token=HF_TOKEN,
    )
    if overlap_pipeline is None:
        raise RuntimeError(
            "pipeline could not be downloaded (check HF_TOKEN and model access)"
        )
    print("β Overlap detection model loaded")
except Exception as e:
    print(f"β Overlap detection failed: {e}")
    overlap_pipeline = None
| # ============================================================ | |
| # ANALYSIS FUNCTIONS | |
| # ============================================================ | |
def analyze_diarization(audio_path):
    """
    Identify the speakers in a recording and when each of them talks.

    Args:
        audio_path: Path of the audio file passed to the diarization pipeline.

    Returns:
        On success, a dict with:
            segments: list of {"speaker", "start", "end", "duration"} dicts in
                the order the pipeline yields them, values rounded to 2 decimals.
            speaker_count: number of distinct speaker labels.
            agent_speaker: label of the first segment, or None if there are
                no segments (assumes the agent speaks first).
            borrower_speaker: the first label, in order of appearance, that
                differs from the agent, or None if there is no such label.
            total_segments: number of segments.
        On failure (model not loaded or the pipeline raised), a dict with a
        single "error" key holding the message.
    """
    if diarization_pipeline is None:
        return {"error": "Diarization model not loaded"}
    try:
        diarization = diarization_pipeline(audio_path)
        segments = [
            {
                "speaker": speaker,
                "start": round(turn.start, 2),
                "end": round(turn.end, 2),
                "duration": round(turn.end - turn.start, 2),
            }
            for turn, _, speaker in diarization.itertracks(yield_label=True)
        ]

        # De-duplicate labels while keeping first-appearance order. A plain
        # set() has arbitrary order, which made the borrower choice
        # non-deterministic whenever more than two speakers were detected.
        speakers = list(dict.fromkeys(seg["speaker"] for seg in segments))

        # Identify borrower (assumes agent speaks first).
        agent_speaker = speakers[0] if speakers else None
        borrower_speaker = next(
            (label for label in speakers if label != agent_speaker),
            None,
        )

        return {
            "segments": segments,
            "speaker_count": len(speakers),
            "agent_speaker": agent_speaker,
            "borrower_speaker": borrower_speaker,
            "total_segments": len(segments),
        }
    except Exception as e:
        return {"error": str(e)}
def analyze_overlap(audio_path):
    """
    Find the stretches of the recording where speech overlaps.

    The result feeds coaching detection.

    Args:
        audio_path: Path of the audio file passed to the overlap pipeline.

    Returns:
        On success, a dict with "overlap_segments" (list of
        {"start", "end", "duration"} dicts, values rounded to 2 decimals),
        "overlap_count" and "total_overlap_duration" (sum of the rounded
        durations, itself rounded to 2 decimals).
        On failure, a dict with a single "error" key.
    """
    if overlap_pipeline is None:
        return {"error": "Overlap detection model not loaded"}

    try:
        annotation = overlap_pipeline(audio_path)

        # Track name and label are not needed; only the time span is kept.
        regions = [
            {
                "start": round(span.start, 2),
                "end": round(span.end, 2),
                "duration": round(span.end - span.start, 2),
            }
            for span, _track, _label in annotation.itertracks(yield_label=True)
        ]

        combined = sum(region["duration"] for region in regions)
        return {
            "overlap_segments": regions,
            "overlap_count": len(regions),
            "total_overlap_duration": round(combined, 2),
        }
    except Exception as e:
        return {"error": str(e)}
def detect_coaching(diarization_result, overlap_result):
    """
    Cross-reference overlapping speech with the borrower's segments.

    Overlap during the borrower's speech is treated as potential coaching.

    Args:
        diarization_result: Dict as returned by analyze_diarization; reads the
            "error", "borrower_speaker" and "segments" keys.
        overlap_result: Dict as returned by analyze_overlap; reads the
            "error" and "overlap_segments" keys.

    Returns:
        If either input carries an "error" key:
            {"coaching_detected": False, "error": "Could not analyze - model error"}
        If no borrower was identified:
            {"coaching_detected": False, "reason": "Could not identify borrower"}
        Otherwise a dict with "coaching_detected", "coaching_instances",
        "coaching_flags" (one entry per overlap/borrower-segment pair that
        intersects) and "borrower_segments_analyzed".
    """
    if "error" in diarization_result or "error" in overlap_result:
        return {
            "coaching_detected": False,
            "error": "Could not analyze - model error"
        }

    borrower_speaker = diarization_result.get("borrower_speaker")
    if not borrower_speaker:
        return {
            "coaching_detected": False,
            "reason": "Could not identify borrower"
        }

    borrower_segments = [
        seg for seg in diarization_result["segments"]
        if seg["speaker"] == borrower_speaker
    ]
    overlap_segments = overlap_result.get("overlap_segments", [])

    coaching_flags = []
    for overlap in overlap_segments:
        for borrower_seg in borrower_segments:
            # Two intervals intersect when each one starts no later than the
            # other ends. The previous check only looked at where the overlap
            # *started*, so an overlap that began just before the borrower's
            # turn and continued into it was missed. Bounds stay inclusive so
            # every pair that was flagged before is still flagged.
            if (overlap["start"] <= borrower_seg["end"] and
                    overlap["end"] >= borrower_seg["start"]):
                coaching_flags.append({
                    "overlap_time": f"{overlap['start']}-{overlap['end']}",
                    "during_borrower_segment": f"{borrower_seg['start']}-{borrower_seg['end']}",
                    "duration": overlap["duration"]
                })

    return {
        "coaching_detected": bool(coaching_flags),
        "coaching_instances": len(coaching_flags),
        "coaching_flags": coaching_flags,
        "borrower_segments_analyzed": len(borrower_segments)
    }
def analyze_voice_metrics(audio_path):
    """
    Compute a simple amplitude-based silence estimate for a recording.

    A sample counts as "silent" when its absolute amplitude is below 10% of
    the mean absolute amplitude of the whole file. Note that this measures the
    overall share of quiet samples; it does not measure the length of
    individual pauses.

    Args:
        audio_path: Path of the audio file; loaded with librosa at 16 kHz.

    Returns:
        A dict with "duration_seconds" (rounded to 2 decimals),
        "silence_ratio" (rounded to 3 decimals) and "has_long_pauses"
        (True when the unrounded ratio exceeds 0.3). All values are plain
        Python types so the dict can be passed to json.dumps. If anything
        fails (including librosa being unavailable), the same keys are
        returned zeroed/False together with an "error" message.
    """
    try:
        import librosa

        samples, sample_rate = librosa.load(audio_path, sr=16000)
        total_samples = len(samples)

        # Zero-length audio would otherwise produce NaN metrics (mean of an
        # empty array, 0/0), which is not valid JSON once serialized.
        if total_samples == 0:
            return {
                "duration_seconds": 0.0,
                "silence_ratio": 0.0,
                "has_long_pauses": False
            }

        duration = total_samples / sample_rate

        # Simple energy-based silence detection.
        energy = np.abs(samples)
        threshold = np.mean(energy) * 0.1

        # int() drops the NumPy scalar type so the comparison below yields a
        # built-in bool; numpy.bool_ is rejected by json.dumps.
        silent_samples = int(np.sum(energy < threshold))
        silence_ratio = silent_samples / total_samples

        return {
            "duration_seconds": round(float(duration), 2),
            "silence_ratio": round(silence_ratio, 3),
            "has_long_pauses": silence_ratio > 0.3
        }
    except Exception as e:
        return {"error": str(e), "duration_seconds": 0, "silence_ratio": 0, "has_long_pauses": False}
| # ============================================================ | |
| # MAIN ANALYSIS FUNCTION | |
| # ============================================================ | |
def _json_default(value):
    """Convert NumPy scalars and arrays to plain Python values for json.dumps."""
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(
        f"Object of type {type(value).__name__} is not JSON serializable"
    )


def full_analysis(audio_file):
    """
    Run the complete audio analysis; this is the function Gradio calls.

    Args:
        audio_file: Path of the uploaded audio file, or None when nothing
            was uploaded.

    Returns:
        A JSON string (indent=2). With no input it is
        {"error": "No audio file provided"}. Otherwise it has "status",
        "analysis" (keys "diarization", "overlap", "coaching",
        "voice_metrics") and "summary". If an analysis step raises, "status"
        becomes "error" and an "error" message is added; steps that report
        failure through their own "error" key do not change "status".
    """
    if audio_file is None:
        return json.dumps({"error": "No audio file provided"}, indent=2)

    results = {
        "status": "success",
        "analysis": {}
    }
    analysis = results["analysis"]

    try:
        print(f"Analyzing: {audio_file}")

        # 1. Diarization
        print("Running diarization...")
        diarization_result = analyze_diarization(audio_file)
        analysis["diarization"] = diarization_result

        # 2. Overlap detection
        print("Running overlap detection...")
        overlap_result = analyze_overlap(audio_file)
        analysis["overlap"] = overlap_result

        # 3. Coaching detection (cross-reference)
        print("Analyzing coaching...")
        coaching_result = detect_coaching(diarization_result, overlap_result)
        analysis["coaching"] = coaching_result

        # 4. Voice metrics
        print("Analyzing voice metrics...")
        voice_result = analyze_voice_metrics(audio_file)
        analysis["voice_metrics"] = voice_result

        # 5. Summary
        results["summary"] = {
            "speaker_count": diarization_result.get("speaker_count", 0),
            "coaching_detected": coaching_result.get("coaching_detected", False),
            "coaching_instances": coaching_result.get("coaching_instances", 0),
            "has_long_pauses": voice_result.get("has_long_pauses", False),
            "total_overlap_duration": overlap_result.get("total_overlap_duration", 0)
        }
        print("Analysis complete!")
    except Exception as e:
        results["status"] = "error"
        results["error"] = str(e)

    # Serialization happens outside the try block, so a NumPy scalar in any
    # sub-result (e.g. numpy.bool_ from a comparison) used to raise TypeError
    # here and fail the whole request. The default hook converts them.
    return json.dumps(results, indent=2, default=_json_default)
| # ============================================================ | |
| # GRADIO INTERFACE | |
| # ============================================================ | |
# Markdown shown under the title on the Gradio page.
_INTERFACE_DESCRIPTION = """
Upload a call recording to analyze:
- **Speaker Diarization**: Who spoke when
- **Coaching Detection**: Overlapping speech during borrower's responses
- **Voice Metrics**: Pause detection, silence ratio
Returns JSON that Salesforce can parse via Apex callout.
"""

# The upload widget hands full_analysis a file path; the result is rendered
# by a JSON component.
_audio_input = gr.Audio(type="filepath", label="Upload Audio (MP3, WAV, M4A)")
_json_output = gr.JSON(label="Analysis Results")

# NOTE(review): the module docstring advertises an "/analyze" endpoint, but no
# api_name is passed here, so Gradio's default endpoint name applies.
# Confirm which route the Salesforce callout actually uses.
demo = gr.Interface(
    fn=full_analysis,
    inputs=_audio_input,
    outputs=_json_output,
    title="ποΈ Voice Analysis API for Salesforce",
    description=_INTERFACE_DESCRIPTION,
    examples=[],
    allow_flagging="never",
)

# Launch with API enabled
demo.launch()