"""Gradio demo for Smart Turn end-of-utterance (EOU) detection.

Two tabs: run the classifier directly on typed text, or transcribe an
audio clip with Deepgram first and classify the transcript.
"""

import asyncio
import logging
import os
import time

import gradio as gr
from deepgram import DeepgramClient, PrerecordedOptions
from huggingface_hub import snapshot_download

from model_loader import engine

# --- Setup & Configuration ---
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("smart_turn_gradio")

MODEL_REPO_ID = "Rishi2455/smart-turn-model"
# Download the model snapshot once at import time so the first prediction
# doesn't pay the download cost.
local_model_path = snapshot_download(repo_id=MODEL_REPO_ID)

DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY")
# dg_client stays None when no API key is configured; predict_audio guards
# against that case instead of crashing.
dg_client = DeepgramClient(DEEPGRAM_API_KEY) if DEEPGRAM_API_KEY else None


async def load_model():
    """Lazily load the model weights on first use (idempotent)."""
    if not engine.is_loaded:
        await engine.load_model(local_model_path)


# --- Helper: Stylish Result Component with Latency ---
def format_result_html(is_complete, confidence, latency, extra_info=""):
    """Render a prediction as a styled HTML result card.

    Args:
        is_complete: True when the model judged the utterance complete.
        confidence: Model confidence in [0, 1]; rendered as a percentage.
        latency: Inference latency in milliseconds.
        extra_info: Optional extra line (e.g. the transcript) shown below.

    Returns:
        An HTML string for a gr.HTML component.
    """
    # NOTE(review): the original inline markup was garbled in extraction;
    # this reconstructs an equivalent card from the surviving fragments
    # (label / CONFIDENCE / percentage / latency / optional extra line).
    label = "COMPLETE ✅" if is_complete else "INCOMPLETE ⏳"
    color = "#10b981" if is_complete else "#f59e0b"
    bg_color = "rgba(16, 185, 129, 0.1)" if is_complete else "rgba(245, 158, 11, 0.1)"
    extra_block = (
        f'<div style="margin-top: 10px; font-size: 0.9em; color: #6b7280;">{extra_info}</div>'
        if extra_info
        else ""
    )
    return f"""
    <div style="border: 1px solid {color}; background: {bg_color};
                border-radius: 12px; padding: 20px; text-align: center;">
        <div style="font-size: 1.4em; font-weight: bold; color: {color};">{label}</div>
        <div style="margin-top: 8px; font-size: 0.8em; color: #6b7280;">CONFIDENCE</div>
        <div style="font-size: 1.2em; font-weight: 600;">{confidence:.2%}</div>
        <div style="margin-top: 8px; font-size: 0.85em; color: #6b7280;">Latency: {latency:.1f}ms</div>
        {extra_block}
    </div>
    """


# --- Prediction Logic ---
async def predict_text(text):
    """Classify typed text as a complete/incomplete utterance.

    Returns an HTML result card (or a prompt when the input is empty).
    """
    if not text:
        return "<p style='color: #6b7280;'>Please enter some text.</p>"
    await load_model()
    start_time = time.perf_counter()
    result = await engine.predict(text)
    latency = (time.perf_counter() - start_time) * 1000
    return format_result_html(result["is_complete"], result["confidence"], latency)


async def predict_audio(audio_path):
    """Transcribe an audio file with Deepgram, then classify the transcript.

    Returns an HTML result card, or a message when input/config is missing.
    """
    if not audio_path:
        return "<p style='color: #6b7280;'>Please record or upload audio.</p>"
    # FIX: the original dereferenced dg_client unconditionally, which raises
    # AttributeError whenever DEEPGRAM_API_KEY is unset.
    if dg_client is None:
        return (
            "<p style='color: #ef4444;'>Deepgram API key not configured — "
            "set the DEEPGRAM_API_KEY environment variable.</p>"
        )
    await load_model()

    # Transcription Step
    with open(audio_path, "rb") as audio:
        source = {"buffer": audio.read()}
    options = PrerecordedOptions(model="nova-2", smart_format=True)
    # The Deepgram SDK call is blocking network I/O; run it in a worker
    # thread so the asyncio event loop isn't stalled.
    response = await asyncio.to_thread(
        dg_client.listen.rest.v("1").transcribe_file, source, options
    )
    transcript = response.results.channels[0].alternatives[0].transcript
    if not transcript:
        return "<p style='color: #6b7280;'>No speech detected.</p>"

    # Model Inference Step
    start_time = time.perf_counter()
    result = await engine.predict(transcript)
    latency = (time.perf_counter() - start_time) * 1000
    return format_result_html(
        result["is_complete"],
        result["confidence"],
        latency,
        extra_info=f'Transcript: "{transcript}"',
    )


# --- UI Layout ---
with gr.Blocks(
    theme=gr.themes.Default(primary_hue="indigo"),
    css="#container {max-width: 800px; margin: auto; padding-top: 20px;}",
) as demo:
    with gr.Column(elem_id="container"):
        gr.Markdown("# 🤖 Smart Turn - EOU Detection")
        gr.Markdown(f"Classification of End-of-Utterance (EOU) using: {MODEL_REPO_ID}")
        with gr.Tabs():
            with gr.Tab("📝 Text Prediction"):
                text_input = gr.Textbox(
                    placeholder="Type a sentence...", label="Input Text", lines=3
                )
                text_btn = gr.Button("Analyze Text", variant="primary")
            with gr.Tab("🎙️ Audio Prediction"):
                audio_input = gr.Audio(type="filepath", label="Record/Upload Audio")
                audio_btn = gr.Button("Transcribe & Analyze", variant="primary")
        gr.Markdown("### 🔍 Model Result")
        output_html = gr.HTML(
            "<p style='color: #6b7280;'>Provide input and click Analyze</p>"
        )
        gr.Examples(
            examples=[["i want to"], ["i want to book a flight"], ["can you help me with"]],
            inputs=text_input,
        )

    # Event Handlers
    text_btn.click(predict_text, inputs=text_input, outputs=output_html)
    audio_btn.click(predict_audio, inputs=audio_input, outputs=output_html)

if __name__ == "__main__":
    demo.launch()