import gradio as gr
import os
import asyncio
import logging
import time
from model_loader import engine
from deepgram import DeepgramClient, PrerecordedOptions
from huggingface_hub import snapshot_download
# --- Setup & Configuration ---
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("smart_turn_gradio")
# Hugging Face Hub repo holding the smart-turn model weights.
MODEL_REPO_ID = "Rishi2455/smart-turn-model"
# Download (or reuse the locally cached) model snapshot at import time.
local_model_path = snapshot_download(repo_id=MODEL_REPO_ID)
# Deepgram is optional: dg_client stays None when no API key is configured,
# so callers must check for None before using it.
DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY")
dg_client = DeepgramClient(DEEPGRAM_API_KEY) if DEEPGRAM_API_KEY else None
async def load_model():
    """Lazily load the turn-detection model on first use.

    Idempotent: subsequent calls return immediately once the engine
    reports it is loaded.
    """
    if engine.is_loaded:
        return
    await engine.load_model(local_model_path)
# --- Helper: Stylish Result Component with Latency ---
def format_result_html(is_complete, confidence, latency, extra_info=""):
    """Render a prediction result as a styled HTML card.

    Args:
        is_complete: Whether the utterance was classified as complete.
        confidence: Model confidence in [0, 1]; shown as a percentage.
        latency: Inference latency in milliseconds.
        extra_info: Optional extra line (e.g. the transcript) shown below
            the latency readout.

    Returns:
        An HTML string suitable for a ``gr.HTML`` component.
    """
    label = "COMPLETE ✅" if is_complete else "INCOMPLETE ⏳"
    color = "#10b981" if is_complete else "#f59e0b"
    bg_color = "rgba(16, 185, 129, 0.1)" if is_complete else "rgba(245, 158, 11, 0.1)"
    # Only render the extra-info row when there is something to show.
    extra_html = (
        f'<div style="margin-top: 10px; font-style: italic; color: #6b7280;">{extra_info}</div>'
        if extra_info
        else ""
    )
    return f"""
    <div style="border: 2px solid {color}; background: {bg_color}; border-radius: 12px; padding: 20px; text-align: center;">
        <div style="font-size: 1.5em; font-weight: bold; color: {color};">{label}</div>
        <div style="margin-top: 8px; font-size: 0.8em; letter-spacing: 1px; color: #6b7280;">CONFIDENCE</div>
        <div style="font-size: 1.2em; font-weight: bold;">{confidence:.2%}</div>
        <div style="margin-top: 6px; font-size: 0.9em; color: #6b7280;">Latency: {latency:.1f}ms</div>
        {extra_html}
    </div>
    """
# --- Prediction Logic ---
async def predict_text(text):
    """Classify a text utterance as complete/incomplete.

    Args:
        text: The utterance to classify; empty/None yields a prompt message.

    Returns:
        An HTML string (result card or plain prompt) for the output widget.
    """
    if not text:
        return "Please enter some text."
    # Ensure the model is loaded before the first prediction.
    await load_model()
    # Time only the inference call so the latency readout is meaningful.
    start_time = time.perf_counter()
    result = await engine.predict(text)
    latency = (time.perf_counter() - start_time) * 1000
    return format_result_html(result["is_complete"], result["confidence"], latency)
async def predict_audio(audio_path):
    """Transcribe an audio file via Deepgram, then classify the transcript.

    Args:
        audio_path: Filesystem path to the recorded/uploaded audio, or None.

    Returns:
        An HTML string (result card or plain status message) for the
        output widget.
    """
    if not audio_path:
        return "Please record or upload audio."
    # dg_client is None when DEEPGRAM_API_KEY is not set; fail gracefully
    # instead of raising AttributeError below.
    if dg_client is None:
        return "Transcription unavailable: DEEPGRAM_API_KEY is not configured."
    await load_model()
    # Transcription step (Deepgram pre-recorded API, nova-2 model).
    with open(audio_path, 'rb') as audio:
        source = {'buffer': audio.read()}
    options = PrerecordedOptions(model="nova-2", smart_format=True)
    response = dg_client.listen.rest.v("1").transcribe_file(source, options)
    transcript = response.results.channels[0].alternatives[0].transcript
    if not transcript:
        return "No speech detected."
    # Model inference step, timed for the latency readout.
    start_time = time.perf_counter()
    result = await engine.predict(transcript)
    latency = (time.perf_counter() - start_time) * 1000
    return format_result_html(result["is_complete"], result["confidence"], latency, extra_info=f"Transcript: \"{transcript}\"")
# --- UI Layout ---
with gr.Blocks(theme=gr.themes.Default(primary_hue="indigo"), css="#container {max-width: 800px; margin: auto; padding-top: 20px;}") as demo:
    with gr.Column(elem_id="container"):
        gr.Markdown("# 🤖 Smart Turn - EOU Detection")
        gr.Markdown(f"Classification of End-of-Utterance (EOU) using: {MODEL_REPO_ID}")
        with gr.Tabs():
            with gr.Tab("📝 Text Prediction"):
                text_input = gr.Textbox(placeholder="Type a sentence...", label="Input Text", lines=3)
                text_btn = gr.Button("Analyze Text", variant="primary")
            with gr.Tab("🎙️ Audio Prediction"):
                audio_input = gr.Audio(type="filepath", label="Record/Upload Audio")
                audio_btn = gr.Button("Transcribe & Analyze", variant="primary")
        gr.Markdown("### 🔍 Model Result")
        output_html = gr.HTML("Provide input and click Analyze")
        # Clicking an example fills the text tab's input box.
        gr.Examples(
            examples=[["i want to"], ["i want to book a flight"], ["can you help me with"]],
            inputs=text_input
        )
        # Event handlers: route each button to its async predictor.
        text_btn.click(predict_text, inputs=text_input, outputs=output_html)
        audio_btn.click(predict_audio, inputs=audio_input, outputs=output_html)

if __name__ == "__main__":
    demo.launch()