"""
Fluency Benchmark App — Streamlit Interface

Upload an audio file → full pipeline → fluency report.

Run: streamlit run app.py
"""
import sys
import tempfile
from pathlib import Path

import streamlit as st
import numpy as np

# Add app root to path so `pipeline.*` / `models.*` imports resolve when the
# app is launched from another working directory.
sys.path.insert(0, str(Path(__file__).parent))

st.set_page_config(page_title="Fluency Benchmark", layout="wide")
st.title("Fluency Benchmark Pipeline")
st.markdown("Upload an English speech audio file to measure temporal fluency — flow, continuity, and pausing behavior.")

# ── Sidebar ──
st.sidebar.header("About")
st.sidebar.markdown("""
**What this measures:**
- Speech continuity (how much time is speech vs silence)
- Pause behavior (frequency, duration, placement)
- Articulation smoothness (legato vs staccato)
- Word-level precision (confidence, filled pauses)

**What this does NOT measure:**
- Grammar correctness
- Vocabulary richness
- Pronunciation accuracy
""")

# ── Input ──
# Two input modes; both paths end with the raw audio persisted to a temp file
# on disk, because the pipeline stages (VAD, transcription) consume a path.
input_method = st.radio("Choose input method", ["Upload File", "Record from Mic"], horizontal=True)

audio_path = None
if input_method == "Upload File":
    uploaded_file = st.file_uploader("Upload audio file", type=["wav", "mp3", "m4a", "ogg", "flac"])
    if uploaded_file is not None:
        # Keep the original extension so downstream decoders pick the right codec.
        suffix = Path(uploaded_file.name).suffix
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp.write(uploaded_file.read())
            audio_path = tmp.name
        st.audio(uploaded_file, format=f"audio/{suffix.strip('.')}")
else:
    mic_audio = st.audio_input("Record audio")
    if mic_audio is not None:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            tmp.write(mic_audio.read())
            audio_path = tmp.name
        st.audio(mic_audio, format="audio/wav")

if audio_path is not None:
    if st.button("Analyze Fluency", type="primary"):
        # The temp file is only needed by steps 1-2 (which read the path);
        # remove it once the pipeline finishes — or fails — so repeated runs
        # don't accumulate orphaned files (delete=False would otherwise leak).
        try:
            # ── Step 1: VAD ──
            with st.spinner("Step 1/6: Running Voice Activity Detection..."):
                from pipeline.vad import run_vad
                vad_features = run_vad(audio_path)

            # ── Step 2: Transcription ──
            with st.spinner("Step 2/6: Transcribing and aligning words..."):
                from pipeline.transcribe import transcribe_and_align
                tx = transcribe_and_align(audio_path)
                words = tx["words"]
                transcript = tx["transcript"]

            # Update MLU with actual word count.
            # NOTE(review): assumes vad_features["speech_segments"] is a segment
            # *count* (int), not a list — max(list, 1) would raise. Verify in
            # pipeline.vad.
            word_count = len(words)
            segments = vad_features["speech_segments"]
            vad_features["mlu"] = round(word_count / max(segments, 1), 2)

            # ── Step 3: Placement ──
            with st.spinner("Step 3/6: Classifying pause placement..."):
                from pipeline.placement import classify_pauses
                placement = classify_pauses(words, vad_features)

            # ── Step 4: FA Features ──
            with st.spinner("Step 4/6: Computing word-level features..."):
                from pipeline.fa_features import compute_fa_features
                fa = compute_fa_features(words, vad_features["total_duration_sec"])

            # ── Step 5: Syntactic Features ──
            with st.spinner("Step 5/6: Analyzing syntactic pause context..."):
                from pipeline.syntactic_features import compute_syntactic_features
                syn = compute_syntactic_features(words, transcript)

            # Combine all features into one flat dict for the models.
            all_features = {**vad_features, **placement, **fa, **syn, "word_count": word_count}

            # ── Step 6: Model Inference + Composite ──
            with st.spinner("Step 6/6: Running models and scoring..."):
                from models.inference import predict
                from pipeline.composite import compute_composite
                predictions = predict(all_features)
                composite = compute_composite(all_features, predictions)
        finally:
            # Best-effort cleanup of the temp audio file (fix: previously leaked).
            Path(audio_path).unlink(missing_ok=True)

        # ════════════════════════════════════════════════════════════
        # RESULTS DISPLAY
        # ════════════════════════════════════════════════════════════
        st.divider()

        # ── Headline Score ──
        band = composite["fluency_band"]
        pct = composite["composite_percentile"]

        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Fluency Percentile", f"{pct:.1f}%")
        with col2:
            st.metric("Fluency Band", band)
        with col3:
            ci = f"[{composite['composite_ci_low']:.2f}, {composite['composite_ci_high']:.2f}]"
            st.metric("95% Confidence Interval", ci)

        st.divider()

        # ── Transcript ──
        st.subheader("Transcript")
        # Fix: st.text_area requires a non-empty label; hide it instead of
        # passing "" (which Streamlit warns about / deprecates).
        st.text_area("Transcript", transcript, height=100, disabled=True,
                     label_visibility="collapsed")

        # ── Two columns: Temporal + Pauses ──
        col_left, col_right = st.columns(2)

        with col_left:
            st.subheader("Temporal Profile")
            st.metric("Speech Ratio", f"{vad_features['speech_ratio']:.1%}")
            st.metric("Mean Length of Utterance", f"{vad_features['mlu']:.1f} words")
            st.metric("Word Count", str(word_count))
            st.metric("Duration", f"{vad_features['total_duration_sec']:.1f}s")

        with col_right:
            st.subheader("Pause Behavior")
            freq_label = predictions.get("pause_freq_ordinal_label", "?")
            dur_label = predictions.get("pause_dur_ordinal_label", "?")
            place_label = predictions.get("pause_place_ordinal_label", "?")
            st.metric("Pause Frequency", f"{freq_label} ({vad_features['pause_frequency_per_sec']:.2f}/s)")
            st.metric("Pause Duration", f"{dur_label} (avg {vad_features['mean_pause_duration_sec']:.2f}s)")
            st.metric("Pause Placement", place_label)
            st.metric("Long Pause Ratio", f"{vad_features['long_pause_ratio']:.0%}")

        st.divider()

        # ── Diagnosis ──
        col_a, col_b = st.columns(2)

        with col_a:
            st.subheader("Hesitation Diagnosis")
            cl = predictions.get("cognitive_load_ordinal_label", "?")
            uc = predictions.get("utterance_constraints_ordinal_label", "?")
            st.metric("Cognitive Load", cl)
            st.metric("Utterance Constraints", uc)

            # Dominance: predicted mix of pause types.
            st.markdown("**Pause Type Breakdown**")
            pu = predictions.get("prop_unplanned_pred", 0)
            pp = predictions.get("prop_planned_pred", 0)
            pn = predictions.get("prop_neutral_pred", 0)
            st.progress(pu, text=f"Unplanned (hesitation): {pu:.0%}")
            st.progress(pp, text=f"Planned (breathing): {pp:.0%}")
            st.progress(pn, text=f"Neutral (filler): {pn:.0%}")

        with col_b:
            st.subheader("Articulation & Word Quality")
            art_label = predictions.get("articulation_ordinal_label", "?")
            st.metric("Articulation", art_label)
            st.metric("Word Confidence", f"{fa['fa_mean_word_confidence']:.1%}")
            st.metric("Filled Pauses", str(fa["fa_filled_pause_count"]))
            st.metric("Speech Rate CV", f"{fa['fa_speech_rate_cv']:.3f}")

        st.divider()

        # ── 6 Dimensions ──
        st.subheader("6 Fluency Dimensions")
        dim_labels = {
            "dim_continuity": ("Continuity", "How much and how long the speaker talks without pausing"),
            "dim_pause_quality": ("Pause Quality", "Average pause duration and long pause proportion"),
            "dim_placement": ("Placement", "Where pauses fall relative to sentence structure"),
            "dim_articulation": ("Articulation", "Smoothness of delivery (legato vs staccato)"),
            "dim_dominance": ("Dominance", "Balance of unplanned vs planned pauses"),
            "dim_word_precision": ("Word Precision", "Word-level confidence and speech rate consistency"),
        }
        cols = st.columns(3)
        for i, (dim_key, (label, desc)) in enumerate(dim_labels.items()):
            with cols[i % 3]:
                # Fix: dropped dead `delta_color` computation — st.metric only
                # applies delta_color when a `delta` argument is supplied.
                st.metric(label, f"{composite[dim_key]:.2f}", help=desc)

        st.divider()

        # ── Placement Details ──
        st.subheader("Pause Placement Details")
        pcol1, pcol2 = st.columns(2)
        with pcol1:
            st.metric("Boundary-aligned pauses", f"{placement['boundary_pause_ratio']:.0%}")
            st.metric("Mid-clause pauses", f"{placement['mid_clause_pause_ratio']:.0%}")
        with pcol2:
            st.metric("Content-word preceding", f"{syn['syn_content_word_preceding_ratio']:.0%}")
            st.metric("Function-word preceding", f"{syn['syn_function_word_preceding_ratio']:.0%}")

        # ── Raw Data Expander ──
        with st.expander("All Raw Features"):
            import pandas as pd
            all_data = {**all_features, **predictions, **composite}
            feature_df = pd.DataFrame([all_data]).T
            feature_df.columns = ["Value"]
            # Stringify so mixed types render uniformly in the table.
            feature_df["Value"] = feature_df["Value"].astype(str)
            st.dataframe(feature_df, use_container_width=True)
else:
    st.info("Upload a .wav, .mp3, or .m4a audio file, or record from your microphone to begin analysis.")