Spaces:
Running
Running
| """ | |
| Fluency Benchmark App β Streamlit Interface | |
| Upload an audio file β full pipeline β fluency report. | |
| Run: streamlit run app.py | |
| """ | |
| import sys | |
| import tempfile | |
| from pathlib import Path | |
| import streamlit as st | |
| import numpy as np | |
# Make the app directory importable so the local `pipeline.*` and `models.*`
# packages resolve even when `streamlit run` is launched from another cwd.
sys.path.insert(0, str(Path(__file__).parent))
# ── Page setup ──
# set_page_config must be the first Streamlit command executed in the script.
st.set_page_config(page_title="Fluency Benchmark", layout="wide")
st.title("Fluency Benchmark Pipeline")
# FIX: the tagline previously contained a mis-encoded character ("β") where
# an em dash was intended.
st.markdown("Upload an English speech audio file to measure temporal fluency — flow, continuity, and pausing behavior.")

# ── Sidebar ──
st.sidebar.header("About")
st.sidebar.markdown("""
**What this measures:**
- Speech continuity (how much time is speech vs silence)
- Pause behavior (frequency, duration, placement)
- Articulation smoothness (legato vs staccato)
- Word-level precision (confidence, filled pauses)
**What this does NOT measure:**
- Grammar correctness
- Vocabulary richness
- Pronunciation accuracy
""")
# ── Input ──
input_method = st.radio("Choose input method", ["Upload File", "Record from Mic"], horizontal=True)

# FIX: the previous f"audio/{suffix.strip('.')}" produced invalid MIME types
# for several supported formats (audio/mp3 instead of audio/mpeg, audio/m4a
# instead of audio/mp4). Map each accepted suffix to its proper MIME type.
_MIME_BY_SUFFIX = {
    ".wav": "audio/wav",
    ".mp3": "audio/mpeg",
    ".m4a": "audio/mp4",
    ".ogg": "audio/ogg",
    ".flac": "audio/flac",
}

audio_path = None
if input_method == "Upload File":
    uploaded_file = st.file_uploader("Upload audio file", type=["wav", "mp3", "m4a", "ogg", "flac"])
    if uploaded_file is not None:
        # Lowercase so ".WAV" etc. still hits the MIME map and a sensible suffix.
        suffix = Path(uploaded_file.name).suffix.lower()
        # Persist the upload to disk: the downstream VAD/ASR pipeline takes a
        # file path, not a stream. delete=False keeps the file alive after the
        # `with` block closes it; the OS temp dir handles eventual cleanup.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp.write(uploaded_file.read())
            audio_path = tmp.name
        st.audio(uploaded_file, format=_MIME_BY_SUFFIX.get(suffix, f"audio/{suffix.lstrip('.')}"))
else:
    mic_audio = st.audio_input("Record audio")
    if mic_audio is not None:
        # Mic capture is always WAV; persist it the same way as uploads.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            tmp.write(mic_audio.read())
            audio_path = tmp.name
        st.audio(mic_audio, format="audio/wav")
if audio_path is not None:
    if st.button("Analyze Fluency", type="primary"):
        # Pipeline imports are deferred into each step so heavy model
        # dependencies load lazily, only once the user starts an analysis.

        # ── Step 1: VAD ──
        with st.spinner("Step 1/6: Running Voice Activity Detection..."):
            from pipeline.vad import run_vad
            vad_features = run_vad(audio_path)

        # ── Step 2: Transcription ──
        with st.spinner("Step 2/6: Transcribing and aligning words..."):
            from pipeline.transcribe import transcribe_and_align
            tx = transcribe_and_align(audio_path)
            words = tx["words"]
            transcript = tx["transcript"]
            # Recompute mean length of utterance now that the true word count
            # is known; max(..., 1) guards against division by zero.
            # NOTE(review): this assumes vad_features["speech_segments"] is an
            # int segment COUNT — if it were a list of segments, max(..., 1)
            # would raise TypeError. Confirm against pipeline.vad.
            word_count = len(words)
            segments = vad_features["speech_segments"]
            vad_features["mlu"] = round(word_count / max(segments, 1), 2)

        # ── Step 3: Placement ──
        with st.spinner("Step 3/6: Classifying pause placement..."):
            from pipeline.placement import classify_pauses
            placement = classify_pauses(words, vad_features)

        # ── Step 4: FA Features ──
        with st.spinner("Step 4/6: Computing word-level features..."):
            from pipeline.fa_features import compute_fa_features
            fa = compute_fa_features(words, vad_features["total_duration_sec"])

        # ── Step 5: Syntactic Features ──
        with st.spinner("Step 5/6: Analyzing syntactic pause context..."):
            from pipeline.syntactic_features import compute_syntactic_features
            syn = compute_syntactic_features(words, transcript)

        # Merge every feature family into the single flat dict the models expect.
        all_features = {**vad_features, **placement, **fa, **syn, "word_count": word_count}

        # ── Step 6: Model Inference + Composite ──
        with st.spinner("Step 6/6: Running models and scoring..."):
            from models.inference import predict
            from pipeline.composite import compute_composite
            predictions = predict(all_features)
            composite = compute_composite(all_features, predictions)

        # ────────────────────────────────────────────────────────────
        # RESULTS DISPLAY
        # ────────────────────────────────────────────────────────────
        st.divider()

        # ── Headline Score ──
        # FIX: removed the dead `band_colors` dict — it was never read.
        band = composite["fluency_band"]
        pct = composite["composite_percentile"]
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Fluency Percentile", f"{pct:.1f}%")
        with col2:
            st.metric("Fluency Band", band)
        with col3:
            ci = f"[{composite['composite_ci_low']:.2f}, {composite['composite_ci_high']:.2f}]"
            st.metric("95% Confidence Interval", ci)
        st.divider()

        # ── Transcript ──
        st.subheader("Transcript")
        # FIX: an empty label triggers Streamlit's accessibility warning;
        # give the widget a real label and collapse it visually.
        st.text_area("Transcript", transcript, height=100, disabled=True,
                     label_visibility="collapsed")

        # ── Two columns: Temporal + Pauses ──
        col_left, col_right = st.columns(2)
        with col_left:
            st.subheader("Temporal Profile")
            st.metric("Speech Ratio", f"{vad_features['speech_ratio']:.1%}")
            st.metric("Mean Length of Utterance", f"{vad_features['mlu']:.1f} words")
            st.metric("Word Count", str(word_count))
            st.metric("Duration", f"{vad_features['total_duration_sec']:.1f}s")
        with col_right:
            st.subheader("Pause Behavior")
            freq_label = predictions.get("pause_freq_ordinal_label", "?")
            dur_label = predictions.get("pause_dur_ordinal_label", "?")
            place_label = predictions.get("pause_place_ordinal_label", "?")
            st.metric("Pause Frequency", f"{freq_label} ({vad_features['pause_frequency_per_sec']:.2f}/s)")
            st.metric("Pause Duration", f"{dur_label} (avg {vad_features['mean_pause_duration_sec']:.2f}s)")
            st.metric("Pause Placement", place_label)
            st.metric("Long Pause Ratio", f"{vad_features['long_pause_ratio']:.0%}")
        st.divider()

        # ── Diagnosis ──
        col_a, col_b = st.columns(2)
        with col_a:
            st.subheader("Hesitation Diagnosis")
            cl = predictions.get("cognitive_load_ordinal_label", "?")
            uc = predictions.get("utterance_constraints_ordinal_label", "?")
            st.metric("Cognitive Load", cl)
            st.metric("Utterance Constraints", uc)
            # Dominance breakdown.
            # FIX: st.progress requires values in [0.0, 1.0] — clamp model
            # proportions that may drift slightly outside that range.
            st.markdown("**Pause Type Breakdown**")
            pu = min(max(predictions.get("prop_unplanned_pred", 0), 0.0), 1.0)
            pp = min(max(predictions.get("prop_planned_pred", 0), 0.0), 1.0)
            pn = min(max(predictions.get("prop_neutral_pred", 0), 0.0), 1.0)
            st.progress(pu, text=f"Unplanned (hesitation): {pu:.0%}")
            st.progress(pp, text=f"Planned (breathing): {pp:.0%}")
            st.progress(pn, text=f"Neutral (filler): {pn:.0%}")
        with col_b:
            st.subheader("Articulation & Word Quality")
            art_label = predictions.get("articulation_ordinal_label", "?")
            st.metric("Articulation", art_label)
            st.metric("Word Confidence", f"{fa['fa_mean_word_confidence']:.1%}")
            st.metric("Filled Pauses", str(fa["fa_filled_pause_count"]))
            st.metric("Speech Rate CV", f"{fa['fa_speech_rate_cv']:.3f}")
        st.divider()

        # ── 6 Dimensions ──
        st.subheader("6 Fluency Dimensions")
        dim_labels = {
            "dim_continuity": ("Continuity", "How much and how long the speaker talks without pausing"),
            "dim_pause_quality": ("Pause Quality", "Average pause duration and long pause proportion"),
            "dim_placement": ("Placement", "Where pauses fall relative to sentence structure"),
            "dim_articulation": ("Articulation", "Smoothness of delivery (legato vs staccato)"),
            "dim_dominance": ("Dominance", "Balance of unplanned vs planned pauses"),
            "dim_word_precision": ("Word Precision", "Word-level confidence and speech rate consistency"),
        }
        cols = st.columns(3)
        for i, (dim_key, (label, desc)) in enumerate(dim_labels.items()):
            with cols[i % 3]:
                val = composite[dim_key]
                # FIX: the intended green/red color coding was computed into an
                # unused `delta_color` variable and never shown. st.metric only
                # colors its `delta`, so display the signed value there — the
                # default "normal" scheme already renders positive deltas green
                # and negative deltas red.
                st.metric(label, f"{val:.2f}", delta=f"{val:+.2f}", help=desc)
        st.divider()

        # ── Placement Details ──
        st.subheader("Pause Placement Details")
        pcol1, pcol2 = st.columns(2)
        with pcol1:
            st.metric("Boundary-aligned pauses", f"{placement['boundary_pause_ratio']:.0%}")
            st.metric("Mid-clause pauses", f"{placement['mid_clause_pause_ratio']:.0%}")
        with pcol2:
            st.metric("Content-word preceding", f"{syn['syn_content_word_preceding_ratio']:.0%}")
            st.metric("Function-word preceding", f"{syn['syn_function_word_preceding_ratio']:.0%}")

        # ── Raw Data Expander ──
        with st.expander("All Raw Features"):
            import pandas as pd
            # Transpose into one "Value" column; stringify so mixed types
            # (floats, ints, labels) render uniformly in the dataframe widget.
            all_data = {**all_features, **predictions, **composite}
            feature_df = pd.DataFrame([all_data]).T
            feature_df.columns = ["Value"]
            feature_df["Value"] = feature_df["Value"].astype(str)
            st.dataframe(feature_df, use_container_width=True)
else:
    st.info("Upload a .wav, .mp3, or .m4a audio file, or record from your microphone to begin analysis.")