# Provenance: commit 0d27fe1 by keshavgautam03 — "Add mic input, remove How It Works section"
"""
Fluency Benchmark App β€” Streamlit Interface
Upload an audio file β†’ full pipeline β†’ fluency report.
Run: streamlit run app.py
"""
import sys
import tempfile
from pathlib import Path
import streamlit as st
import numpy as np
# Add app root to path
sys.path.insert(0, str(Path(__file__).parent))
# ── Page setup ──
# set_page_config must be the first Streamlit call in the script.
st.set_page_config(page_title="Fluency Benchmark", layout="wide")
st.title("Fluency Benchmark Pipeline")
# Fixed mojibake ("β€”") — the intro text now carries a real em dash.
st.markdown("Upload an English speech audio file to measure temporal fluency — flow, continuity, and pausing behavior.")
# ── Sidebar ──
# Static explanatory copy: what the pipeline does and does not evaluate.
_ABOUT_MD = """
**What this measures:**
- Speech continuity (how much time is speech vs silence)
- Pause behavior (frequency, duration, placement)
- Articulation smoothness (legato vs staccato)
- Word-level precision (confidence, filled pauses)
**What this does NOT measure:**
- Grammar correctness
- Vocabulary richness
- Pronunciation accuracy
"""
st.sidebar.header("About")
st.sidebar.markdown(_ABOUT_MD)
# ── Input ──
# Registered MIME types for the suffixes the uploader accepts.  The previous
# f"audio/{suffix}" guess produced non-standard types for several formats
# (m4a → "audio/m4a" instead of "audio/mp4", mp3 → "audio/mp3" instead of
# "audio/mpeg"), which some browsers refuse to play in the <audio> element.
_MIME_BY_SUFFIX = {
    ".wav": "audio/wav",
    ".mp3": "audio/mpeg",
    ".m4a": "audio/mp4",
    ".ogg": "audio/ogg",
    ".flac": "audio/flac",
}

input_method = st.radio("Choose input method", ["Upload File", "Record from Mic"], horizontal=True)
audio_path = None
if input_method == "Upload File":
    uploaded_file = st.file_uploader("Upload audio file", type=["wav", "mp3", "m4a", "ogg", "flac"])
    if uploaded_file is not None:
        suffix = Path(uploaded_file.name).suffix
        # Persist to a real file so downstream pipeline stages can open it by
        # path.  delete=False keeps it alive across the `with`; Streamlit
        # reruns don't clean these up, so the OS temp dir is the safety net.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp.write(uploaded_file.read())
            audio_path = tmp.name
        mime = _MIME_BY_SUFFIX.get(suffix.lower(), f"audio/{suffix.lstrip('.')}")
        st.audio(uploaded_file, format=mime)
else:
    mic_audio = st.audio_input("Record audio")
    if mic_audio is not None:
        # st.audio_input always delivers WAV data.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            tmp.write(mic_audio.read())
            audio_path = tmp.name
        st.audio(mic_audio, format="audio/wav")
if audio_path is not None:
    if st.button("Analyze Fluency", type="primary"):
        # Pipeline imports are deferred to click time so the page renders
        # instantly; each stage loads its own (heavy) dependencies lazily.
        # ── Step 1: VAD ──
        with st.spinner("Step 1/6: Running Voice Activity Detection..."):
            from pipeline.vad import run_vad
            vad_features = run_vad(audio_path)
        # ── Step 2: Transcription ──
        with st.spinner("Step 2/6: Transcribing and aligning words..."):
            from pipeline.transcribe import transcribe_and_align
            tx = transcribe_and_align(audio_path)
            words = tx["words"]
            transcript = tx["transcript"]
        # Update MLU (mean length of utterance) with the actual word count.
        # NOTE(review): assumes vad_features["speech_segments"] is an int
        # count — if run_vad returns a list here, max(segments, 1) raises
        # TypeError; confirm against pipeline.vad.
        word_count = len(words)
        segments = vad_features["speech_segments"]
        vad_features["mlu"] = round(word_count / max(segments, 1), 2)
        # ── Step 3: Placement ──
        with st.spinner("Step 3/6: Classifying pause placement..."):
            from pipeline.placement import classify_pauses
            placement = classify_pauses(words, vad_features)
        # ── Step 4: FA Features ──
        with st.spinner("Step 4/6: Computing word-level features..."):
            from pipeline.fa_features import compute_fa_features
            fa = compute_fa_features(words, vad_features["total_duration_sec"])
        # ── Step 5: Syntactic Features ──
        with st.spinner("Step 5/6: Analyzing syntactic pause context..."):
            from pipeline.syntactic_features import compute_syntactic_features
            syn = compute_syntactic_features(words, transcript)
        # Combine all features into one flat dict for the models.
        all_features = {**vad_features, **placement, **fa, **syn, "word_count": word_count}
        # ── Step 6: Model Inference + Composite ──
        with st.spinner("Step 6/6: Running models and scoring..."):
            from models.inference import predict
            from pipeline.composite import compute_composite
            predictions = predict(all_features)
            composite = compute_composite(all_features, predictions)

        # ════════════════════════════════════════════════════════════
        # RESULTS DISPLAY
        # ════════════════════════════════════════════════════════════
        st.divider()
        # ── Headline Score ──
        # (Removed dead `band_colors` dict — it was never consumed;
        #  st.metric has no color parameter.)
        band = composite["fluency_band"]
        pct = composite["composite_percentile"]
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Fluency Percentile", f"{pct:.1f}%")
        with col2:
            st.metric("Fluency Band", band)
        with col3:
            ci = f"[{composite['composite_ci_low']:.2f}, {composite['composite_ci_high']:.2f}]"
            st.metric("95% Confidence Interval", ci)
        st.divider()
        # ── Transcript ──
        st.subheader("Transcript")
        # A real label (hidden) instead of "" — avoids Streamlit's
        # empty-label accessibility warning.
        st.text_area("Transcript", transcript, height=100, disabled=True,
                     label_visibility="collapsed")
        # ── Two columns: Temporal + Pauses ──
        col_left, col_right = st.columns(2)
        with col_left:
            st.subheader("Temporal Profile")
            st.metric("Speech Ratio", f"{vad_features['speech_ratio']:.1%}")
            st.metric("Mean Length of Utterance", f"{vad_features['mlu']:.1f} words")
            st.metric("Word Count", str(word_count))
            st.metric("Duration", f"{vad_features['total_duration_sec']:.1f}s")
        with col_right:
            st.subheader("Pause Behavior")
            freq_label = predictions.get("pause_freq_ordinal_label", "?")
            dur_label = predictions.get("pause_dur_ordinal_label", "?")
            place_label = predictions.get("pause_place_ordinal_label", "?")
            st.metric("Pause Frequency", f"{freq_label} ({vad_features['pause_frequency_per_sec']:.2f}/s)")
            st.metric("Pause Duration", f"{dur_label} (avg {vad_features['mean_pause_duration_sec']:.2f}s)")
            st.metric("Pause Placement", place_label)
            st.metric("Long Pause Ratio", f"{vad_features['long_pause_ratio']:.0%}")
        st.divider()
        # ── Diagnosis ──
        col_a, col_b = st.columns(2)
        with col_a:
            st.subheader("Hesitation Diagnosis")
            cl = predictions.get("cognitive_load_ordinal_label", "?")
            uc = predictions.get("utterance_constraints_ordinal_label", "?")
            st.metric("Cognitive Load", cl)
            st.metric("Utterance Constraints", uc)
            # Dominance: predicted proportions of each pause type.
            st.markdown("**Pause Type Breakdown**")
            pu = predictions.get("prop_unplanned_pred", 0)
            pp = predictions.get("prop_planned_pred", 0)
            pn = predictions.get("prop_neutral_pred", 0)
            st.progress(pu, text=f"Unplanned (hesitation): {pu:.0%}")
            st.progress(pp, text=f"Planned (breathing): {pp:.0%}")
            st.progress(pn, text=f"Neutral (filler): {pn:.0%}")
        with col_b:
            st.subheader("Articulation & Word Quality")
            art_label = predictions.get("articulation_ordinal_label", "?")
            st.metric("Articulation", art_label)
            st.metric("Word Confidence", f"{fa['fa_mean_word_confidence']:.1%}")
            st.metric("Filled Pauses", str(fa["fa_filled_pause_count"]))
            st.metric("Speech Rate CV", f"{fa['fa_speech_rate_cv']:.3f}")
        st.divider()
        # ── 6 Dimensions ──
        st.subheader("6 Fluency Dimensions")
        dim_labels = {
            "dim_continuity": ("Continuity", "How much and how long the speaker talks without pausing"),
            "dim_pause_quality": ("Pause Quality", "Average pause duration and long pause proportion"),
            "dim_placement": ("Placement", "Where pauses fall relative to sentence structure"),
            "dim_articulation": ("Articulation", "Smoothness of delivery (legato vs staccato)"),
            "dim_dominance": ("Dominance", "Balance of unplanned vs planned pauses"),
            "dim_word_precision": ("Word Precision", "Word-level confidence and speech rate consistency"),
        }
        # Lay the six dimension metrics out in a 3-wide grid.
        # (Removed dead `delta_color` — it was computed but never passed
        #  to st.metric, so it had no effect on the display.)
        cols = st.columns(3)
        for i, (dim_key, (label, desc)) in enumerate(dim_labels.items()):
            with cols[i % 3]:
                st.metric(label, f"{composite[dim_key]:.2f}", help=desc)
        st.divider()
        # ── Placement Details ──
        st.subheader("Pause Placement Details")
        pcol1, pcol2 = st.columns(2)
        with pcol1:
            st.metric("Boundary-aligned pauses", f"{placement['boundary_pause_ratio']:.0%}")
            st.metric("Mid-clause pauses", f"{placement['mid_clause_pause_ratio']:.0%}")
        with pcol2:
            st.metric("Content-word preceding", f"{syn['syn_content_word_preceding_ratio']:.0%}")
            st.metric("Function-word preceding", f"{syn['syn_function_word_preceding_ratio']:.0%}")
        # ── Raw Data Expander ──
        with st.expander("All Raw Features"):
            import pandas as pd
            all_data = {**all_features, **predictions, **composite}
            # One row → transposed so each feature becomes a row;
            # stringify values so mixed types render cleanly.
            feature_df = pd.DataFrame([all_data]).T
            feature_df.columns = ["Value"]
            feature_df["Value"] = feature_df["Value"].astype(str)
            st.dataframe(feature_df, use_container_width=True)
else:
    st.info("Upload a .wav, .mp3, or .m4a audio file, or record from your microphone to begin analysis.")