# Provenance: commit 0d27fe1 by keshavgautam03 — "Add mic input, remove How It Works section"
"""
Fluency Benchmark App β€” Streamlit Interface
Upload an audio file β†’ full pipeline β†’ fluency report.
Run: streamlit run app.py
"""
import sys
import tempfile
from pathlib import Path
import streamlit as st
import numpy as np
# Add app root to path
sys.path.insert(0, str(Path(__file__).parent))
# ── Page setup ──
# set_page_config must be the first Streamlit call in the script.
st.set_page_config(page_title="Fluency Benchmark", layout="wide")
st.title("Fluency Benchmark Pipeline")
# Fixed mojibake ("β€”") — the intro text now carries a real em dash.
st.markdown("Upload an English speech audio file to measure temporal fluency — flow, continuity, and pausing behavior.")
# ── Sidebar ──
# Static explanatory copy: what the pipeline does and does not evaluate.
_ABOUT_MD = """
**What this measures:**
- Speech continuity (how much time is speech vs silence)
- Pause behavior (frequency, duration, placement)
- Articulation smoothness (legato vs staccato)
- Word-level precision (confidence, filled pauses)
**What this does NOT measure:**
- Grammar correctness
- Vocabulary richness
- Pronunciation accuracy
"""
st.sidebar.header("About")
st.sidebar.markdown(_ABOUT_MD)
# ── Input ──
# Registered MIME types for the suffixes the uploader accepts.  The previous
# f"audio/{suffix}" guess produced non-standard types for several formats
# (m4a → "audio/m4a" instead of "audio/mp4", mp3 → "audio/mp3" instead of
# "audio/mpeg"), which some browsers refuse to play in the <audio> element.
_MIME_BY_SUFFIX = {
    ".wav": "audio/wav",
    ".mp3": "audio/mpeg",
    ".m4a": "audio/mp4",
    ".ogg": "audio/ogg",
    ".flac": "audio/flac",
}

input_method = st.radio("Choose input method", ["Upload File", "Record from Mic"], horizontal=True)
audio_path = None
if input_method == "Upload File":
    uploaded_file = st.file_uploader("Upload audio file", type=["wav", "mp3", "m4a", "ogg", "flac"])
    if uploaded_file is not None:
        suffix = Path(uploaded_file.name).suffix
        # Persist to a real file so downstream pipeline stages can open it by
        # path.  delete=False keeps it alive across the `with`; Streamlit
        # reruns don't clean these up, so the OS temp dir is the safety net.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp.write(uploaded_file.read())
            audio_path = tmp.name
        mime = _MIME_BY_SUFFIX.get(suffix.lower(), f"audio/{suffix.lstrip('.')}")
        st.audio(uploaded_file, format=mime)
else:
    mic_audio = st.audio_input("Record audio")
    if mic_audio is not None:
        # st.audio_input always delivers WAV data.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            tmp.write(mic_audio.read())
            audio_path = tmp.name
        st.audio(mic_audio, format="audio/wav")
if audio_path is not None:
    if st.button("Analyze Fluency", type="primary"):
        # Pipeline imports are deferred to click time so the page renders
        # instantly; each stage loads its own (heavy) dependencies lazily.
        # ── Step 1: VAD ──
        with st.spinner("Step 1/6: Running Voice Activity Detection..."):
            from pipeline.vad import run_vad
            vad_features = run_vad(audio_path)
        # ── Step 2: Transcription ──
        with st.spinner("Step 2/6: Transcribing and aligning words..."):
            from pipeline.transcribe import transcribe_and_align
            tx = transcribe_and_align(audio_path)
            words = tx["words"]
            transcript = tx["transcript"]
        # Update MLU (mean length of utterance) with the actual word count.
        # NOTE(review): assumes vad_features["speech_segments"] is an int
        # count — if run_vad returns a list here, max(segments, 1) raises
        # TypeError; confirm against pipeline.vad.
        word_count = len(words)
        segments = vad_features["speech_segments"]
        vad_features["mlu"] = round(word_count / max(segments, 1), 2)
        # ── Step 3: Placement ──
        with st.spinner("Step 3/6: Classifying pause placement..."):
            from pipeline.placement import classify_pauses
            placement = classify_pauses(words, vad_features)
        # ── Step 4: FA Features ──
        with st.spinner("Step 4/6: Computing word-level features..."):
            from pipeline.fa_features import compute_fa_features
            fa = compute_fa_features(words, vad_features["total_duration_sec"])
        # ── Step 5: Syntactic Features ──
        with st.spinner("Step 5/6: Analyzing syntactic pause context..."):
            from pipeline.syntactic_features import compute_syntactic_features
            syn = compute_syntactic_features(words, transcript)
        # Combine all features into one flat dict for the models.
        all_features = {**vad_features, **placement, **fa, **syn, "word_count": word_count}
        # ── Step 6: Model Inference + Composite ──
        with st.spinner("Step 6/6: Running models and scoring..."):
            from models.inference import predict
            from pipeline.composite import compute_composite
            predictions = predict(all_features)
            composite = compute_composite(all_features, predictions)

        # ════════════════════════════════════════════════════════════
        # RESULTS DISPLAY
        # ════════════════════════════════════════════════════════════
        st.divider()
        # ── Headline Score ──
        # (Removed dead `band_colors` dict — it was never consumed;
        #  st.metric has no color parameter.)
        band = composite["fluency_band"]
        pct = composite["composite_percentile"]
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Fluency Percentile", f"{pct:.1f}%")
        with col2:
            st.metric("Fluency Band", band)
        with col3:
            ci = f"[{composite['composite_ci_low']:.2f}, {composite['composite_ci_high']:.2f}]"
            st.metric("95% Confidence Interval", ci)
        st.divider()
        # ── Transcript ──
        st.subheader("Transcript")
        # A real label (hidden) instead of "" — avoids Streamlit's
        # empty-label accessibility warning.
        st.text_area("Transcript", transcript, height=100, disabled=True,
                     label_visibility="collapsed")
        # ── Two columns: Temporal + Pauses ──
        col_left, col_right = st.columns(2)
        with col_left:
            st.subheader("Temporal Profile")
            st.metric("Speech Ratio", f"{vad_features['speech_ratio']:.1%}")
            st.metric("Mean Length of Utterance", f"{vad_features['mlu']:.1f} words")
            st.metric("Word Count", str(word_count))
            st.metric("Duration", f"{vad_features['total_duration_sec']:.1f}s")
        with col_right:
            st.subheader("Pause Behavior")
            freq_label = predictions.get("pause_freq_ordinal_label", "?")
            dur_label = predictions.get("pause_dur_ordinal_label", "?")
            place_label = predictions.get("pause_place_ordinal_label", "?")
            st.metric("Pause Frequency", f"{freq_label} ({vad_features['pause_frequency_per_sec']:.2f}/s)")
            st.metric("Pause Duration", f"{dur_label} (avg {vad_features['mean_pause_duration_sec']:.2f}s)")
            st.metric("Pause Placement", place_label)
            st.metric("Long Pause Ratio", f"{vad_features['long_pause_ratio']:.0%}")
        st.divider()
        # ── Diagnosis ──
        col_a, col_b = st.columns(2)
        with col_a:
            st.subheader("Hesitation Diagnosis")
            cl = predictions.get("cognitive_load_ordinal_label", "?")
            uc = predictions.get("utterance_constraints_ordinal_label", "?")
            st.metric("Cognitive Load", cl)
            st.metric("Utterance Constraints", uc)
            # Dominance: predicted proportions of each pause type.
            st.markdown("**Pause Type Breakdown**")
            pu = predictions.get("prop_unplanned_pred", 0)
            pp = predictions.get("prop_planned_pred", 0)
            pn = predictions.get("prop_neutral_pred", 0)
            st.progress(pu, text=f"Unplanned (hesitation): {pu:.0%}")
            st.progress(pp, text=f"Planned (breathing): {pp:.0%}")
            st.progress(pn, text=f"Neutral (filler): {pn:.0%}")
        with col_b:
            st.subheader("Articulation & Word Quality")
            art_label = predictions.get("articulation_ordinal_label", "?")
            st.metric("Articulation", art_label)
            st.metric("Word Confidence", f"{fa['fa_mean_word_confidence']:.1%}")
            st.metric("Filled Pauses", str(fa["fa_filled_pause_count"]))
            st.metric("Speech Rate CV", f"{fa['fa_speech_rate_cv']:.3f}")
        st.divider()
        # ── 6 Dimensions ──
        st.subheader("6 Fluency Dimensions")
        dim_labels = {
            "dim_continuity": ("Continuity", "How much and how long the speaker talks without pausing"),
            "dim_pause_quality": ("Pause Quality", "Average pause duration and long pause proportion"),
            "dim_placement": ("Placement", "Where pauses fall relative to sentence structure"),
            "dim_articulation": ("Articulation", "Smoothness of delivery (legato vs staccato)"),
            "dim_dominance": ("Dominance", "Balance of unplanned vs planned pauses"),
            "dim_word_precision": ("Word Precision", "Word-level confidence and speech rate consistency"),
        }
        # Lay the six dimension metrics out in a 3-wide grid.
        # (Removed dead `delta_color` — it was computed but never passed
        #  to st.metric, so it had no effect on the display.)
        cols = st.columns(3)
        for i, (dim_key, (label, desc)) in enumerate(dim_labels.items()):
            with cols[i % 3]:
                st.metric(label, f"{composite[dim_key]:.2f}", help=desc)
        st.divider()
        # ── Placement Details ──
        st.subheader("Pause Placement Details")
        pcol1, pcol2 = st.columns(2)
        with pcol1:
            st.metric("Boundary-aligned pauses", f"{placement['boundary_pause_ratio']:.0%}")
            st.metric("Mid-clause pauses", f"{placement['mid_clause_pause_ratio']:.0%}")
        with pcol2:
            st.metric("Content-word preceding", f"{syn['syn_content_word_preceding_ratio']:.0%}")
            st.metric("Function-word preceding", f"{syn['syn_function_word_preceding_ratio']:.0%}")
        # ── Raw Data Expander ──
        with st.expander("All Raw Features"):
            import pandas as pd
            all_data = {**all_features, **predictions, **composite}
            # One row → transposed so each feature becomes a row;
            # stringify values so mixed types render cleanly.
            feature_df = pd.DataFrame([all_data]).T
            feature_df.columns = ["Value"]
            feature_df["Value"] = feature_df["Value"].astype(str)
            st.dataframe(feature_df, use_container_width=True)
else:
    st.info("Upload a .wav, .mp3, or .m4a audio file, or record from your microphone to begin analysis.")