# fluency-benchmark / models/train_and_save.py
# Initial deploy: fluency benchmark app (keshavgautam03, commit 1e81b0d)
"""Train all models on the full benchmark dataset and save for inference.
Run ONCE before using the app:
python -m models.train_and_save
"""
import json
import warnings
from pathlib import Path

import joblib
import mord
import numpy as np
import pandas as pd
from scipy.stats import kendalltau, spearmanr
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
# Suppress noisy library warnings (e.g. solver convergence) during the grid search.
warnings.filterwarnings("ignore")
# BASE is the project root: this file lives one directory below it.
BASE = Path(__file__).parent.parent
# Primary (scored) and preferred (joined) benchmark CSVs; either may be absent.
DATA_PATH = BASE / "data" / "benchmark_stage2_scored.csv"
JOINED_PATH = BASE / "data" / "benchmark_stage2_joined.csv"
# All trained artifacts (models, scalers, stats) are written here.
SAVE_DIR = BASE / "saved_models"
# Core pause/speech-rate features.
GUA_CORE = [
"speech_ratio", "mlu", "mean_pause_duration_sec",
"pause_frequency_per_sec", "long_pause_ratio", "short_pause_share",
]
# Pause-placement features (boundary vs. mid-clause pauses).
PLACEMENT = [
"boundary_pause_ratio", "mid_clause_pause_ratio",
"mid_clause_mean_duration", "mid_clause_long_ratio",
]
# Forced-alignment ("fa_") derived features: filled pauses, word timing,
# alignment confidence, articulation rate, and pause positions.
FA_FEATURES = [
"fa_filled_pause_count", "fa_filled_pause_ratio",
"fa_mean_word_duration_sec", "fa_std_word_duration_sec",
"fa_mean_word_confidence", "fa_low_confidence_word_ratio",
"fa_articulation_rate", "fa_speech_rate_cv",
"fa_pause_position_mean", "fa_pause_position_std",
"fa_long_pause_ratio",
]
# Syntactic-context ("syn_") pause features.
SYN_FEATURES = [
"syn_content_word_preceding_ratio", "syn_function_word_preceding_ratio",
"syn_content_word_following_ratio", "syn_clause_boundary_pause_ratio",
]
# Full predictor set used by every model below.
GUA_ALL = GUA_CORE + PLACEMENT + FA_FEATURES + SYN_FEATURES
# Ordinal rating columns; one ordinal-regression model is trained per target.
ORDINAL_TARGETS = [
"articulation_ordinal", "pause_freq_ordinal", "pause_dur_ordinal",
"pause_place_ordinal", "cognitive_load_ordinal", "utterance_constraints_ordinal",
]
def train_ordinal_models(df):
    """Train the best ordinal-regression model per target on the full dataset.

    For each target in ORDINAL_TARGETS, grid-searches mord's LogisticAT /
    LogisticIT / LogisticSE over a small alpha grid, scoring candidates by
    0.5 * accuracy + 0.5 * Kendall's tau on the training data, and keeps
    the best-scoring fit.

    Args:
        df: DataFrame containing the GUA_ALL feature columns and the
            ordinal target columns (extra columns are ignored).

    Returns:
        Dict mapping target name -> {"model", "scaler", "class_map",
        "class_unmap", "name", "n_train"}. Targets with fewer than 20
        complete rows, or for which every candidate fit failed, are omitted.
    """
    models = {}
    for target in ORDINAL_TARGETS:
        valid = df[GUA_ALL + [target]].dropna()
        if len(valid) < 20:
            print(f" {target}: SKIPPED (N={len(valid)})")
            continue
        X = valid[GUA_ALL].values
        y = valid[target].astype(int).values
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        # mord expects contiguous class indices 0..k-1; map the raw ordinal
        # labels down and keep the inverse map to decode predictions.
        unique = np.sort(np.unique(y))
        class_map = {c: i for i, c in enumerate(unique)}
        class_unmap = {i: c for c, i in class_map.items()}
        y_mapped = np.array([class_map[v] for v in y])
        # Grid search for the best model. Scored on the training data itself:
        # the winner is refit on everything anyway, so no hold-out split here.
        best_model, best_name, best_score = None, "", -1
        for alpha in [0.01, 0.1, 1.0, 5.0, 10.0]:
            for ModelClass, mname in [
                (mord.LogisticAT, "AT"),
                (mord.LogisticIT, "IT"),
                (mord.LogisticSE, "SE"),
            ]:
                try:
                    m = ModelClass(alpha=alpha, max_iter=3000)
                    m.fit(X_scaled, y_mapped)
                    preds = m.predict(X_scaled)
                    preds_orig = np.array([class_unmap.get(p, p) for p in preds])
                    acc = accuracy_score(y, preds_orig)
                    tau, _ = kendalltau(y, preds_orig)
                    tau = tau if not np.isnan(tau) else 0
                    score = 0.5 * acc + 0.5 * tau
                    if score > best_score:
                        best_score = score
                        best_model = m
                        best_name = f"{mname}(α={alpha})"
                except Exception:
                    # Some model/alpha combinations fail to converge or raise
                    # on degenerate targets; skip them and try the next one.
                    continue
        if best_model is None:
            # Every candidate fit raised: nothing usable to store or dump.
            print(f" {target}: SKIPPED (all model fits failed)")
            continue
        models[target] = {
            "model": best_model,
            "scaler": scaler,
            "class_map": class_map,
            "class_unmap": {str(k): int(v) for k, v in class_unmap.items()},
            "name": best_name,
            "n_train": len(valid),
        }
        print(f" {target}: {best_name} N={len(valid)} score={best_score:.3f}")
    return models
def train_dominance_model(df):
    """Train an additive-log-ratio (ALR) Ridge model for pause-dominance proportions.

    The three proportion columns (unplanned / planned / neutral) are mapped to
    two ALR coordinates using "neutral" as the reference category, then a
    multi-output Ridge regression is fit from the GUA_ALL features.

    Args:
        df: DataFrame with the GUA_ALL feature columns and the
            prop_unplanned / prop_planned / prop_neutral columns.

    Returns:
        Dict with keys "model" (fitted Ridge), "scaler" (fitted
        StandardScaler) and "alpha" (selected regularization strength),
        or None when fewer than 20 complete rows are available.
    """
    dom_cols = ["prop_unplanned", "prop_planned", "prop_neutral"]
    valid = df[GUA_ALL + dom_cols].dropna()
    if len(valid) < 20:
        print(" Dominance: SKIPPED")
        return None
    # ALR transform with a small epsilon so zero proportions stay finite.
    eps = 0.01
    valid = valid.copy()
    valid["alr_unplanned"] = np.log((valid["prop_unplanned"] + eps) / (valid["prop_neutral"] + eps))
    valid["alr_planned"] = np.log((valid["prop_planned"] + eps) / (valid["prop_neutral"] + eps))
    scaler = StandardScaler()
    X = scaler.fit_transform(valid[GUA_ALL].values)
    y_alr = valid[["alr_unplanned", "alr_planned"]].values
    # Select alpha by in-sample Spearman rho on the first (unplanned) ALR
    # coordinate. spearmanr is imported at module level (it was previously
    # re-imported inside this loop on every iteration).
    best_alpha, best_rho = 1.0, -1
    for alpha in [0.01, 0.1, 1.0, 10.0, 100.0]:
        ridge = Ridge(alpha=alpha)
        ridge.fit(X, y_alr)
        preds = ridge.predict(X)
        rho, _ = spearmanr(y_alr[:, 0], preds[:, 0])
        if not np.isnan(rho) and rho > best_rho:
            best_rho = rho
            best_alpha = alpha
    # Refit once with the winning alpha on the full data.
    ridge = Ridge(alpha=best_alpha)
    ridge.fit(X, y_alr)
    print(f" Dominance: Ridge(α={best_alpha}) N={len(valid)} rho={best_rho:.3f}")
    return {
        "model": ridge,
        "scaler": scaler,
        "alpha": best_alpha,
    }
def save_population_stats(df):
    """Save population means and stds for z-scoring at inference time.

    Writes population_stats.json (per-feature mean/std, with 0.0/1.0
    fallbacks for all-NaN columns) and, when the "composite_raw" column is
    present, benchmark_distribution.npy (sorted composite scores used for
    percentile computation) into SAVE_DIR.

    Args:
        df: DataFrame of benchmark rows; only columns that exist are used.
    """
    stats = {"means": {}, "stds": {}}
    for col in GUA_ALL + ORDINAL_TARGETS + ["prop_unplanned", "prop_planned", "prop_neutral"]:
        if col in df.columns:
            stats["means"][col] = float(df[col].mean()) if df[col].notna().any() else 0.0
            stats["stds"][col] = float(df[col].std()) if df[col].notna().any() else 1.0
    with open(SAVE_DIR / "population_stats.json", "w") as f:
        json.dump(stats, f, indent=2)
    # Save composite distribution for percentile computation.
    dist = None
    if "composite_raw" in df.columns:
        dist = df["composite_raw"].dropna().sort_values().values
        np.save(SAVE_DIR / "benchmark_distribution.npy", dist)
    print(f" Population stats: {len(stats['means'])} features")
    # BUG FIX: the original printed len(dist) unconditionally, raising
    # NameError whenever "composite_raw" was missing from df.
    if dist is not None:
        print(f" Benchmark distribution: {len(dist)} values")
def main():
    """Train every model on the benchmark data and persist artifacts to SAVE_DIR."""
    SAVE_DIR.mkdir(exist_ok=True)
    # Load whichever benchmark CSVs exist; the joined file is preferred
    # for training, the scored file (if present) for population stats.
    scored = pd.read_csv(DATA_PATH) if DATA_PATH.exists() else None
    joined = pd.read_csv(JOINED_PATH) if JOINED_PATH.exists() else None
    df = scored if joined is None else joined
    if df is None:
        print("ERROR: No data found. Copy benchmark_stage2_joined.csv to fluency_app/data/")
        return
    print(f"Loaded: {len(df)} rows, {len(df.columns)} columns")
    # Warn (but continue) when expected feature columns are absent.
    absent = [feat for feat in GUA_ALL if feat not in df.columns]
    if absent:
        print(f"WARNING: Missing features: {absent}")
        print(" Some models may have reduced accuracy")
    # --- Ordinal models: dump each model + its scaler, collect label maps ---
    print("\n── Training Ordinal Models ──")
    ordinal_models = train_ordinal_models(df)
    label_maps = {}
    for tgt, bundle in ordinal_models.items():
        joblib.dump(bundle["model"], SAVE_DIR / f"ordinal_{tgt}.joblib")
        joblib.dump(bundle["scaler"], SAVE_DIR / f"scaler_{tgt}.joblib")
        label_maps[tgt] = bundle["class_unmap"]
    with open(SAVE_DIR / "class_mappings.json", "w") as f:
        json.dump(label_maps, f, indent=2)
    # --- Dominance model ---
    print("\n── Training Dominance Model ──")
    dom = train_dominance_model(df)
    if dom:
        joblib.dump(dom["model"], SAVE_DIR / "dominance_ridge.joblib")
        joblib.dump(dom["scaler"], SAVE_DIR / "scaler_dominance.joblib")
    # --- Population statistics (scored data carries the composite scores) ---
    print("\n── Saving Population Statistics ──")
    stats_source = scored if scored is not None else df
    save_population_stats(stats_source)
    # Record the exact feature order the saved models expect at inference.
    with open(SAVE_DIR / "feature_order.json", "w") as f:
        json.dump(GUA_ALL, f)
    print(f"\n All models saved to {SAVE_DIR}/")
    print(f" Files: {list(SAVE_DIR.glob('*'))}")
if __name__ == "__main__":
main()