"""Train all models on the full benchmark dataset and save for inference.

Run ONCE before using the app:
    python -m models.train_and_save
"""

import json
import warnings
from pathlib import Path

import joblib
import mord
import numpy as np
import pandas as pd
from scipy.stats import kendalltau, spearmanr
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings("ignore")

BASE = Path(__file__).parent.parent
DATA_PATH = BASE / "data" / "benchmark_stage2_scored.csv"
JOINED_PATH = BASE / "data" / "benchmark_stage2_joined.csv"
SAVE_DIR = BASE / "saved_models"

GUA_CORE = [
    "speech_ratio",
    "mlu",
    "mean_pause_duration_sec",
    "pause_frequency_per_sec",
    "long_pause_ratio",
    "short_pause_share",
]

PLACEMENT = [
    "boundary_pause_ratio",
    "mid_clause_pause_ratio",
    "mid_clause_mean_duration",
    "mid_clause_long_ratio",
]

FA_FEATURES = [
    "fa_filled_pause_count",
    "fa_filled_pause_ratio",
    "fa_mean_word_duration_sec",
    "fa_std_word_duration_sec",
    "fa_mean_word_confidence",
    "fa_low_confidence_word_ratio",
    "fa_articulation_rate",
    "fa_speech_rate_cv",
    "fa_pause_position_mean",
    "fa_pause_position_std",
    "fa_long_pause_ratio",
]

SYN_FEATURES = [
    "syn_content_word_preceding_ratio",
    "syn_function_word_preceding_ratio",
    "syn_content_word_following_ratio",
    "syn_clause_boundary_pause_ratio",
]

# Full feature vector, in the order the scalers and models are fitted on.
GUA_ALL = GUA_CORE + PLACEMENT + FA_FEATURES + SYN_FEATURES

ORDINAL_TARGETS = [
    "articulation_ordinal",
    "pause_freq_ordinal",
    "pause_dur_ordinal",
    "pause_place_ordinal",
    "cognitive_load_ordinal",
    "utterance_constraints_ordinal",
]

# Proportion columns modelled by the dominance (ALR + Ridge) model.
DOMINANCE_COLS = ["prop_unplanned", "prop_planned", "prop_neutral"]

# Minimum number of complete rows required before a model is trained.
MIN_TRAIN_ROWS = 20

# Regularisation strengths tried by the two grid searches.
ORDINAL_ALPHAS = [0.01, 0.1, 1.0, 5.0, 10.0]
RIDGE_ALPHAS = [0.01, 0.1, 1.0, 10.0, 100.0]

# Additive smoothing so the log-ratio is defined when a proportion is 0.
ALR_EPS = 0.01


def train_ordinal_models(df):
    """Train the best ordinal model per target on the full dataset.

    For every name in ``ORDINAL_TARGETS`` the rows with complete
    ``GUA_ALL`` features and a non-null target are standardised and used
    to fit each mord model class (LogisticAT / LogisticIT / LogisticSE)
    at each alpha in ``ORDINAL_ALPHAS``. The candidate with the highest
    ``0.5 * accuracy + 0.5 * Kendall tau`` is kept.

    A target is skipped (with a printed reason) when a required column is
    missing, when fewer than ``MIN_TRAIN_ROWS`` complete rows exist, or
    when no candidate could be fitted.

    Args:
        df: DataFrame holding the feature and target columns.

    Returns:
        Dict mapping target name to a dict with keys ``model``,
        ``scaler``, ``class_map``, ``class_unmap``, ``name`` and
        ``n_train``. Skipped targets are absent.
    """
    models = {}
    for target in ORDINAL_TARGETS:
        required = GUA_ALL + [target]
        absent = [c for c in required if c not in df.columns]
        if absent:
            # Selecting absent columns would raise KeyError and abort
            # training of every remaining target.
            print(f" {target}: SKIPPED (missing columns: {absent})")
            continue

        valid = df[required].dropna()
        if len(valid) < MIN_TRAIN_ROWS:
            print(f" {target}: SKIPPED (N={len(valid)})")
            continue

        X = valid[GUA_ALL].values
        y = valid[target].astype(int).values

        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        # Remap the observed labels onto 0..K-1 for fitting and keep the
        # inverse mapping so predictions can be translated back.
        unique = np.sort(np.unique(y))
        class_map = {c: i for i, c in enumerate(unique)}
        class_unmap = {i: c for c, i in class_map.items()}
        y_mapped = np.array([class_map[v] for v in y])

        # Grid search for best model.
        # NOTE(review): candidates are scored on the same rows they were
        # fitted on, so this is an in-sample selection; consider
        # cross-validation if generalisation matters.
        best_model, best_name, best_score = None, "", -1
        failures = []
        for alpha in ORDINAL_ALPHAS:
            for ModelClass, mname in [
                (mord.LogisticAT, "AT"),
                (mord.LogisticIT, "IT"),
                (mord.LogisticSE, "SE"),
            ]:
                label = f"{mname}(α={alpha})"
                try:
                    m = ModelClass(alpha=alpha, max_iter=3000)
                    m.fit(X_scaled, y_mapped)
                    preds = m.predict(X_scaled)
                except Exception as exc:
                    # Best-effort search: one failing candidate must not
                    # abort the others, but record it for the report.
                    failures.append(f"{label}: {exc}")
                    continue

                preds_orig = np.array([class_unmap.get(p, p) for p in preds])
                acc = accuracy_score(y, preds_orig)
                tau, _ = kendalltau(y, preds_orig)
                # kendalltau yields NaN when either input is constant.
                tau = tau if not np.isnan(tau) else 0
                score = 0.5 * acc + 0.5 * tau
                if score > best_score:
                    best_score = score
                    best_model = m
                    best_name = label

        if best_model is None:
            # Never hand back a None "model": the caller would persist it.
            reason = failures[0] if failures else "no candidates"
            print(f" {target}: SKIPPED (no model could be fitted; {reason})")
            continue
        if failures:
            print(
                f" {target}: {len(failures)} candidate fit(s) failed, "
                f"e.g. {failures[0]}"
            )

        models[target] = {
            "model": best_model,
            "scaler": scaler,
            "class_map": class_map,
            # JSON-friendly copy: string keys, plain-int labels.
            "class_unmap": {str(k): int(v) for k, v in class_unmap.items()},
            "name": best_name,
            "n_train": len(valid),
        }
        print(f" {target}: {best_name} N={len(valid)} score={best_score:.3f}")

    return models


def train_dominance_model(df):
    """Train the ALR Ridge model for pause dominance proportions.

    The three proportions in ``DOMINANCE_COLS`` are converted to two
    additive log-ratios with ``prop_neutral`` as the reference, and a
    multi-output Ridge regression is fitted on the standardised
    ``GUA_ALL`` features. Alpha is chosen from ``RIDGE_ALPHAS`` by the
    Spearman correlation between fitted and actual ``alr_unplanned``.

    Args:
        df: DataFrame holding the feature and proportion columns.

    Returns:
        Dict with keys ``model``, ``scaler`` and ``alpha``, or ``None``
        when a required column is missing or fewer than
        ``MIN_TRAIN_ROWS`` complete rows exist.
    """
    required = GUA_ALL + DOMINANCE_COLS
    absent = [c for c in required if c not in df.columns]
    if absent:
        print(f" Dominance: SKIPPED (missing columns: {absent})")
        return None

    valid = df[required].dropna()
    if len(valid) < MIN_TRAIN_ROWS:
        print(" Dominance: SKIPPED")
        return None

    valid = valid.copy()
    neutral = valid["prop_neutral"] + ALR_EPS
    valid["alr_unplanned"] = np.log((valid["prop_unplanned"] + ALR_EPS) / neutral)
    valid["alr_planned"] = np.log((valid["prop_planned"] + ALR_EPS) / neutral)

    scaler = StandardScaler()
    X = scaler.fit_transform(valid[GUA_ALL].values)
    y_alr = valid[["alr_unplanned", "alr_planned"]].values

    # Find best alpha.
    # NOTE(review): rho is measured on the training rows themselves, so
    # this is an in-sample selection.
    best_alpha, best_rho = 1.0, -1
    for alpha in RIDGE_ALPHAS:
        candidate = Ridge(alpha=alpha)
        candidate.fit(X, y_alr)
        preds = candidate.predict(X)
        rho, _ = spearmanr(y_alr[:, 0], preds[:, 0])
        if not np.isnan(rho) and rho > best_rho:
            best_rho = rho
            best_alpha = alpha

    ridge = Ridge(alpha=best_alpha)
    ridge.fit(X, y_alr)
    print(f" Dominance: Ridge(α={best_alpha}) N={len(valid)} rho={best_rho:.3f}")

    return {
        "model": ridge,
        "scaler": scaler,
        "alpha": best_alpha,
    }


def save_population_stats(df):
    """Save population means and stds for z-scoring at inference time.

    Writes ``population_stats.json`` into ``SAVE_DIR`` with the mean and
    standard deviation of every feature, ordinal target and dominance
    column present in ``df``. When ``df`` has a ``composite_raw`` column,
    its sorted non-null values are also written to
    ``benchmark_distribution.npy``.

    A mean or std that is not finite (column entirely NaN, or a single
    observation, whose sample std is NaN) falls back to 0.0 / 1.0.

    Args:
        df: DataFrame to summarise. ``SAVE_DIR`` must already exist.
    """
    stats = {"means": {}, "stds": {}}
    for col in GUA_ALL + ORDINAL_TARGETS + DOMINANCE_COLS:
        if col not in df.columns:
            continue
        mean = df[col].mean()
        std = df[col].std()
        stats["means"][col] = float(mean) if np.isfinite(mean) else 0.0
        stats["stds"][col] = float(std) if np.isfinite(std) else 1.0

    with open(SAVE_DIR / "population_stats.json", "w", encoding="utf-8") as f:
        json.dump(stats, f, indent=2)

    print(f" Population stats: {len(stats['means'])} features")

    # Save composite distribution for percentile computation.
    if "composite_raw" in df.columns:
        dist = df["composite_raw"].dropna().sort_values().values
        np.save(SAVE_DIR / "benchmark_distribution.npy", dist)
        print(f" Benchmark distribution: {len(dist)} values")
    else:
        # `dist` does not exist on this path; report instead of crashing.
        print(" Benchmark distribution: SKIPPED (no 'composite_raw' column)")


def main():
    """Load the benchmark data, train every model and persist the artefacts.

    The joined CSV is preferred for training; the scored CSV is used when
    the joined one is absent. Population statistics are computed from the
    scored CSV when it exists, otherwise from the training frame. If
    neither file exists an error is printed and nothing is written apart
    from creating ``SAVE_DIR``.
    """
    SAVE_DIR.mkdir(parents=True, exist_ok=True)

    # Load data
    scored = pd.read_csv(DATA_PATH) if DATA_PATH.exists() else None
    joined = pd.read_csv(JOINED_PATH) if JOINED_PATH.exists() else None

    df = joined if joined is not None else scored
    if df is None:
        print(
            "ERROR: No data found. "
            "Copy benchmark_stage2_joined.csv to fluency_app/data/"
        )
        return

    print(f"Loaded: {len(df)} rows, {len(df.columns)} columns")

    # Check required columns
    missing = [c for c in GUA_ALL if c not in df.columns]
    if missing:
        print(f"WARNING: Missing features: {missing}")
        print(" Models that need these features will be skipped")

    # Train ordinal models
    print("\n── Training Ordinal Models ──")
    ordinal_models = train_ordinal_models(df)
    for target, info in ordinal_models.items():
        joblib.dump(info["model"], SAVE_DIR / f"ordinal_{target}.joblib")
        joblib.dump(info["scaler"], SAVE_DIR / f"scaler_{target}.joblib")

    # Save class mappings
    class_mappings = {t: info["class_unmap"] for t, info in ordinal_models.items()}
    with open(SAVE_DIR / "class_mappings.json", "w", encoding="utf-8") as f:
        json.dump(class_mappings, f, indent=2)

    # Train dominance model
    print("\n── Training Dominance Model ──")
    dom = train_dominance_model(df)
    if dom is not None:
        joblib.dump(dom["model"], SAVE_DIR / "dominance_ridge.joblib")
        joblib.dump(dom["scaler"], SAVE_DIR / "scaler_dominance.joblib")

    # Save population stats
    print("\n── Saving Population Statistics ──")
    # Need scored data for composite distribution
    save_population_stats(scored if scored is not None else df)

    # Save feature order
    with open(SAVE_DIR / "feature_order.json", "w", encoding="utf-8") as f:
        json.dump(GUA_ALL, f)

    print(f"\n All models saved to {SAVE_DIR}/")
    print(f" Files: {list(SAVE_DIR.glob('*'))}")


if __name__ == "__main__":
    main()