Spaces:
Running
Running
| """Train all models on the full benchmark dataset and save for inference. | |
| Run ONCE before using the app: | |
| python -m models.train_and_save | |
| """ | |
import json
import warnings
from pathlib import Path

import joblib
import mord
import numpy as np
import pandas as pd
from scipy.stats import kendalltau, spearmanr
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
| warnings.filterwarnings("ignore") | |
| BASE = Path(__file__).parent.parent | |
| DATA_PATH = BASE / "data" / "benchmark_stage2_scored.csv" | |
| JOINED_PATH = BASE / "data" / "benchmark_stage2_joined.csv" | |
| SAVE_DIR = BASE / "saved_models" | |
| GUA_CORE = [ | |
| "speech_ratio", "mlu", "mean_pause_duration_sec", | |
| "pause_frequency_per_sec", "long_pause_ratio", "short_pause_share", | |
| ] | |
| PLACEMENT = [ | |
| "boundary_pause_ratio", "mid_clause_pause_ratio", | |
| "mid_clause_mean_duration", "mid_clause_long_ratio", | |
| ] | |
| FA_FEATURES = [ | |
| "fa_filled_pause_count", "fa_filled_pause_ratio", | |
| "fa_mean_word_duration_sec", "fa_std_word_duration_sec", | |
| "fa_mean_word_confidence", "fa_low_confidence_word_ratio", | |
| "fa_articulation_rate", "fa_speech_rate_cv", | |
| "fa_pause_position_mean", "fa_pause_position_std", | |
| "fa_long_pause_ratio", | |
| ] | |
| SYN_FEATURES = [ | |
| "syn_content_word_preceding_ratio", "syn_function_word_preceding_ratio", | |
| "syn_content_word_following_ratio", "syn_clause_boundary_pause_ratio", | |
| ] | |
| GUA_ALL = GUA_CORE + PLACEMENT + FA_FEATURES + SYN_FEATURES | |
| ORDINAL_TARGETS = [ | |
| "articulation_ordinal", "pause_freq_ordinal", "pause_dur_ordinal", | |
| "pause_place_ordinal", "cognitive_load_ordinal", "utterance_constraints_ordinal", | |
| ] | |
| def train_ordinal_models(df): | |
| """Train best ordinal model per target on full dataset.""" | |
| models = {} | |
| for target in ORDINAL_TARGETS: | |
| valid = df[GUA_ALL + [target]].dropna() | |
| if len(valid) < 20: | |
| print(f" {target}: SKIPPED (N={len(valid)})") | |
| continue | |
| X = valid[GUA_ALL].values | |
| y = valid[target].astype(int).values | |
| scaler = StandardScaler() | |
| X_scaled = scaler.fit_transform(X) | |
| unique = np.sort(np.unique(y)) | |
| class_map = {c: i for i, c in enumerate(unique)} | |
| class_unmap = {i: c for c, i in class_map.items()} | |
| y_mapped = np.array([class_map[v] for v in y]) | |
| # Grid search for best model | |
| best_model, best_name, best_score = None, "", -1 | |
| for alpha in [0.01, 0.1, 1.0, 5.0, 10.0]: | |
| for ModelClass, mname in [ | |
| (mord.LogisticAT, "AT"), | |
| (mord.LogisticIT, "IT"), | |
| (mord.LogisticSE, "SE"), | |
| ]: | |
| try: | |
| m = ModelClass(alpha=alpha, max_iter=3000) | |
| m.fit(X_scaled, y_mapped) | |
| preds = m.predict(X_scaled) | |
| preds_orig = np.array([class_unmap.get(p, p) for p in preds]) | |
| acc = accuracy_score(y, preds_orig) | |
| tau, _ = kendalltau(y, preds_orig) | |
| tau = tau if not np.isnan(tau) else 0 | |
| score = 0.5 * acc + 0.5 * tau | |
| if score > best_score: | |
| best_score = score | |
| best_model = m | |
| best_name = f"{mname}(α={alpha})" | |
| except Exception: | |
| continue | |
| models[target] = { | |
| "model": best_model, | |
| "scaler": scaler, | |
| "class_map": class_map, | |
| "class_unmap": {str(k): int(v) for k, v in class_unmap.items()}, | |
| "name": best_name, | |
| "n_train": len(valid), | |
| } | |
| print(f" {target}: {best_name} N={len(valid)} score={best_score:.3f}") | |
| return models | |
| def train_dominance_model(df): | |
| """Train ALR Ridge model for pause dominance proportions.""" | |
| dom_cols = ["prop_unplanned", "prop_planned", "prop_neutral"] | |
| valid = df[GUA_ALL + dom_cols].dropna() | |
| if len(valid) < 20: | |
| print(" Dominance: SKIPPED") | |
| return None | |
| eps = 0.01 | |
| valid = valid.copy() | |
| valid["alr_unplanned"] = np.log((valid["prop_unplanned"] + eps) / (valid["prop_neutral"] + eps)) | |
| valid["alr_planned"] = np.log((valid["prop_planned"] + eps) / (valid["prop_neutral"] + eps)) | |
| scaler = StandardScaler() | |
| X = scaler.fit_transform(valid[GUA_ALL].values) | |
| y_alr = valid[["alr_unplanned", "alr_planned"]].values | |
| # Find best alpha | |
| best_alpha, best_rho = 1.0, -1 | |
| for alpha in [0.01, 0.1, 1.0, 10.0, 100.0]: | |
| ridge = Ridge(alpha=alpha) | |
| ridge.fit(X, y_alr) | |
| preds = ridge.predict(X) | |
| from scipy.stats import spearmanr | |
| rho, _ = spearmanr(y_alr[:, 0], preds[:, 0]) | |
| if not np.isnan(rho) and rho > best_rho: | |
| best_rho = rho | |
| best_alpha = alpha | |
| ridge = Ridge(alpha=best_alpha) | |
| ridge.fit(X, y_alr) | |
| print(f" Dominance: Ridge(α={best_alpha}) N={len(valid)} rho={best_rho:.3f}") | |
| return { | |
| "model": ridge, | |
| "scaler": scaler, | |
| "alpha": best_alpha, | |
| } | |
| def save_population_stats(df): | |
| """Save population means and stds for z-scoring at inference time.""" | |
| stats = {"means": {}, "stds": {}} | |
| for col in GUA_ALL + ORDINAL_TARGETS + ["prop_unplanned", "prop_planned", "prop_neutral"]: | |
| if col in df.columns: | |
| stats["means"][col] = float(df[col].mean()) if df[col].notna().any() else 0.0 | |
| stats["stds"][col] = float(df[col].std()) if df[col].notna().any() else 1.0 | |
| with open(SAVE_DIR / "population_stats.json", "w") as f: | |
| json.dump(stats, f, indent=2) | |
| # Save composite distribution for percentile computation | |
| if "composite_raw" in df.columns: | |
| dist = df["composite_raw"].dropna().sort_values().values | |
| np.save(SAVE_DIR / "benchmark_distribution.npy", dist) | |
| print(f" Population stats: {len(stats['means'])} features") | |
| print(f" Benchmark distribution: {len(dist)} values") | |
| def main(): | |
| SAVE_DIR.mkdir(exist_ok=True) | |
| # Load data | |
| scored = pd.read_csv(DATA_PATH) if DATA_PATH.exists() else None | |
| joined = pd.read_csv(JOINED_PATH) if JOINED_PATH.exists() else None | |
| df = joined if joined is not None else scored | |
| if df is None: | |
| print("ERROR: No data found. Copy benchmark_stage2_joined.csv to fluency_app/data/") | |
| return | |
| print(f"Loaded: {len(df)} rows, {len(df.columns)} columns") | |
| # Check required columns | |
| missing = [c for c in GUA_ALL if c not in df.columns] | |
| if missing: | |
| print(f"WARNING: Missing features: {missing}") | |
| print(" Some models may have reduced accuracy") | |
| # Train ordinal models | |
| print("\n── Training Ordinal Models ──") | |
| ordinal_models = train_ordinal_models(df) | |
| for target, info in ordinal_models.items(): | |
| joblib.dump(info["model"], SAVE_DIR / f"ordinal_{target}.joblib") | |
| joblib.dump(info["scaler"], SAVE_DIR / f"scaler_{target}.joblib") | |
| # Save class mappings | |
| class_mappings = {t: info["class_unmap"] for t, info in ordinal_models.items()} | |
| with open(SAVE_DIR / "class_mappings.json", "w") as f: | |
| json.dump(class_mappings, f, indent=2) | |
| # Train dominance model | |
| print("\n── Training Dominance Model ──") | |
| dom = train_dominance_model(df) | |
| if dom: | |
| joblib.dump(dom["model"], SAVE_DIR / "dominance_ridge.joblib") | |
| joblib.dump(dom["scaler"], SAVE_DIR / "scaler_dominance.joblib") | |
| # Save population stats | |
| print("\n── Saving Population Statistics ──") | |
| # Need scored data for composite distribution | |
| if scored is not None: | |
| save_population_stats(scored) | |
| else: | |
| save_population_stats(df) | |
| # Save feature order | |
| with open(SAVE_DIR / "feature_order.json", "w") as f: | |
| json.dump(GUA_ALL, f) | |
| print(f"\n All models saved to {SAVE_DIR}/") | |
| print(f" Files: {list(SAVE_DIR.glob('*'))}") | |
| if __name__ == "__main__": | |
| main() | |