"""Load saved models and run inference on a feature vector.

v3: Supports FrankHall (HistGBT binary decomposition) models alongside mord.
Computes engineered features on-the-fly if models require them.
Uses per-target feature orders from training_config.json.
"""
import json
from pathlib import Path

import joblib
import numpy as np

# FrankHallOrdinal must be importable for joblib deserialization of v3 models
from models.frank_hall import FrankHallOrdinal  # noqa: F401

SAVE_DIR = Path(__file__).parent.parent / "saved_models"

# Lazily-populated registry of deserialized models/scalers; guarded by _loaded.
_models = {}
_loaded = False

# Engineered feature computations (must match training notebook exactly).
# Each entry maps an engineered-feature name to a callable that derives it
# from the raw feature dict; missing inputs default to 0, and the 1e-6
# epsilon guards the ratio features against division by zero.
ENGINEERED_FEATURES = {
    "pause_load": lambda f: f.get("pause_frequency_per_sec", 0) * f.get("mean_pause_duration_sec", 0),
    "continuity_composite": lambda f: f.get("mlu", 0) * f.get("speech_ratio", 0),
    "disruption_signal": lambda f: f.get("mid_clause_pause_ratio", 0) * f.get("fa_speech_rate_cv", 0),
    "pause_severity_ratio": lambda f: f.get("long_pause_ratio", 0) / (f.get("short_pause_share", 0) + 1e-6),
    "filled_pause_dominance": lambda f: f.get("fa_filled_pause_ratio", 0) / (f.get("pause_frequency_per_sec", 0) + 1e-6),
}

# Ordinal targets that may have saved model/scaler pairs on disk.
_ORDINAL_TARGETS = [
    "articulation_ordinal",
    "pause_freq_ordinal",
    "pause_dur_ordinal",
    "pause_place_ordinal",
    "cognitive_load_ordinal",
    "utterance_constraints_ordinal",
]

# Human-readable labels for each ordinal target's original class codes.
_LABEL_MAPS = {
    "articulation_ordinal": {1: "Legato", 2: "Portato", 3: "Staccato+"},
    "pause_freq_ordinal": {1: "Low", 2: "Medium", 3: "High+"},
    "pause_dur_ordinal": {1: "Short", 2: "Medium", 3: "Long"},
    "pause_place_ordinal": {1: "Boundary", 2: "Mixed", 3: "Mid-Constituent"},
    "cognitive_load_ordinal": {1: "Low", 2: "Medium", 3: "High"},
    "utterance_constraints_ordinal": {1: "Low", 2: "Medium", 3: "High"},
}


def _load_all():
    """Load all saved artifacts from SAVE_DIR into the module-level registry.

    Idempotent: subsequent calls are no-ops once _loaded is set. Missing
    optional artifacts (training_config.json, individual ordinal targets,
    the dominance model) are skipped silently; feature_order.json and
    class_mappings.json are required and raise FileNotFoundError if absent.
    NOTE: not thread-safe — assumes single-threaded first call.
    """
    global _models, _loaded
    if _loaded:
        return

    with open(SAVE_DIR / "feature_order.json") as f:
        _models["feature_order"] = json.load(f)
    with open(SAVE_DIR / "class_mappings.json") as f:
        _models["class_mappings"] = json.load(f)

    # Load per-target training config (feature lists, model info)
    config_path = SAVE_DIR / "training_config.json"
    if config_path.exists():
        with open(config_path) as f:
            _models["training_config"] = json.load(f)
    else:
        _models["training_config"] = {}

    # Load ordinal models: each target contributes a fitted model, its
    # scaler, a contiguous-class -> original-class unmapping, and the
    # feature list it was trained on (falls back to the global order).
    _models["ordinal"] = {}
    for t in _ORDINAL_TARGETS:
        model_path = SAVE_DIR / f"ordinal_{t}.joblib"
        scaler_path = SAVE_DIR / f"scaler_{t}.joblib"
        if model_path.exists() and scaler_path.exists():
            tc = _models["training_config"].get(t, {})
            _models["ordinal"][t] = {
                "model": joblib.load(model_path),
                "scaler": joblib.load(scaler_path),
                # JSON keys arrive as strings; normalize to int class codes.
                "class_unmap": {int(k): v for k, v in _models["class_mappings"].get(t, {}).items()},
                "features": tc.get("features", _models["feature_order"]),
            }

    # Load dominance model (Ridge regression on ALR-transformed proportions)
    dom_path = SAVE_DIR / "dominance_ridge.joblib"
    dom_scaler_path = SAVE_DIR / "scaler_dominance.joblib"
    if dom_path.exists() and dom_scaler_path.exists():
        _models["dominance"] = {
            "model": joblib.load(dom_path),
            "scaler": joblib.load(dom_scaler_path),
        }

    _loaded = True


def _build_feature_vector(features: dict, feature_list: list) -> np.ndarray:
    """Build a (1, len(feature_list)) float array in feature_list order.

    Resolution order per name: raw value from `features`, else computed
    via ENGINEERED_FEATURES, else 0.0. NaNs are zeroed so a single bad
    upstream value cannot poison the whole prediction.
    """
    vals = []
    for f in feature_list:
        if f in features:
            vals.append(features[f])
        elif f in ENGINEERED_FEATURES:
            vals.append(ENGINEERED_FEATURES[f](features))
        else:
            vals.append(0.0)
    # dtype=float fails fast on non-numeric values instead of silently
    # producing an object array that breaks nan_to_num downstream.
    x = np.array([vals], dtype=float)
    return np.nan_to_num(x, nan=0.0)


def predict(features: dict) -> dict:
    """Run all models on a feature dict.

    Args:
        features: dict with keys matching GUA_ALL feature names

    Returns:
        dict with ordinal predictions and dominance proportions
    """
    _load_all()
    results = {}

    # Ordinal predictions (v3: per-target feature lists, FrankHall + mord compatible)
    for target, info in _models.get("ordinal", {}).items():
        target_features = info.get("features", _models["feature_order"])
        x = _build_feature_vector(features, target_features)
        x_scaled = info["scaler"].transform(x)
        pred_mapped = info["model"].predict(x_scaled)[0]
        # Map the model's contiguous class index back to the original code;
        # fall back to the raw index if the mapping has no entry for it.
        pred_orig = info["class_unmap"].get(int(pred_mapped), int(pred_mapped))
        label = _LABEL_MAPS.get(target, {}).get(pred_orig, str(pred_orig))
        results[f"{target}_pred"] = pred_orig
        results[f"{target}_label"] = label

    # Dominance predictions (always uses base 25 features)
    dom = _models.get("dominance")
    if dom:
        x = _build_feature_vector(features, _models["feature_order"])
        x_scaled = dom["scaler"].transform(x)
        alr_pred = dom["model"].predict(x_scaled)[0]
        # Inverse additive-log-ratio transform, computed as a softmax over
        # logits [unplanned, planned, 0] with max subtraction so large
        # regression outputs cannot overflow np.exp (raw exp() would yield
        # inf/nan proportions). Mathematically identical to
        # e^u / (1 + e^u + e^p), e^p / (...), 1 / (...).
        logits = np.array([alr_pred[0], alr_pred[1], 0.0])
        logits -= logits.max()
        exps = np.exp(logits)
        probs = exps / exps.sum()
        results["prop_unplanned_pred"] = round(float(probs[0]), 4)
        results["prop_planned_pred"] = round(float(probs[1]), 4)
        results["prop_neutral_pred"] = round(float(probs[2]), 4)

    return results