# fluency-benchmark / models/inference.py
# Provenance: Hugging Face commit 63fae5b (verified), author syt20 —
# "Replace with fluency_app_v3: updated models, new pipeline modules, experiments"
"""Load saved models and run inference on a feature vector.
v3: Supports FrankHall (HistGBT binary decomposition) models alongside mord.
Computes engineered features on-the-fly if models require them.
Uses per-target feature orders from training_config.json.
"""
import json
from pathlib import Path
import joblib
import numpy as np
# FrankHallOrdinal must be importable for joblib deserialization of v3 models
from models.frank_hall import FrankHallOrdinal # noqa: F401
# Directory of serialized models/scalers exported by training (../saved_models).
SAVE_DIR = Path(__file__).parent.parent / "saved_models"
# Module-level cache filled lazily by _load_all(); _loaded guards repeat loads.
_models = {}
_loaded = False
# Engineered feature computations (must match training notebook exactly)
# Each callable derives one engineered feature from the raw feature dict.
# Missing inputs default to 0; ratio denominators add 1e-6 to avoid division by zero.
ENGINEERED_FEATURES = {
    "pause_load": lambda f: f.get("pause_frequency_per_sec", 0) * f.get("mean_pause_duration_sec", 0),
    "continuity_composite": lambda f: f.get("mlu", 0) * f.get("speech_ratio", 0),
    "disruption_signal": lambda f: f.get("mid_clause_pause_ratio", 0) * f.get("fa_speech_rate_cv", 0),
    "pause_severity_ratio": lambda f: f.get("long_pause_ratio", 0) / (f.get("short_pause_share", 0) + 1e-6),
    "filled_pause_dominance": lambda f: f.get("fa_filled_pause_ratio", 0) / (f.get("pause_frequency_per_sec", 0) + 1e-6),
}
def _load_all():
    """Populate the module-level model cache from SAVE_DIR, once.

    Loads feature order, class mappings, optional per-target training config,
    every available ordinal model/scaler pair, and the dominance ridge model.
    Subsequent calls are no-ops thanks to the _loaded flag.
    """
    global _models, _loaded
    if _loaded:
        return

    def _read_json(filename):
        # Small helper: parse one JSON artifact from the save directory.
        with open(SAVE_DIR / filename) as fh:
            return json.load(fh)

    _models["feature_order"] = _read_json("feature_order.json")
    _models["class_mappings"] = _read_json("class_mappings.json")

    # Per-target training config (feature lists, model info) is optional.
    config_path = SAVE_DIR / "training_config.json"
    _models["training_config"] = _read_json("training_config.json") if config_path.exists() else {}

    # Load every ordinal model whose model+scaler pair exists on disk.
    ordinal = {}
    for target in (
        "articulation_ordinal", "pause_freq_ordinal", "pause_dur_ordinal",
        "pause_place_ordinal", "cognitive_load_ordinal", "utterance_constraints_ordinal",
    ):
        model_file = SAVE_DIR / f"ordinal_{target}.joblib"
        scaler_file = SAVE_DIR / f"scaler_{target}.joblib"
        if not (model_file.exists() and scaler_file.exists()):
            continue
        target_cfg = _models["training_config"].get(target, {})
        raw_mapping = _models["class_mappings"].get(target, {})
        ordinal[target] = {
            "model": joblib.load(model_file),
            "scaler": joblib.load(scaler_file),
            # JSON keys are strings; convert back to the ints the model emits.
            "class_unmap": {int(k): v for k, v in raw_mapping.items()},
            # Fall back to the global feature order when the config has no list.
            "features": target_cfg.get("features", _models["feature_order"]),
        }
    _models["ordinal"] = ordinal

    # Dominance (ridge regression) model, if both artifacts are present.
    ridge_file = SAVE_DIR / "dominance_ridge.joblib"
    ridge_scaler_file = SAVE_DIR / "scaler_dominance.joblib"
    if ridge_file.exists() and ridge_scaler_file.exists():
        _models["dominance"] = {
            "model": joblib.load(ridge_file),
            "scaler": joblib.load(ridge_scaler_file),
        }
    _loaded = True
def _build_feature_vector(features: dict, feature_list: list) -> np.ndarray:
"""Build feature vector, computing engineered features on-the-fly if needed."""
vals = []
for f in feature_list:
if f in features:
vals.append(features[f])
elif f in ENGINEERED_FEATURES:
vals.append(ENGINEERED_FEATURES[f](features))
else:
vals.append(0.0)
x = np.array([vals])
return np.nan_to_num(x, nan=0.0)
def predict(features: dict) -> dict:
    """Run every loaded model on a single feature dict.

    Args:
        features: dict with keys matching GUA_ALL feature names
    Returns:
        dict with ordinal predictions and dominance proportions
    """
    _load_all()
    output = {}
    # Human-readable labels per ordinal target (v3: FrankHall + mord compatible).
    ordinal_labels = {
        "articulation_ordinal": {1: "Legato", 2: "Portato", 3: "Staccato+"},
        "pause_freq_ordinal": {1: "Low", 2: "Medium", 3: "High+"},
        "pause_dur_ordinal": {1: "Short", 2: "Medium", 3: "Long"},
        "pause_place_ordinal": {1: "Boundary", 2: "Mixed", 3: "Mid-Constituent"},
        "cognitive_load_ordinal": {1: "Low", 2: "Medium", 3: "High"},
        "utterance_constraints_ordinal": {1: "Low", 2: "Medium", 3: "High"},
    }
    for target, bundle in _models.get("ordinal", {}).items():
        # v3: each target may carry its own feature list from training_config.
        feature_list = bundle.get("features", _models["feature_order"])
        vec = _build_feature_vector(features, feature_list)
        raw_pred = bundle["model"].predict(bundle["scaler"].transform(vec))[0]
        # Map the model's contiguous class index back to the original class id.
        klass = bundle["class_unmap"].get(int(raw_pred), int(raw_pred))
        output[f"{target}_pred"] = klass
        output[f"{target}_label"] = ordinal_labels.get(target, {}).get(klass, str(klass))

    # Dominance predictions (always uses the base feature order).
    dominance = _models.get("dominance")
    if dominance:
        vec = _build_feature_vector(features, _models["feature_order"])
        alr = dominance["model"].predict(dominance["scaler"].transform(vec))[0]
        # Invert the additive log-ratio transform back into proportions
        # (neutral is the reference category, contributing the 1 in the denominator).
        e_unplanned = np.exp(alr[0])
        e_planned = np.exp(alr[1])
        total = 1 + e_unplanned + e_planned
        output["prop_unplanned_pred"] = round(float(e_unplanned / total), 4)
        output["prop_planned_pred"] = round(float(e_planned / total), 4)
        output["prop_neutral_pred"] = round(float(1.0 / total), 4)
    return output