# fluency-benchmark / models/inference.py
# Provenance: Hugging Face commit 63fae5b (verified), author syt20 —
# "Replace with fluency_app_v3: updated models, new pipeline modules, experiments"
"""Load saved models and run inference on a feature vector.
v3: Supports FrankHall (HistGBT binary decomposition) models alongside mord.
Computes engineered features on-the-fly if models require them.
Uses per-target feature orders from training_config.json.
"""
import json
from pathlib import Path
import joblib
import numpy as np
# FrankHallOrdinal must be importable for joblib deserialization of v3 models
from models.frank_hall import FrankHallOrdinal # noqa: F401
# Directory of serialized models/scalers exported by training (../saved_models).
SAVE_DIR = Path(__file__).parent.parent / "saved_models"
# Module-level cache filled lazily by _load_all(); _loaded guards repeat loads.
_models = {}
_loaded = False
# Engineered feature computations (must match training notebook exactly)
# Each callable derives one engineered feature from the raw feature dict.
# Missing inputs default to 0; ratio denominators add 1e-6 to avoid division by zero.
ENGINEERED_FEATURES = {
    "pause_load": lambda f: f.get("pause_frequency_per_sec", 0) * f.get("mean_pause_duration_sec", 0),
    "continuity_composite": lambda f: f.get("mlu", 0) * f.get("speech_ratio", 0),
    "disruption_signal": lambda f: f.get("mid_clause_pause_ratio", 0) * f.get("fa_speech_rate_cv", 0),
    "pause_severity_ratio": lambda f: f.get("long_pause_ratio", 0) / (f.get("short_pause_share", 0) + 1e-6),
    "filled_pause_dominance": lambda f: f.get("fa_filled_pause_ratio", 0) / (f.get("pause_frequency_per_sec", 0) + 1e-6),
}
def _load_all():
    """Populate the module-level model cache from SAVE_DIR, once.

    Loads feature order, class mappings, optional per-target training config,
    every available ordinal model/scaler pair, and the dominance ridge model.
    Subsequent calls are no-ops thanks to the _loaded flag.
    """
    global _models, _loaded
    if _loaded:
        return

    def _read_json(filename):
        # Small helper: parse one JSON artifact from the save directory.
        with open(SAVE_DIR / filename) as fh:
            return json.load(fh)

    _models["feature_order"] = _read_json("feature_order.json")
    _models["class_mappings"] = _read_json("class_mappings.json")

    # Per-target training config (feature lists, model info) is optional.
    config_path = SAVE_DIR / "training_config.json"
    _models["training_config"] = _read_json("training_config.json") if config_path.exists() else {}

    # Load every ordinal model whose model+scaler pair exists on disk.
    ordinal = {}
    for target in (
        "articulation_ordinal", "pause_freq_ordinal", "pause_dur_ordinal",
        "pause_place_ordinal", "cognitive_load_ordinal", "utterance_constraints_ordinal",
    ):
        model_file = SAVE_DIR / f"ordinal_{target}.joblib"
        scaler_file = SAVE_DIR / f"scaler_{target}.joblib"
        if not (model_file.exists() and scaler_file.exists()):
            continue
        target_cfg = _models["training_config"].get(target, {})
        raw_mapping = _models["class_mappings"].get(target, {})
        ordinal[target] = {
            "model": joblib.load(model_file),
            "scaler": joblib.load(scaler_file),
            # JSON keys are strings; convert back to the ints the model emits.
            "class_unmap": {int(k): v for k, v in raw_mapping.items()},
            # Fall back to the global feature order when the config has no list.
            "features": target_cfg.get("features", _models["feature_order"]),
        }
    _models["ordinal"] = ordinal

    # Dominance (ridge regression) model, if both artifacts are present.
    ridge_file = SAVE_DIR / "dominance_ridge.joblib"
    ridge_scaler_file = SAVE_DIR / "scaler_dominance.joblib"
    if ridge_file.exists() and ridge_scaler_file.exists():
        _models["dominance"] = {
            "model": joblib.load(ridge_file),
            "scaler": joblib.load(ridge_scaler_file),
        }
    _loaded = True
def _build_feature_vector(features: dict, feature_list: list) -> np.ndarray:
"""Build feature vector, computing engineered features on-the-fly if needed."""
vals = []
for f in feature_list:
if f in features:
vals.append(features[f])
elif f in ENGINEERED_FEATURES:
vals.append(ENGINEERED_FEATURES[f](features))
else:
vals.append(0.0)
x = np.array([vals])
return np.nan_to_num(x, nan=0.0)
def predict(features: dict) -> dict:
    """Run every loaded model on a single feature dict.

    Args:
        features: dict with keys matching GUA_ALL feature names
    Returns:
        dict with ordinal predictions and dominance proportions
    """
    _load_all()
    output = {}
    # Human-readable labels per ordinal target (v3: FrankHall + mord compatible).
    ordinal_labels = {
        "articulation_ordinal": {1: "Legato", 2: "Portato", 3: "Staccato+"},
        "pause_freq_ordinal": {1: "Low", 2: "Medium", 3: "High+"},
        "pause_dur_ordinal": {1: "Short", 2: "Medium", 3: "Long"},
        "pause_place_ordinal": {1: "Boundary", 2: "Mixed", 3: "Mid-Constituent"},
        "cognitive_load_ordinal": {1: "Low", 2: "Medium", 3: "High"},
        "utterance_constraints_ordinal": {1: "Low", 2: "Medium", 3: "High"},
    }
    for target, bundle in _models.get("ordinal", {}).items():
        # v3: each target may carry its own feature list from training_config.
        feature_list = bundle.get("features", _models["feature_order"])
        vec = _build_feature_vector(features, feature_list)
        raw_pred = bundle["model"].predict(bundle["scaler"].transform(vec))[0]
        # Map the model's contiguous class index back to the original class id.
        klass = bundle["class_unmap"].get(int(raw_pred), int(raw_pred))
        output[f"{target}_pred"] = klass
        output[f"{target}_label"] = ordinal_labels.get(target, {}).get(klass, str(klass))

    # Dominance predictions (always uses the base feature order).
    dominance = _models.get("dominance")
    if dominance:
        vec = _build_feature_vector(features, _models["feature_order"])
        alr = dominance["model"].predict(dominance["scaler"].transform(vec))[0]
        # Invert the additive log-ratio transform back into proportions
        # (neutral is the reference category, contributing the 1 in the denominator).
        e_unplanned = np.exp(alr[0])
        e_planned = np.exp(alr[1])
        total = 1 + e_unplanned + e_planned
        output["prop_unplanned_pred"] = round(float(e_unplanned / total), 4)
        output["prop_planned_pred"] = round(float(e_planned / total), 4)
        output["prop_neutral_pred"] = round(float(1.0 / total), 4)
    return output