""" RhythmEnv Inference Evaluation — Baseline vs Trained, with meta-RL eval suite. Three evaluation conditions: 1. discrete-3-profiles: 3 hardcoded reference profiles. A sanity check that the meta-trained agent still handles the original named profiles. 2. continuous-in-distribution: Sampled profiles from the training distribution (was the agent able to learn the meta-policy?) 3. continuous-OOD: Profiles from a held-out region of the parameter space (does the meta-policy generalize, or did the agent memorize?) Usage: # Baselines only (no trained model): python training/inference_eval.py # With trained model: python training/inference_eval.py --model_path outputs/rhythmenv_trained """ import argparse import json import os import random import sys from typing import Optional sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) from models import ActionType, RhythmAction from server.rhythm_environment import RhythmEnvironment, MAX_STEPS, sample_profile, profile_to_belief_vector from training.dataset import heuristic_action DISCRETE_PROFILES = ["introvert_morning", "extrovert_night_owl", "workaholic_stoic"] SLOT_NAMES = ["Morning", "Afternoon", "Evening", "Night"] DAY_NAMES = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"] # Seed ranges: training distribution = [0, 200); OOD = [10000, 10030) # (10000 offset makes seeded sampled profiles in OOD region statistically distinct) IN_DIST_SEEDS_DEFAULT = list(range(100, 110)) # 10 unseen-by-training in-distribution OOD_SEEDS_DEFAULT = list(range(10000, 10010)) # 10 OOD seeds def random_action(rng) -> ActionType: return rng.choice(list(ActionType)) def model_action(obs, model, tokenizer, return_belief: bool = False): """Get action (and optionally belief) from trained model.""" # Lazy imports: keep the heavy training-stack imports out of module load # so this script can run in baseline-only mode without unsloth/transformers. from training.dataset import format_observation_prompt, SYSTEM_PROMPT from training.reward_functions import extract_action_and_belief prompt = format_observation_prompt(obs) messages = [ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": prompt}, ] text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) inputs = tokenizer(text, return_tensors="pt").to(model.device) # 256 tokens lets the SFT-distilled student emit its full # ... block PLUS the final S M W ACTION_NAME line. # Earlier 20-token cap truncated mid-reasoning so the answer line was # never reached and parser fell back to extracting action names from # the partial reasoning text. 
    outputs = model.generate(**inputs, max_new_tokens=256, temperature=0.7, do_sample=True)
    response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)

    action_type, belief, _ = extract_action_and_belief(response)
    if action_type is None:
        action_type = ActionType.SLEEP
    return (action_type, belief) if return_belief else action_type


def run_episode(
    seed: int,
    strategy: str,
    profile_mode: str = "continuous",
    profile: Optional[str] = None,
    model=None,
    tokenizer=None,
) -> dict:
    """Run a single episode and return per-episode metrics."""
    rng = random.Random(seed + 500)
    env = RhythmEnvironment()
    if profile is not None:
        obs = env.reset(seed=seed, profile=profile)
    else:
        obs = env.reset(seed=seed, profile_mode=profile_mode)
    true_belief = env.get_belief_target()
    profile_name = env.state.profile_name

    total_reward = 0.0
    step_rewards = []
    actions_taken = []
    beliefs_seen = []  # for trained model

    for _ in range(MAX_STEPS):
        if obs.done:
            break
        if strategy == "heuristic":
            action_type = heuristic_action(obs)
        elif strategy == "random":
            action_type = random_action(rng)
        elif strategy == "model" and model is not None:
            action_type, belief = model_action(obs, model, tokenizer, return_belief=True)
            beliefs_seen.append(belief)
            # Tell the env about the emitted belief so the grader can score
            # belief_accuracy. Heuristic / random skip this; they get 0 on
            # the belief component, by design.
            env.record_belief(belief)
        else:
            action_type = random_action(rng)

        action = RhythmAction(action_type=action_type)
        actions_taken.append(action_type.value)
        obs = env.step(action)
        total_reward += obs.reward
        step_rewards.append(obs.reward)

    final_score = obs.reward_breakdown.get("final_score", 0.0)

    # Adaptation: late-half mean reward minus early-half mean reward.
    half = max(len(step_rewards) // 2, 1)
    early = step_rewards[:half]
    late = step_rewards[half:]
    adaptation = (sum(late) / len(late) - sum(early) / len(early)) if (early and late) else 0.0

    # Belief tracking (only for trained model).
    final_belief = beliefs_seen[-1] if beliefs_seen else None
    belief_mae = None
    if final_belief is not None:
        belief_mae = sum(abs(b - t) for b, t in zip(final_belief, true_belief)) / 3.0

    return {
        "profile_name": profile_name,
        "profile_mode": profile_mode if profile is None else "discrete",
        "strategy": strategy,
        "seed": seed,
        "final_score": round(final_score, 4),
        "total_reward": round(total_reward, 2),
        "adaptation": round(adaptation, 3),
        "vitality": round(obs.vitality, 2),
        "cognition": round(obs.cognition, 2),
        "progress": round(obs.progress, 2),
        "serenity": round(obs.serenity, 2),
        "connection": round(obs.connection, 2),
        "actions": actions_taken,
        "true_belief": [round(x, 3) for x in true_belief],
        "final_belief": [round(x, 3) for x in final_belief] if final_belief is not None else None,
        "belief_mae": round(belief_mae, 3) if belief_mae is not None else None,
    }
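# Worked example of the adaptation metric above (illustrative only, with
# made-up step rewards; this helper is not called by the script): for rewards
# [0.1, 0.2, 0.6, 0.7], half = 2, early mean = 0.15, late mean = 0.65, so
# adaptation = +0.50. A positive value means the agent earned more reward once
# it had steps of evidence to infer the hidden profile — the meta-learning
# signal this suite looks for.
def _adaptation_example() -> float:
    step_rewards = [0.1, 0.2, 0.6, 0.7]  # made-up rewards for illustration
    half = max(len(step_rewards) // 2, 1)
    early, late = step_rewards[:half], step_rewards[half:]
    return sum(late) / len(late) - sum(early) / len(early)  # -> 0.50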
def eval_condition(
    name: str,
    strategies: list[str],
    runs: list[dict],
    model=None,
    tokenizer=None,
) -> list[dict]:
    """Run an eval condition and print a summary."""
    print(f"\n{'=' * 60}")
    print(f"Condition: {name}")
    print(f"{'=' * 60}")

    results = []
    for strategy in strategies:
        print(f"\n Strategy: {strategy.upper()}")
        scores = []
        adaptations = []
        belief_maes = []
        for run in runs:
            r = run_episode(strategy=strategy, model=model, tokenizer=tokenizer, **run)
            results.append({"condition": name, **r})
            scores.append(r["final_score"])
            adaptations.append(r["adaptation"])
            if r["belief_mae"] is not None:
                belief_maes.append(r["belief_mae"])

        avg_score = sum(scores) / len(scores) if scores else 0.0
        avg_adapt = sum(adaptations) / len(adaptations) if adaptations else 0.0
        avg_mae = sum(belief_maes) / len(belief_maes) if belief_maes else None

        line = f" avg_score={avg_score:.3f} avg_adaptation={avg_adapt:+.3f}"
        if avg_mae is not None:
            line += f" avg_belief_mae={avg_mae:.3f}"
        print(line)

    return results


def main():
    parser = argparse.ArgumentParser(description="Evaluate RhythmEnv agent (meta-RL eval suite)")
    parser.add_argument("--model_path", type=str, default=None,
                        help="Path to trained model (skip for baseline only)")
    parser.add_argument("--num_episodes", type=int, default=5,
                        help="Episodes per condition per strategy (for discrete: per profile)")
    parser.add_argument("--output_file", type=str, default="eval_results.json")
    parser.add_argument("--in_dist_seeds", type=str, default=None,
                        help="Comma-separated seeds for in-distribution eval")
    parser.add_argument("--ood_seeds", type=str, default=None,
                        help="Comma-separated seeds for OOD eval")
    args = parser.parse_args()

    in_dist_seeds = (
        [int(s) for s in args.in_dist_seeds.split(",")]
        if args.in_dist_seeds
        else IN_DIST_SEEDS_DEFAULT[:args.num_episodes * 2]
    )
    ood_seeds = (
        [int(s) for s in args.ood_seeds.split(",")]
        if args.ood_seeds
        else OOD_SEEDS_DEFAULT[:args.num_episodes * 2]
    )

    model, tokenizer = None, None
    strategies = ["heuristic", "random"]
    if args.model_path and os.path.exists(args.model_path):
        try:
            from unsloth import FastLanguageModel

            # max_seq_length=2048 must accommodate the user prompt with
            # 7-step history + per-meter anomalies (~900-1200 tokens) PLUS
            # max_new_tokens=256 for the CoT response. The earlier value of
            # 768 silently truncated prompts on the LEFT (kept the end of
            # the prompt, lost the system instructions or older meter
            # history), producing incoherent model outputs.
            model, tokenizer = FastLanguageModel.from_pretrained(
                model_name=args.model_path,
                load_in_4bit=True,
                max_seq_length=2048,
            )
            FastLanguageModel.for_inference(model)
            strategies.append("model")
            print(f"Loaded trained model from: {args.model_path}")
        except Exception as e:
            print(f"Warning: Could not load model: {e}")
            print("Running baseline-only evaluation.")
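    # Each entry in the `runs` lists below is a kwargs dict splatted into
    # run_episode() by eval_condition(). Illustrative shapes (values made up):
    #   {"seed": 3, "profile": "introvert_morning"}  -> fixed named profile
    #   {"seed": 104, "profile_mode": "continuous"}  -> seeded sampled profile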
    all_results = []

    # Condition 1: the 3 hardcoded reference profiles (sanity check that the
    # agent still handles the named profiles; no longer the primary eval
    # signal).
    discrete_runs = [
        {"seed": ep, "profile": p}
        for p in DISCRETE_PROFILES
        for ep in range(args.num_episodes)
    ]
    all_results += eval_condition(
        "discrete-3-profiles", strategies, discrete_runs,
        model=model, tokenizer=tokenizer,
    )

    # Condition 2: in-distribution sampled profiles.
    in_dist_runs = [{"seed": s, "profile_mode": "continuous"} for s in in_dist_seeds]
    all_results += eval_condition(
        "continuous-in-distribution", strategies, in_dist_runs,
        model=model, tokenizer=tokenizer,
    )

    # Condition 3: OOD sampled profiles (the meta-learning generalization test).
    ood_runs = [{"seed": s, "profile_mode": "continuous"} for s in ood_seeds]
    all_results += eval_condition(
        "continuous-OOD (generalization)", strategies, ood_runs,
        model=model, tokenizer=tokenizer,
    )

    # Per-profile breakdown for the 3 reference profiles.
    print(f"\n{'=' * 70}")
    print("DISCRETE-3-PROFILE BREAKDOWN")
    print(f"{'=' * 70}")
    print(f"{'Profile':<25} ", end="")
    for s in strategies:
        print(f"{s:>10}", end="")
    print()
    print("-" * 70)
    discrete = [r for r in all_results if r["condition"] == "discrete-3-profiles"]
    for profile in DISCRETE_PROFILES:
        row = f"{profile:<25} "
        for s in strategies:
            rs = [r for r in discrete if r["profile_name"] == profile and r["strategy"] == s]
            avg = sum(r["final_score"] for r in rs) / len(rs) if rs else 0.0
            row += f"{avg:>10.3f}"
        print(row)

    # Save results.
    with open(args.output_file, "w") as f:
        json.dump(all_results, f, indent=2)
    print(f"\nResults saved to: {args.output_file}")


if __name__ == "__main__":
    main()
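# Quick post-hoc analysis sketch (illustrative, not part of the script; run it
# separately once eval_results.json exists — the schema mirrors run_episode's
# return dict plus the "condition" key added by eval_condition):
#   import json
#   rows = json.load(open("eval_results.json"))
#   ood = [r for r in rows if r["condition"].startswith("continuous-OOD")]
#   print(sum(r["final_score"] for r in ood) / len(ood))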