| """ | |
| RhythmEnv Inference Evaluation — Baseline vs Trained, with meta-RL eval suite. | |
| Three evaluation conditions: | |
| 1. discrete-3-profiles: 3 hardcoded reference profiles. A sanity check | |
| that the meta-trained agent still handles the original named profiles. | |
| 2. continuous-in-distribution: Sampled profiles from the training distribution | |
| (was the agent able to learn the meta-policy?) | |
| 3. continuous-OOD: Profiles from a held-out region of the parameter space | |
| (does the meta-policy generalize, or did the agent memorize?) | |
| Usage: | |
| # Baselines only (no trained model): | |
| python training/inference_eval.py | |
| # With trained model: | |
| python training/inference_eval.py --model_path outputs/rhythmenv_trained | |
| """ | |
import argparse
import json
import os
import random
import sys
from typing import Optional

sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from models import ActionType, RhythmAction
from server.rhythm_environment import RhythmEnvironment, MAX_STEPS, sample_profile, profile_to_belief_vector
from training.dataset import heuristic_action

DISCRETE_PROFILES = ["introvert_morning", "extrovert_night_owl", "workaholic_stoic"]
SLOT_NAMES = ["Morning", "Afternoon", "Evening", "Night"]
DAY_NAMES = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]

# Seed ranges: training distribution = [0, 200); OOD = [10000, 10030).
# (The 10000 offset keeps profiles sampled from OOD seeds statistically
# distinct from the training distribution.)
IN_DIST_SEEDS_DEFAULT = list(range(100, 110))  # 10 in-distribution seeds unseen during training
OOD_SEEDS_DEFAULT = list(range(10000, 10010))  # 10 OOD seeds
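# Both defaults can be overridden from the CLI via --in_dist_seeds /
# --ood_seeds (parsed in main() below).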


def random_action(rng) -> ActionType:
    return rng.choice(list(ActionType))


def model_action(obs, model, tokenizer, return_belief: bool = False):
    """Get action (and optionally belief) from the trained model."""
    # Lazy imports: keep the heavy training-stack imports out of module load
    # so this script can run in baseline-only mode without unsloth/transformers.
    from training.dataset import format_observation_prompt, SYSTEM_PROMPT
    from training.reward_functions import extract_action_and_belief

    prompt = format_observation_prompt(obs)
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    # 256 tokens lets the SFT-distilled student emit its full
    # <reasoning>...</reasoning> block PLUS the final "S M W ACTION_NAME" line.
    # An earlier 20-token cap truncated generation mid-reasoning, so the answer
    # line was never reached and the parser fell back to extracting action
    # names from the partial reasoning text.
    outputs = model.generate(**inputs, max_new_tokens=256, temperature=0.7, do_sample=True)
    response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    action_type, belief, _ = extract_action_and_belief(response)
    if action_type is None:
        # Parse failure: fall back to a safe default action.
        action_type = ActionType.SLEEP
    return (action_type, belief) if return_belief else action_type
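
# A well-formed model response (hypothetical example; the exact wording varies)
# looks roughly like:
#
#     <reasoning>Serenity is low and it is Night; sleeping should recover it.</reasoning>
#     0.7 0.2 0.4 SLEEP
#
# where the three floats are the emitted belief vector (the "S M W" values)
# and the final token is the ActionType name that extract_action_and_belief()
# parses out.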


def run_episode(
    seed: int,
    strategy: str,
    profile_mode: str = "continuous",
    profile: Optional[str] = None,
    model=None,
    tokenizer=None,
) -> dict:
    """Run a single episode and return per-episode metrics."""
    rng = random.Random(seed + 500)
    env = RhythmEnvironment()
    if profile is not None:
        obs = env.reset(seed=seed, profile=profile)
    else:
        obs = env.reset(seed=seed, profile_mode=profile_mode)
    true_belief = env.get_belief_target()
    profile_name = env.state.profile_name

    total_reward = 0.0
    step_rewards = []
    actions_taken = []
    beliefs_seen = []  # only populated for the trained model

    for step in range(MAX_STEPS):
        if obs.done:
            break
        if strategy == "heuristic":
            action_type = heuristic_action(obs)
        elif strategy == "random":
            action_type = random_action(rng)
        elif strategy == "model" and model is not None:
            action_type, belief = model_action(obs, model, tokenizer, return_belief=True)
            beliefs_seen.append(belief)
            # Tell the env about the emitted belief so the grader can score
            # belief_accuracy. Heuristic / random skip this — they get 0 on
            # the belief component, by design.
            env.record_belief(belief)
        else:
            action_type = random_action(rng)

        action = RhythmAction(action_type=action_type)
        actions_taken.append(action_type.value)
        obs = env.step(action)
        total_reward += obs.reward
        step_rewards.append(obs.reward)

    final_score = obs.reward_breakdown.get("final_score", 0.0)

    # Adaptation: mean reward over the late half of the episode minus the mean
    # over the early half (positive = the agent improved as the episode went on).
    half = max(len(step_rewards) // 2, 1)
    early = step_rewards[:half]
    late = step_rewards[half:]
    adaptation = (sum(late) / len(late) - sum(early) / len(early)) if (early and late) else 0.0
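    # Worked example: step_rewards = [0.1, 0.2, 0.5, 0.6]
    #   -> early = [0.1, 0.2] (mean 0.15), late = [0.5, 0.6] (mean 0.55)
    #   -> adaptation = +0.40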

    # Belief tracking (only for the trained model)
    final_belief = beliefs_seen[-1] if beliefs_seen else None
    belief_mae = None
    if final_belief is not None:
        belief_mae = sum(abs(b - t) for b, t in zip(final_belief, true_belief)) / 3.0
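    # The divisor of 3.0 assumes a 3-dimensional belief vector, matching the
    # three floats the model emits on its answer line.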

    return {
        "profile_name": profile_name,
        "profile_mode": profile_mode if profile is None else "discrete",
        "strategy": strategy,
        "seed": seed,
        "final_score": round(final_score, 4),
        "total_reward": round(total_reward, 2),
        "adaptation": round(adaptation, 3),
        "vitality": round(obs.vitality, 2),
        "cognition": round(obs.cognition, 2),
        "progress": round(obs.progress, 2),
        "serenity": round(obs.serenity, 2),
        "connection": round(obs.connection, 2),
        "actions": actions_taken,
        "true_belief": [round(x, 3) for x in true_belief],
        "final_belief": [round(x, 3) for x in final_belief] if final_belief is not None else None,
        "belief_mae": round(belief_mae, 3) if belief_mae is not None else None,
    }


def eval_condition(
    name: str,
    strategies: list[str],
    runs: list[dict],
    model=None,
    tokenizer=None,
) -> list[dict]:
    """Run an eval condition and print a per-strategy summary."""
    print(f"\n{'=' * 60}")
    print(f"Condition: {name}")
    print(f"{'=' * 60}")

    results = []
    for strategy in strategies:
        print(f"\n Strategy: {strategy.upper()}")
        scores = []
        adaptations = []
        belief_maes = []
        for run in runs:
            r = run_episode(strategy=strategy, model=model, tokenizer=tokenizer, **run)
            results.append({"condition": name, **r})
            scores.append(r["final_score"])
            adaptations.append(r["adaptation"])
            if r["belief_mae"] is not None:
                belief_maes.append(r["belief_mae"])
        avg_score = sum(scores) / len(scores) if scores else 0.0
        avg_adapt = sum(adaptations) / len(adaptations) if adaptations else 0.0
        avg_mae = sum(belief_maes) / len(belief_maes) if belief_maes else None
        line = f" avg_score={avg_score:.3f} avg_adaptation={avg_adapt:+.3f}"
        if avg_mae is not None:
            line += f" avg_belief_mae={avg_mae:.3f}"
        print(line)
    return results
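
# Minimal standalone sketch (assumes the imports above; baselines only,
# no trained model needed):
#
#     runs = [{"seed": s, "profile_mode": "continuous"} for s in range(3)]
#     results = eval_condition("smoke-test", ["heuristic", "random"], runs)
#
# "smoke-test" is an arbitrary label here; any condition name works.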


def main():
    parser = argparse.ArgumentParser(description="Evaluate RhythmEnv agent (meta-RL eval suite)")
    parser.add_argument("--model_path", type=str, default=None,
                        help="Path to trained model (skip for baseline-only eval)")
    parser.add_argument("--num_episodes", type=int, default=5,
                        help="Episodes per condition per strategy (for discrete: per profile)")
    parser.add_argument("--output_file", type=str, default="eval_results.json")
    parser.add_argument("--in_dist_seeds", type=str, default=None,
                        help="Comma-separated seeds for in-distribution eval")
    parser.add_argument("--ood_seeds", type=str, default=None,
                        help="Comma-separated seeds for OOD eval")
    args = parser.parse_args()

    in_dist_seeds = (
        [int(s) for s in args.in_dist_seeds.split(",")] if args.in_dist_seeds
        else IN_DIST_SEEDS_DEFAULT[:args.num_episodes * 2]
    )
    ood_seeds = (
        [int(s) for s in args.ood_seeds.split(",")] if args.ood_seeds
        else OOD_SEEDS_DEFAULT[:args.num_episodes * 2]
    )
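    # e.g. --in_dist_seeds 100,101,102 --ood_seeds 10000,10001,10002 overrides
    # the defaults; otherwise up to 2 * num_episodes seeds are taken from the
    # default lists above.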

    model, tokenizer = None, None
    strategies = ["heuristic", "random"]
    if args.model_path and os.path.exists(args.model_path):
        try:
            from unsloth import FastLanguageModel

            # max_seq_length=2048 must accommodate the user prompt with 7-step
            # history and per-meter anomalies (~900-1200 tokens) PLUS
            # max_new_tokens=256 for the CoT response. An earlier value of 768
            # silently truncated prompts on the LEFT (it kept the end of the
            # prompt but lost the system instructions or older meter history),
            # producing incoherent model outputs.
            model, tokenizer = FastLanguageModel.from_pretrained(
                model_name=args.model_path,
                load_in_4bit=True,
                max_seq_length=2048,
            )
            FastLanguageModel.for_inference(model)
            strategies.append("model")
            print(f"Loaded trained model from: {args.model_path}")
        except Exception as e:
            print(f"Warning: Could not load model: {e}")
            print("Running baseline-only evaluation.")

    all_results = []

    # Condition 1: 3 hardcoded reference profiles (sanity-check that the agent
    # still handles the named profiles — no longer the primary eval signal).
    discrete_runs = [
        {"seed": ep, "profile": p}
        for p in DISCRETE_PROFILES for ep in range(args.num_episodes)
    ]
    all_results += eval_condition(
        "discrete-3-profiles",
        strategies, discrete_runs,
        model=model, tokenizer=tokenizer,
    )

    # Condition 2: in-distribution sampled profiles
    in_dist_runs = [{"seed": s, "profile_mode": "continuous"} for s in in_dist_seeds]
    all_results += eval_condition(
        "continuous-in-distribution",
        strategies, in_dist_runs,
        model=model, tokenizer=tokenizer,
    )

    # Condition 3: OOD sampled profiles (the meta-learning generalization test)
    ood_runs = [{"seed": s, "profile_mode": "continuous"} for s in ood_seeds]
    all_results += eval_condition(
        "continuous-OOD (generalization)",
        strategies, ood_runs,
        model=model, tokenizer=tokenizer,
    )

    # Per-profile breakdown for the 3 reference profiles
    print(f"\n{'=' * 70}")
    print("DISCRETE-3-PROFILE BREAKDOWN")
    print(f"{'=' * 70}")
    print(f"{'Profile':<25} ", end="")
    for s in strategies:
        print(f"{s:>10}", end="")
    print()
    print("-" * 70)
    discrete = [r for r in all_results if r["condition"] == "discrete-3-profiles"]
    for profile in DISCRETE_PROFILES:
        row = f"{profile:<25} "
        for s in strategies:
            rs = [r for r in discrete if r["profile_name"] == profile and r["strategy"] == s]
            avg = sum(r["final_score"] for r in rs) / len(rs) if rs else 0.0
            row += f"{avg:>10.3f}"
        print(row)

    # Save all per-episode records as JSON
    with open(args.output_file, "w") as f:
        json.dump(all_results, f, indent=2)
    print(f"\nResults saved to: {args.output_file}")


if __name__ == "__main__":
    main()