""" check_leak.py — Validates BudgetRouterGRPOEnv before GRPO training. Checks: 1. Tool methods return strings (not crash). 2. Episode ends gracefully via ValueError (TRL-idiomatic done signal). 3. Reward is a float in [0, 1] — not a dict, not NaN. 4. History uses actual_degradation_start (jittered) — NOT the config constant. This proves grade_episode() will compute correct adaptation windows. 5. 10-step reward trajectory printed: verify no explosion/vanishing. 6. Provider status IS present in tool responses (intentional — text interface needs it). Run: uv run python check_leak.py """ import sys def main() -> None: try: from train.grpo_env import BudgetRouterGRPOEnv from budget_router.reward import grade_episode from budget_router.tasks import HARD_MULTI except ImportError as e: print(f"[FAIL] Import error: {e}") sys.exit(1) print("=" * 60) print("BudgetRouterGRPOEnv — Pre-training Validation") print("=" * 60) # ── Check 0: transformers version (soft warning — required for environment_factory) ── print("\n[CHECK 0] transformers version (required for environment_factory)...") try: import transformers ver_str = transformers.__version__ # TRL's environment_factory requires transformers >= 4.47.0 (confirmed shipping in # stable builds as of Apr 2026). Exact minimum threshold is version-specific to TRL. # If not installed, training will fail at import time — caught here early. print(f" ✅ transformers=={ver_str} installed.") # Soft check: warn if below 4.47 (minimum known to ship environment_factory support) major, minor = int(ver_str.split(".")[0]), int(ver_str.split(".")[1]) if major < 4 or (major == 4 and minor < 47): print( f" ⚠️ WARNING: transformers {ver_str} may be too old for environment_factory.\n" f" Recommended: pip install 'transformers>=4.47.0' or install from main." ) except ImportError: print( " ⚠️ WARNING: transformers is NOT installed in this venv.\n" " Install before GRPO training: pip install 'transformers>=4.47.0' trl accelerate peft" ) # ── Check 1: reset() returns a non-empty string ───────────────────── print("\n[CHECK 1] reset() returns rich text observation...") env = BudgetRouterGRPOEnv() obs_text = env.reset(scenario="hard_multi", seed=42) assert isinstance(obs_text, str) and len(obs_text) > 10, \ f"reset() should return non-empty string, got: {obs_text!r}" assert "Budget" in obs_text, "reset() should mention Budget" assert "Provider" in obs_text, "reset() should include provider status (text interface, not sanitized)" print(f" ✅ reset() returned {len(obs_text)} chars. 
    # ── Check 2: Tool methods return strings step-by-step ───────────────
    print("\n[CHECK 2] Tool methods return strings and accumulate history...")
    env2 = BudgetRouterGRPOEnv()
    env2.reset(scenario="hard_multi", seed=42)
    step_results = []
    episode_done = False
    for step in range(25):  # more than max_steps, to exercise the episode-end guard
        action_fn = [env2.route_to_a, env2.route_to_b, env2.shed_load, env2.route_to_b][step % 4]
        try:
            result = action_fn()
            assert isinstance(result, str), f"Tool method should return str, got {type(result)}"
            step_results.append(result)
            print(f"  Step {step + 1:02d}: ✅ {result[:80].replace(chr(10), ' ')}...")
        except ValueError as e:
            episode_done = True
            print(f"  Step {step + 1:02d}: ✅ Episode ended via ValueError (TRL-idiomatic): {str(e)[:80]}...")
            break
    assert episode_done, "Episode should end with ValueError before step 25"
    assert len(step_results) > 0, "At least one tool step should complete"
    print(f"  ✅ Episode ended correctly after {len(step_results)} tool calls.")

    # ── Check 3: Reward is float in [0, 1] ──────────────────────────────
    print("\n[CHECK 3] Reward is float in [0, 1]...")
    assert isinstance(env2.reward, float), \
        f"env.reward should be float, got {type(env2.reward)}: {env2.reward!r}"
    # NaN must be ruled out first: a NaN reward would also fail the range
    # comparison below, but with a misleading error message.
    assert not math.isnan(env2.reward), "env.reward is NaN — grade_episode bug"
    assert 0.0 <= env2.reward <= 1.0, \
        f"env.reward should be in [0, 1], got {env2.reward}"
    print(f"  ✅ env.reward = {env2.reward:.4f} (float, in [0,1], not NaN)")

    # ── Check 4: History uses actual jittered degradation_start_step ────
    print("\n[CHECK 4] History contains jittered actual_degradation_start (not config constant)...")
    history = env2._env._internal.history
    assert len(history) > 0, "History should not be empty after episode"
    # Read degradation_start_step from step_info (written by environment.py).
    step_info_degrade_start = history[0].get("degradation_start_step")
    # Read the actual jittered value from internal state.
    actual_jittered_start = env2._env._internal.actual_degradation_start
    # Config constant for hard_multi.
    config_constant = HARD_MULTI.degradation_start_step  # = 0
    print(f"  Config constant (degradation_start_step): {config_constant}")
    print(f"  step_info[degradation_start_step]:        {step_info_degrade_start}")
    print(f"  internal.actual_degradation_start:        {actual_jittered_start}")
    assert step_info_degrade_start is not None, \
        "step_info missing degradation_start_step — grade_episode() will break"
    assert step_info_degrade_start == actual_jittered_start, \
        (f"step_info uses wrong degradation onset! "
         f"Got {step_info_degrade_start}, expected {actual_jittered_start}. "
         f"This would corrupt adaptation scores in grade_episode().")
    print("  ✅ Jittered onset correctly propagated through step_info.")

    # ── Check 5: grade_episode() on history returns consistent score ────
    print("\n[CHECK 5] grade_episode(history) matches env.reward...")
    grader_result = grade_episode(history)
    assert isinstance(grader_result, dict), "grade_episode should return dict"
    grader_score = float(grader_result["overall_score"])
    assert abs(grader_score - env2.reward) < 1e-6, \
        f"env.reward ({env2.reward}) != grade_episode score ({grader_score}). Mismatch."
    print(f"  ✅ grade_episode overall_score = {grader_score:.4f}, "
          f"env.reward = {env2.reward:.4f}. Match confirmed.")
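    # ── Added diagnostic (not in the original script): print the full grader
    # breakdown. Only "overall_score" is a known key; whatever other component
    # scores grade_episode() returns are dumped as-is for eyeballing.
    for key, value in grader_result.items():
        if key != "overall_score":
            print(f"      {key}: {value}")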
    # ── Check 6: 10-episode reward trajectory ───────────────────────────
    print("\n[CHECK 6] 10-episode reward trajectory (hard_multi, varying seeds)...")
    print("  Episode | Seed | Steps | Score  | Reward-in-range")
    rewards = []
    for ep, seed in enumerate(range(10)):
        env3 = BudgetRouterGRPOEnv()
        env3.reset(scenario="hard_multi", seed=seed)
        done = False
        steps = 0
        while not done and steps < 30:
            # Alternate actions: A, B, A, B... (simple test policy).
            action_fn = env3.route_to_a if steps % 2 == 0 else env3.route_to_b
            try:
                action_fn()
                steps += 1
            except ValueError:
                done = True
        reward = env3.reward
        rewards.append(reward)
        in_range = "✅" if 0.0 <= reward <= 1.0 else "❌"
        print(f"  Ep {ep + 1:02d}   | {seed:4d} | {steps:5d} | {reward:.4f} | {in_range}")

    if len(rewards) > 1:
        std = statistics.stdev(rewards)
        mean = statistics.mean(rewards)
        print(f"\n  Mean reward: {mean:.4f} | Std: {std:.4f}")
        if std < 0.03:
            print(
                f"  ⚠️  WARNING: Low reward variance (std={std:.4f}). GRPO may get a weak gradient signal.\n"
                f"     Mitigation: use num_generations=8, the hard_multi scenario, and a small LLM\n"
                f"     whose initial policy makes diverse routing decisions."
            )
        else:
            print(f"  ✅ Reward variance is sufficient for GRPO learning (std={std:.4f} > 0.03).")

    print("\n" + "=" * 60)
    print("✅ ALL CHECKS PASSED — BudgetRouterGRPOEnv is ready for GRPO training.")
    print("=" * 60)
    print("\nRecommended training config (Mac MPS / Colab):")
    print("  scenario:        hard_multi")
    print("  num_generations: 8")
    print("  model:           Qwen2.5-1.5B (Mac 16GB) / Qwen2.5-7B (Colab T4)")
    print("  Mac:   TRL + PyTorch MPS (set PYTORCH_ENABLE_MPS_FALLBACK=1)")
    print("  Colab: Unsloth + vLLM on NVIDIA T4/A100")


if __name__ == "__main__":
    main()
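
# ── Reference sketch (added, illustrative only): wiring this env into GRPO ──
# training once the checks above pass. GRPOConfig/GRPOTrainer exist in TRL,
# but the environment_factory keyword and the exact call shape below are
# assumptions; confirm against the TRL docs for your installed version.
#
#   from trl import GRPOConfig, GRPOTrainer
#
#   config = GRPOConfig(num_generations=8)  # matches the recommendation above
#   trainer = GRPOTrainer(
#       model="Qwen/Qwen2.5-1.5B-Instruct",                 # Mac 16GB pick above
#       args=config,
#       environment_factory=lambda: BudgetRouterGRPOEnv(),  # assumed kwarg name
#   )
#   trainer.train()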