Spaces:
Sleeping
Sleeping
| """End-to-end smoke test: scripted episode, in-process, no server. | |
| Runs: run_baseline(adam) -> draft(Adam-ish) -> inspect -> draft(SGD+momentum) | |
| -> commit, and verifies the env threads state correctly and produces a | |
| finite reward. | |
| """ | |
| from __future__ import annotations | |
| import sys | |
| from pathlib import Path | |
| # Allow running directly: `python tests/test_episode.py` | |
| sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) | |
| from landscapeforge.models import LandscapeforgeAction # type: ignore | |
| from landscapeforge.server.landscapeforge_environment import ( # type: ignore | |
| LandscapeforgeEnvironment, | |
| ) | |
# Source text of an Adam-style optimizer, submitted to the environment as a
# "draft" action. It defines a class named ``Optimizer`` with ``__init__(dim)``
# and ``step(x, f_val, grad)``; ``step`` returns the next iterate and ignores
# ``f_val``. The body must stay correctly indented: the environment receives
# this string as source code.
# NOTE(review): presumably this class/method shape is the interface the
# environment requires of drafts -- confirm against the environment docs.
ADAM_CODE = """
import numpy as np
class Optimizer:
    def __init__(self, dim):
        self.lr = 1e-3
        self.b1 = 0.9
        self.b2 = 0.999
        self.eps = 1e-8
        self.m = np.zeros(dim)
        self.v = np.zeros(dim)
        self.t = 0
    def step(self, x, f_val, grad):
        self.t += 1
        self.m = self.b1 * self.m + (1 - self.b1) * grad
        self.v = self.b2 * self.v + (1 - self.b2) * grad * grad
        m_hat = self.m / (1 - self.b1 ** self.t)
        v_hat = self.v / (1 - self.b2 ** self.t)
        return x - self.lr * m_hat / (np.sqrt(v_hat) + self.eps)
"""
# Source text of an SGD-with-momentum optimizer, submitted to the environment
# as a second "draft" action. It defines ``Optimizer`` with ``__init__(dim)``
# and ``step(x, f_val, grad)``; ``step`` updates the velocity
# (``v = beta * v - lr * grad``) and returns ``x + v``. The body must stay
# correctly indented: the environment receives this string as source code.
SGDM_CODE = """
import numpy as np
class Optimizer:
    def __init__(self, dim):
        self.lr = 0.05
        self.beta = 0.9
        self.v = np.zeros(dim)
    def step(self, x, f_val, grad):
        self.v = self.beta * self.v - self.lr * grad
        return x + self.v
"""
def scripted_episode() -> None:
    """Run one scripted episode in-process and sanity-check the final observation.

    Sequence of actions: reset -> run_baseline("adam") -> draft(ADAM_CODE) ->
    inspect draft 0 over steps 10-20 -> draft(SGDM_CODE) -> commit. Selected
    fields of every observation are printed so the run can be inspected by eye.

    Raises:
        AssertionError: if, after the commit, the episode is not marked done,
            the reward is missing or not finite, ``final_regret`` is missing,
            or ``r_optcoder_breakdown`` is empty.
    """
    # Local import: only needed for the finite-reward check below.
    import math

    env = LandscapeforgeEnvironment(tier="T0", seed=42)
    obs = env.reset()
    print(f"[reset] landscape: {obs.landscape_description}")
    print(f" dim={obs.dim}, hints={obs.structural_hints}")
    print(f" budget={obs.budget_remaining}")

    # 1. Run Adam baseline to see what it does.
    obs = env.step(LandscapeforgeAction(
        kind="run_baseline", baseline_name="adam",
    ))
    print(f"\n[run_baseline adam] result={obs.last_action_result}")
    print(f" budget_remaining={obs.budget_remaining}")

    # 2. Submit an Adam draft.
    obs = env.step(LandscapeforgeAction(kind="draft", code=ADAM_CODE))
    print(f"\n[draft adam] compile_error={obs.last_action_result.get('compile_error')}")
    print(f" summary={obs.last_action_result.get('summary')}")
    print(f" budget_remaining={obs.budget_remaining}")

    # 3. Inspect the first draft.
    obs = env.step(LandscapeforgeAction(
        kind="inspect", draft_idx=0, step_range_start=10, step_range_end=20,
    ))
    print(f"\n[inspect 0 steps 10-20] result={obs.last_action_result}")
    print(f" budget_remaining={obs.budget_remaining}")

    # 4. Submit an SGD+momentum alternative.
    obs = env.step(LandscapeforgeAction(kind="draft", code=SGDM_CODE))
    print(f"\n[draft sgdm] compile_error={obs.last_action_result.get('compile_error')}")
    print(f" summary={obs.last_action_result.get('summary')}")
    print(f" budget_remaining={obs.budget_remaining}")

    # 5. Commit.
    obs = env.step(LandscapeforgeAction(kind="commit"))
    print("\n[commit]")
    print(f" done={obs.done}")
    print(f" reward={obs.reward}")
    print(f" final_regret={obs.final_regret}")
    print(f" r_optcoder_breakdown={obs.r_optcoder_breakdown}")
    print(f" last_action_result={obs.last_action_result}")

    # Sanity checks
    assert obs.done is True, "should be done after commit"
    assert obs.reward is not None, "reward must be produced"
    # The smoke test promises a *finite* reward; NaN/inf would slip past the
    # "is not None" check above.
    assert math.isfinite(obs.reward), f"reward must be finite, got {obs.reward!r}"
    assert obs.final_regret is not None, "final_regret must be produced"
    assert obs.r_optcoder_breakdown, "breakdown must be populated"
    print("\n✓ scripted_episode PASSED")
def episode_with_broken_code() -> None:
    """Submitting code that fails to compile should not crash the env.

    Drafts a snippet that is not parseable Python, checks that the environment
    reports a ``compile_error`` without ending the episode, then commits and
    checks that the episode ends with a reward.

    Raises:
        AssertionError: if the broken draft is not reported as a compile
            error, if it ends the episode, or if the commit does not finish
            the episode with a reward.
    """
    env = LandscapeforgeEnvironment(tier="T0", seed=7)
    env.reset()

    # Intentional syntax error. The snippet must be genuinely unparsable:
    # a phrase such as "this is not python" is a valid `is not` comparison
    # and would compile, failing only later with a NameError.
    obs = env.step(LandscapeforgeAction(
        kind="draft", code="def broken(:",
    ))
    print(f"\n[broken draft] compile_error={obs.last_action_result.get('compile_error')}")
    assert obs.last_action_result.get("compile_error") is not None, \
        "broken draft must report a compile_error"
    assert obs.done is False, "a failed draft must not end the episode"

    # Commit with bad code — should produce worst-case regret, not crash
    obs = env.step(LandscapeforgeAction(kind="commit"))
    print(f"[broken commit] reward={obs.reward}, final_regret={obs.final_regret}")
    assert obs.done is True, "should be done after commit"
    assert obs.reward is not None, "reward must be produced"
    print("\n✓ episode_with_broken_code PASSED")
def budget_exhaustion() -> None:
    """Spamming drafts until budget runs out should auto-commit.

    Submits up to ten ADAM_CODE drafts. As soon as an observation comes back
    done, checks that the reported reason is ``"budget_exhausted"`` and
    returns.

    Raises:
        AssertionError: if the reason is wrong, or if the episode is still
            running after all ten drafts.
    """
    env = LandscapeforgeEnvironment(tier="T0", seed=3)
    env.reset()

    max_drafts = 10
    for drafts_submitted in range(1, max_drafts + 1):
        obs = env.step(LandscapeforgeAction(kind="draft", code=ADAM_CODE))
        if not obs.done:
            # Budget still available; keep drafting.
            continue

        reason = obs.last_action_result.get("reason")
        print(f"\n[budget_exhaustion] auto-committed after {drafts_submitted} drafts")
        print(f" reason={reason}")
        assert reason == "budget_exhausted"
        print("\n✓ budget_exhaustion PASSED")
        return

    raise AssertionError("Budget never exhausted — shouldn't happen with draft cost 2, budget 12")
if __name__ == "__main__":
    # Run every scenario in order; any AssertionError aborts the script
    # before the final success line is printed.
    for scenario in (scripted_episode, episode_with_broken_code, budget_exhaustion):
        scenario()
    print("\nAll tests passed.")