# landscapeforge/tests/test_episode.py
# Uploaded by mnawfal29 using huggingface_hub (revision b0b140b, verified).
"""End-to-end smoke test: scripted episode, in-process, no server.
Runs: run_baseline(adam) -> draft(Adam-ish) -> inspect -> draft(SGD+momentum)
-> commit, and verifies the env threads state correctly and produces a
finite reward.
"""
from __future__ import annotations
import sys
from pathlib import Path
# Allow running directly: `python tests/test_episode.py`
sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent))
from landscapeforge.models import LandscapeforgeAction # type: ignore
from landscapeforge.server.landscapeforge_environment import ( # type: ignore
LandscapeforgeEnvironment,
)
# Optimizer source submitted to the environment as a "draft" action.
# It implements Adam with bias-corrected first (m) and second (v) moment
# estimates. The text is compiled as Python by whoever receives it, so the
# class and method bodies must be indented for the draft to compile.
ADAM_CODE = """
import numpy as np
class Optimizer:
    def __init__(self, dim):
        self.lr = 1e-3
        self.b1 = 0.9
        self.b2 = 0.999
        self.eps = 1e-8
        self.m = np.zeros(dim)
        self.v = np.zeros(dim)
        self.t = 0
    def step(self, x, f_val, grad):
        self.t += 1
        self.m = self.b1 * self.m + (1 - self.b1) * grad
        self.v = self.b2 * self.v + (1 - self.b2) * grad * grad
        m_hat = self.m / (1 - self.b1 ** self.t)
        v_hat = self.v / (1 - self.b2 ** self.t)
        return x - self.lr * m_hat / (np.sqrt(v_hat) + self.eps)
"""
# Optimizer source submitted to the environment as a second "draft" action.
# It implements SGD with classical (heavy-ball) momentum: the velocity v
# accumulates the scaled negative gradient and is added to x. The class and
# method bodies must be indented for the text to compile as Python.
SGDM_CODE = """
import numpy as np
class Optimizer:
    def __init__(self, dim):
        self.lr = 0.05
        self.beta = 0.9
        self.v = np.zeros(dim)
    def step(self, x, f_val, grad):
        self.v = self.beta * self.v - self.lr * grad
        return x + self.v
"""
def scripted_episode() -> None:
    """Drive one scripted episode in-process and sanity-check the final observation.

    Sequence: reset -> run_baseline(adam) -> draft(ADAM_CODE) -> inspect draft 0
    (steps 10-20) -> draft(SGDM_CODE) -> commit. The observation returned by
    each step is printed for manual inspection.

    Raises:
        AssertionError: if the episode is not done after commit, if the
            reward, final regret, or reward breakdown is missing, or if the
            reward is not a finite number.
    """
    import math  # local import: only needed for the finiteness check below

    env = LandscapeforgeEnvironment(tier="T0", seed=42)
    obs = env.reset()
    print(f"[reset] landscape: {obs.landscape_description}")
    print(f" dim={obs.dim}, hints={obs.structural_hints}")
    print(f" budget={obs.budget_remaining}")

    # 1. Run Adam baseline to see what it does.
    obs = env.step(LandscapeforgeAction(
        kind="run_baseline", baseline_name="adam",
    ))
    print(f"\n[run_baseline adam] result={obs.last_action_result}")
    print(f" budget_remaining={obs.budget_remaining}")

    # 2. Submit an Adam draft.
    obs = env.step(LandscapeforgeAction(kind="draft", code=ADAM_CODE))
    print(f"\n[draft adam] compile_error={obs.last_action_result.get('compile_error')}")
    print(f" summary={obs.last_action_result.get('summary')}")
    print(f" budget_remaining={obs.budget_remaining}")

    # 3. Inspect the first draft.
    obs = env.step(LandscapeforgeAction(
        kind="inspect", draft_idx=0, step_range_start=10, step_range_end=20,
    ))
    print(f"\n[inspect 0 steps 10-20] result={obs.last_action_result}")
    print(f" budget_remaining={obs.budget_remaining}")

    # 4. Submit an SGD+momentum alternative.
    obs = env.step(LandscapeforgeAction(kind="draft", code=SGDM_CODE))
    print(f"\n[draft sgdm] compile_error={obs.last_action_result.get('compile_error')}")
    print(f" summary={obs.last_action_result.get('summary')}")
    print(f" budget_remaining={obs.budget_remaining}")

    # 5. Commit.
    obs = env.step(LandscapeforgeAction(kind="commit"))
    print("\n[commit]")
    print(f" done={obs.done}")
    print(f" reward={obs.reward}")
    print(f" final_regret={obs.final_regret}")
    print(f" r_optcoder_breakdown={obs.r_optcoder_breakdown}")
    print(f" last_action_result={obs.last_action_result}")

    # Sanity checks
    assert obs.done is True, "should be done after commit"
    assert obs.reward is not None, "reward must be produced"
    # A non-None reward can still be NaN/inf; the smoke test promises a
    # finite one, so check that explicitly.
    assert math.isfinite(obs.reward), "reward must be finite"
    assert obs.final_regret is not None, "final_regret must be produced"
    assert obs.r_optcoder_breakdown, "breakdown must be populated"
    print("\n✓ scripted_episode PASSED")
def episode_with_broken_code() -> None:
    """Check that an uncompilable draft is reported rather than crashing the env.

    Submits text that is not Python as a draft, expects a non-None
    ``compile_error`` with the episode still running, then commits and
    expects a finished episode that still carries a reward.
    """
    environment = LandscapeforgeEnvironment(tier="T0", seed=7)
    environment.reset()

    # Deliberately invalid source text.
    bad_draft = LandscapeforgeAction(kind="draft", code="this is not python")
    draft_obs = environment.step(bad_draft)
    compile_error = draft_obs.last_action_result.get("compile_error")
    print(f"\n[broken draft] compile_error={compile_error}")
    assert compile_error is not None
    assert draft_obs.done is False

    # Committing with only a broken draft must still end the episode cleanly.
    commit_action = LandscapeforgeAction(kind="commit")
    final_obs = environment.step(commit_action)
    print(f"[broken commit] reward={final_obs.reward}, final_regret={final_obs.final_regret}")
    assert final_obs.done is True
    assert final_obs.reward is not None
    print("\n✓ episode_with_broken_code PASSED")
def budget_exhaustion() -> None:
    """Check that repeated drafts end the episode with reason ``budget_exhausted``.

    Submits ADAM_CODE as a draft up to ten times and stops at the first
    observation flagged as done.

    Raises:
        AssertionError: if no draft ends the episode, or if the reported
            reason is not ``"budget_exhausted"``.
    """
    environment = LandscapeforgeEnvironment(tier="T0", seed=3)
    environment.reset()

    for drafts_sent in range(1, 11):
        result = environment.step(LandscapeforgeAction(kind="draft", code=ADAM_CODE))
        if result.done:
            break
    else:
        # All ten drafts were accepted without the episode terminating.
        raise AssertionError("Budget never exhausted — shouldn't happen with draft cost 2, budget 12")

    print(f"\n[budget_exhaustion] auto-committed after {drafts_sent} drafts")
    print(f" reason={result.last_action_result.get('reason')}")
    assert result.last_action_result.get("reason") == "budget_exhausted"
    print("\n✓ budget_exhaustion PASSED")
if __name__ == "__main__":
    # Run every scenario in order; a failing one raises before the summary line.
    for scenario in (scripted_episode, episode_with_broken_code, budget_exhaustion):
        scenario()
    print("\nAll tests passed.")