Spaces:

mnawfal29
/

landscapeforge

Sleeping

App Files Files Community

landscapeforge / tests /test_episode.py

mnawfal29

Upload folder using huggingface_hub

b0b140b verified 14 days ago

raw

history blame contribute delete

5.45 kB

	"""End-to-end smoke test: scripted episode, in-process, no server.

	Runs: run_baseline(adam) -> draft(Adam-ish) -> inspect -> draft(SGD+momentum)
	-> commit, and verifies the env threads state correctly and produces a
	finite reward.
	"""

	from __future__ import annotations

	import sys
	from pathlib import Path

	# Allow running directly: `python tests/test_episode.py`
	sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent))

	from landscapeforge.models import LandscapeforgeAction # type: ignore
	from landscapeforge.server.landscapeforge_environment import ( # type: ignore
	LandscapeforgeEnvironment,
	)


	ADAM_CODE = (
	    # Source text of an Adam-style optimizer that this file submits to the
	    # environment as the `code` of a "draft" action. It defines
	    # `Optimizer(dim)` with `step(x, f_val, grad)`, which returns the updated
	    # iterate; `f_val` is accepted but unused by this optimizer.
	    #
	    # Each source line is its own literal so that the indentation the class
	    # body depends on is part of the string rather than of this file's
	    # physical layout.
	    "\n"
	    "import numpy as np\n"
	    "\n"
	    "class Optimizer:\n"
	    "    def __init__(self, dim):\n"
	    "        self.lr = 1e-3\n"
	    "        self.b1 = 0.9\n"
	    "        self.b2 = 0.999\n"
	    "        self.eps = 1e-8\n"
	    "        self.m = np.zeros(dim)\n"
	    "        self.v = np.zeros(dim)\n"
	    "        self.t = 0\n"
	    "\n"
	    "    def step(self, x, f_val, grad):\n"
	    "        self.t += 1\n"
	    "        self.m = self.b1 * self.m + (1 - self.b1) * grad\n"
	    "        self.v = self.b2 * self.v + (1 - self.b2) * grad * grad\n"
	    "        m_hat = self.m / (1 - self.b1 ** self.t)\n"
	    "        v_hat = self.v / (1 - self.b2 ** self.t)\n"
	    "        return x - self.lr * m_hat / (np.sqrt(v_hat) + self.eps)\n"
	)

	SGDM_CODE = (
	    # Source text of an SGD-with-momentum optimizer that this file submits to
	    # the environment as the `code` of a "draft" action. It defines
	    # `Optimizer(dim)` with `step(x, f_val, grad)`, which returns the updated
	    # iterate; `f_val` is accepted but unused by this optimizer.
	    #
	    # Each source line is its own literal so that the indentation the class
	    # body depends on is part of the string rather than of this file's
	    # physical layout.
	    "\n"
	    "import numpy as np\n"
	    "\n"
	    "class Optimizer:\n"
	    "    def __init__(self, dim):\n"
	    "        self.lr = 0.05\n"
	    "        self.beta = 0.9\n"
	    "        self.v = np.zeros(dim)\n"
	    "\n"
	    "    def step(self, x, f_val, grad):\n"
	    "        self.v = self.beta * self.v - self.lr * grad\n"
	    "        return x + self.v\n"
	)


	def scripted_episode() -> None:
	    """Run one scripted episode in-process and check its terminal observation.

	    Sequence: run_baseline(adam) -> draft(ADAM_CODE) -> inspect draft 0 ->
	    draft(SGDM_CODE) -> commit. Every observation is printed as it arrives.

	    Raises:
	        AssertionError: if the commit observation is not done, is missing a
	            reward, final regret or reward breakdown, or carries a reward
	            that is NaN or infinite.
	    """
	    import math  # local import: only the finiteness check below needs it

	    env = LandscapeforgeEnvironment(tier="T0", seed=42)
	    obs = env.reset()
	    print(f"[reset] landscape: {obs.landscape_description}")
	    print(f" dim={obs.dim}, hints={obs.structural_hints}")
	    print(f" budget={obs.budget_remaining}")

	    # 1. Run Adam baseline to see what it does.
	    obs = env.step(LandscapeforgeAction(
	        kind="run_baseline", baseline_name="adam",
	    ))
	    print(f"\n[run_baseline adam] result={obs.last_action_result}")
	    print(f" budget_remaining={obs.budget_remaining}")

	    # 2. Submit an Adam draft.
	    obs = env.step(LandscapeforgeAction(kind="draft", code=ADAM_CODE))
	    print(f"\n[draft adam] compile_error={obs.last_action_result.get('compile_error')}")
	    print(f" summary={obs.last_action_result.get('summary')}")
	    print(f" budget_remaining={obs.budget_remaining}")

	    # 3. Inspect the first draft.
	    obs = env.step(LandscapeforgeAction(
	        kind="inspect", draft_idx=0, step_range_start=10, step_range_end=20,
	    ))
	    print(f"\n[inspect 0 steps 10-20] result={obs.last_action_result}")
	    print(f" budget_remaining={obs.budget_remaining}")

	    # 4. Submit an SGD+momentum alternative.
	    obs = env.step(LandscapeforgeAction(kind="draft", code=SGDM_CODE))
	    print(f"\n[draft sgdm] compile_error={obs.last_action_result.get('compile_error')}")
	    print(f" summary={obs.last_action_result.get('summary')}")
	    print(f" budget_remaining={obs.budget_remaining}")

	    # 5. Commit.
	    obs = env.step(LandscapeforgeAction(kind="commit"))
	    print("\n[commit]")
	    print(f" done={obs.done}")
	    print(f" reward={obs.reward}")
	    print(f" final_regret={obs.final_regret}")
	    print(f" r_optcoder_breakdown={obs.r_optcoder_breakdown}")
	    print(f" last_action_result={obs.last_action_result}")

	    # Sanity checks
	    assert obs.done is True, "should be done after commit"
	    assert obs.reward is not None, "reward must be produced"
	    # `is not None` alone would let NaN/inf through; the smoke test is meant
	    # to confirm a finite reward.
	    assert math.isfinite(obs.reward), "reward must be finite"
	    assert obs.final_regret is not None, "final_regret must be produced"
	    assert obs.r_optcoder_breakdown, "breakdown must be populated"
	    print("\n✓ scripted_episode PASSED")


	def episode_with_broken_code() -> None:
	    """Check that an uncompilable draft is reported instead of crashing the env.

	    The draft step must report a ``compile_error`` and leave the episode open;
	    the commit that follows must still end the episode and yield a reward.
	    """
	    forge = LandscapeforgeEnvironment(tier="T0", seed=7)
	    forge.reset()

	    # Deliberately not valid Python.
	    bad_draft = LandscapeforgeAction(kind="draft", code="this is not python")
	    draft_obs = forge.step(bad_draft)
	    compile_error = draft_obs.last_action_result.get("compile_error")
	    print(f"\n[broken draft] compile_error={compile_error}")
	    assert compile_error is not None
	    assert draft_obs.done is False

	    # Committing with only broken code on record should still finish the
	    # episode (expected: worst-case regret) rather than raise.
	    commit_obs = forge.step(LandscapeforgeAction(kind="commit"))
	    print(f"[broken commit] reward={commit_obs.reward}, final_regret={commit_obs.final_regret}")
	    assert commit_obs.done is True
	    assert commit_obs.reward is not None
	    print("\n✓ episode_with_broken_code PASSED")


	def budget_exhaustion() -> None:
	    """Keep submitting drafts until the env ends the episode by itself.

	    Raises:
	        AssertionError: if the episode is still open after 10 drafts, or if it
	            ended for a reason other than ``"budget_exhausted"``.
	    """
	    env = LandscapeforgeEnvironment(tier="T0", seed=3)
	    env.reset()

	    for drafts_sent in range(1, 11):
	        obs = env.step(LandscapeforgeAction(kind="draft", code=ADAM_CODE))
	        if not obs.done:
	            continue

	        # The env closed the episode; it must attribute that to the budget.
	        reason = obs.last_action_result.get("reason")
	        print(f"\n[budget_exhaustion] auto-committed after {drafts_sent} drafts")
	        print(f" reason={reason}")
	        assert reason == "budget_exhausted"
	        print("\n✓ budget_exhaustion PASSED")
	        return

	    raise AssertionError("Budget never exhausted — shouldn't happen with draft cost 2, budget 12")


	if __name__ == "__main__":
	    # Script entry point: run every scenario in order; the first failing
	    # assertion aborts the run before the summary line is printed.
	    for scenario in (scripted_episode, episode_with_broken_code, budget_exhaustion):
	        scenario()
	    print("\nAll tests passed.")