# landscapeforge/server/landscapeforge_environment.py
"""LandscapeForge OpenEnv environment — OptCoder REPL (Phase C).
For v1 we ship OptCoder-only: LandscapeForge is a fixed template picker
controlled by the env itself (uniform random over the tier menu). The agent
acting through OpenEnv is OptCoder.
Each `reset()` samples a new landscape from the current tier. Each `step()`
executes one OptCoder action (run_baseline / draft / inspect / commit),
mutates env state, and returns an Observation reflecting the new state.
The episode ends when OptCoder commits or the budget is exhausted.
"""
from __future__ import annotations
from typing import Any, Optional
from uuid import uuid4
import numpy as np
from openenv.core.env_server.interfaces import Environment
from openenv.core.env_server.types import State
try:
from ..models import (
ACTION_COSTS,
LandscapeforgeAction,
LandscapeforgeObservation,
)
from ..landscapes import (
TIER_MENU,
Landscape,
build_landscape,
structural_hints,
)
from ..reference_optimizers import run_baseline as run_reference_baseline
from ..reference_optimizers import tune_adam_lr
from ..sandbox import SandboxError, compile_optimizer
from ..arena import ArenaResult, auto_test_draft, run_arena
from ..rewards import ast_novelty_score, compute_optcoder_reward, compute_step_reward
except ImportError:
    # Fallback for running from the repo root or other flat package layouts.
from models import ( # type: ignore
ACTION_COSTS,
LandscapeforgeAction,
LandscapeforgeObservation,
)
from landscapes import ( # type: ignore
TIER_MENU,
Landscape,
build_landscape,
structural_hints,
)
from reference_optimizers import run_baseline as run_reference_baseline # type: ignore
from reference_optimizers import tune_adam_lr # type: ignore
from sandbox import SandboxError, compile_optimizer # type: ignore
from arena import ArenaResult, auto_test_draft, run_arena # type: ignore
from rewards import ast_novelty_score, compute_optcoder_reward, compute_step_reward # type: ignore
BUDGET_TOTAL = 12
ARENA_SEEDS = [101, 202, 303, 404, 505, 606, 707, 808, 909, 1010]
ARENA_STEPS = 200
BASELINE_STEPS = 30 # env-controlled; agent does not choose
# Reference source blobs for AST novelty comparison (short pseudo-implementations).
# Kept minimal — enough to detect "this commit is basically Adam".
_REF_SGD = """
class Optimizer:
def __init__(self, dim): self.lr = 0.01
def step(self, x, f, g): return x - self.lr * g
""".strip()
def _adam_source(lr: float) -> str:
"""Adam reference implementation parameterized by LR.
Used by `_ensure_adam_arena` after LR tuning — the baseline is
Adam-at-best-LR-for-this-landscape, not Adam-at-fixed-default.
"""
return f"""
class Optimizer:
def __init__(self, dim):
self.lr = {lr}
self.b1 = 0.9
self.b2 = 0.999
self.eps = 1e-8
self.m = np.zeros(dim)
self.v = np.zeros(dim)
self.t = 0
def step(self, x, f_val, g):
self.t += 1
self.m = self.b1*self.m + (1-self.b1)*g
self.v = self.b2*self.v + (1-self.b2)*g*g
mh = self.m/(1-self.b1**self.t)
vh = self.v/(1-self.b2**self.t)
return x - self.lr * mh / (np.sqrt(vh) + self.eps)
""".strip()
# Frozen default-LR source used only for AST-novelty comparison (so r_novelty
# measures "structurally different from Adam" regardless of the tuned LR).
_REF_ADAM = _adam_source(0.001)
_REF_MOMENTUM = """
class Optimizer:
def __init__(self, dim):
import numpy as np
self.lr=0.01; self.beta=0.9; self.v = np.zeros(dim)
def step(self, x, f, g):
self.v = self.beta*self.v - self.lr*g
return x + self.v
""".strip()
REFERENCE_SOURCES = [_REF_SGD, _REF_ADAM, _REF_MOMENTUM]
class LandscapeforgeEnvironment(Environment):
"""OptCoder-facing OpenEnv environment.
LandscapeForge is internal (template picker) in v1.
"""
SUPPORTS_CONCURRENT_SESSIONS: bool = True
def __init__(self, tier: str = "T0", seed: int = 0):
self._initial_tier = tier
self._master_rng = np.random.default_rng(seed)
self._reset_count = 0
self._tier = tier
self._state = State(episode_id=str(uuid4()), step_count=0)
# Populated by reset()
self._landscape: Optional[Landscape] = None
self._hints: dict = {}
self._baseline_history: list[dict] = []
self._draft_history: list[dict] = []
self._draft_details: list[list[dict]] = [] # per-draft per-step detail
self._inspect_requests: list[dict] = []
self._current_draft: Optional[str] = None
self._budget_spent: int = 0
self._committed: bool = False
self._final_obs: Optional[LandscapeforgeObservation] = None
# Cache Adam's full arena result per episode (computed lazily, for
# reward normalization via progress-based r_regret). The baseline is
# Adam-at-tuned-LR — per-landscape LR is selected via a short sweep.
self._adam_arena_cache: Optional[ArenaResult] = None
self._adam_tuned_lr: Optional[float] = None
# Stepwise feedback log (PBS delta + compile penalty). This is shown to
# the LLM in the observation so it can course-correct mid-episode, but
# NEVER added to the training scalar — final reward is purely terminal
# arena reward (§9.1) for robustness against reward hacking.
self._step_feedback_log: list[dict] = []
# ---------- OpenEnv API ----------
def reset(self) -> LandscapeforgeObservation:
self._reset_count += 1
self._state = State(episode_id=str(uuid4()), step_count=0)
# Pick a landscape from the current tier's menu.
menu = TIER_MENU[self._tier]
template = str(self._master_rng.choice(menu))
dim = int(self._master_rng.integers(2, 6)) # small dims for v1
params = self._sample_params(template)
self._landscape = build_landscape(
template=template, dim=dim, params=params,
rng=np.random.default_rng(int(self._master_rng.integers(0, 2**31))),
)
self._hints = structural_hints(
self._landscape,
rng=np.random.default_rng(int(self._master_rng.integers(0, 2**31))),
)
# Wipe REPL state
self._baseline_history = []
self._draft_history = []
self._draft_details = []
self._inspect_requests = []
self._current_draft = None
self._budget_spent = 0
self._committed = False
self._final_obs = None
self._adam_arena_cache = None
self._adam_tuned_lr = None
self._step_feedback_log = []
return self._make_observation(
last_kind=None, last_result={"reset": True}, done=False, reward=0.0,
)
def step(self, action: LandscapeforgeAction) -> LandscapeforgeObservation: # type: ignore[override]
if self._landscape is None:
raise RuntimeError("step() called before reset()")
if self._committed:
# Episode already done; return terminal obs.
assert self._final_obs is not None
return self._final_obs
self._state.step_count += 1
cost = ACTION_COSTS[action.kind]
        # Check the budget before charging: a non-commit action that would
        # overshoot the budget becomes a forced commit instead.
if self._budget_spent + cost > BUDGET_TOTAL and action.kind != "commit":
return self._force_commit(reason="budget_exhausted")
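        # Example (illustrative costs): at budget_spent = 11 with BUDGET_TOTAL = 12,
        # a cost-2 action is force-committed above, while a cost-1 action is
        # charged and then trips the exhaustion auto-commit further below.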
self._budget_spent += cost
# Snapshot draft history for PBS computation
prev_draft_history_snapshot = list(self._draft_history)
if action.kind == "run_baseline":
result = self._do_run_baseline(action)
elif action.kind == "draft":
result = self._do_draft(action)
elif action.kind == "inspect":
result = self._do_inspect(action)
elif action.kind == "commit":
return self._do_commit()
else:
raise ValueError(f"Unknown action kind: {action.kind}")
# Compute stepwise FEEDBACK (NOT reward). Signals the LLM can use to
# course-correct mid-episode — exposed through last_action_result.
# Explicitly NOT summed into training reward; terminal arena reward
# is the only signal GRPO sees (robust against reward hacking).
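        # Shape note: only step_feedback["breakdown"] is consumed here; per the
        # _step_feedback_log docstring it carries the PBS delta plus any compile
        # penalty. The exact keys are defined by rewards.compute_step_reward.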
step_feedback = compute_step_reward(
prev_draft_history=prev_draft_history_snapshot,
new_draft_history=self._draft_history,
action_kind=action.kind,
action_result=result,
)
if step_feedback["breakdown"]:
entry = {
"turn": self._state.step_count,
"action_kind": action.kind,
**step_feedback["breakdown"],
}
self._step_feedback_log.append(entry)
# Surface on this turn's action result so the LLM sees it immediately.
result = {**result, "feedback": step_feedback["breakdown"]}
# Check if budget now exhausted; if so, auto-commit.
if self._budget_spent >= BUDGET_TOTAL:
return self._force_commit(reason="budget_exhausted")
return self._make_observation(
last_kind=action.kind, last_result=result,
done=False, reward=0.0, # no reward on non-terminal steps
)
@property
def state(self) -> State:
return self._state
# ---------- Action handlers ----------
def _do_run_baseline(self, action: LandscapeforgeAction) -> dict:
assert self._landscape is not None
# Fixed init AND fixed step count for baseline comparability across
# episodes and rollouts (important for GRPO group-relative advantages).
rng = np.random.default_rng(42)
x0 = rng.normal(0.0, 0.5, size=self._landscape.dim)
result = run_reference_baseline(
name=action.baseline_name, f=self._landscape.f, grad=self._landscape.grad,
x0=x0, steps=BASELINE_STEPS,
)
self._baseline_history.append(result)
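        # The surfaced result is summary-shaped, e.g. (values illustrative):
        #     {"baseline_index": 0, "name": "sgd", "n_steps": 31, "final_f": 0.42}
        # final_f is None when the trajectory is empty or its last f is None.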
return {
"baseline_index": len(self._baseline_history) - 1,
"name": result["name"],
"n_steps": len(result["trajectory"]),
"final_f": (result["trajectory"][-1]["f"]
if result["trajectory"] and result["trajectory"][-1]["f"] is not None
else None),
}
def _do_draft(self, action: LandscapeforgeAction) -> dict:
assert self._landscape is not None
code = action.code or ""
self._current_draft = code
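        # A well-formed draft defines an Optimizer with the same protocol as the
        # reference blobs above, e.g. (illustrative plain gradient descent):
        #
        #     class Optimizer:
        #         def __init__(self, dim):
        #             self.lr = 0.05
        #         def step(self, x, f_val, g):
        #             return x - self.lr * g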
try:
opt = compile_optimizer(code, dim=self._landscape.dim)
except SandboxError as e:
# Record failed draft; still counts toward history for inspect.
self._draft_history.append({
"code": code,
"compile_error": str(e),
"summary": {"converged": False, "diverged": True, "error": str(e),
"final_f": None, "step_of_min": None, "min_f": None},
})
self._draft_details.append([])
return {"draft_index": len(self._draft_history) - 1,
"compile_error": str(e), "summary": None}
test = auto_test_draft(opt, self._landscape, seed=0, steps=20)
self._draft_history.append({
"code": code,
"compile_error": None,
"summary": test["summary"],
})
self._draft_details.append(test["detail"])
return {"draft_index": len(self._draft_history) - 1,
"compile_error": None, "summary": test["summary"]}
def _do_inspect(self, action: LandscapeforgeAction) -> dict:
idx = action.draft_idx
if idx is None or idx < 0 or idx >= len(self._draft_details):
return {"error": f"draft_idx {idx} out of range (have {len(self._draft_details)} drafts)"}
detail = self._draft_details[idx]
start = action.step_range_start
end = min(action.step_range_end, len(detail))
sliced = detail[start:end]
record = {
"draft_idx": idx,
"step_range": [start, end],
"detail": sliced,
}
self._inspect_requests.append(record)
return {"draft_idx": idx, "step_range": [start, end], "n_steps": len(sliced)}
def _do_commit(self) -> LandscapeforgeObservation:
return self._finalize_episode(reason="commit")
def _force_commit(self, reason: str) -> LandscapeforgeObservation:
return self._finalize_episode(reason=reason)
# ---------- Episode finalization ----------
def _finalize_episode(self, reason: str) -> LandscapeforgeObservation:
assert self._landscape is not None
self._committed = True
# Need a current_draft. If none, produce a worst-case result.
if not self._current_draft:
result = {
"reason": reason,
"no_draft": True,
"final_regret": 1.0,
}
r_total = -1.0
breakdown = {"no_draft": 1.0}
obs = self._make_observation(
last_kind="commit", last_result=result,
done=True, reward=r_total,
)
obs.committed = True
obs.final_regret = 1.0
obs.r_optcoder = r_total
obs.r_optcoder_breakdown = breakdown
self._final_obs = obs
return obs
# Full Phase-D arena eval
try:
opt = compile_optimizer(self._current_draft, dim=self._landscape.dim)
arena = run_arena(opt, self._landscape, seeds=ARENA_SEEDS, steps=ARENA_STEPS)
        except SandboxError:
            # Committed code fails to compile -> worst-case result.
arena = ArenaResult(
initial_values=[1.0] * len(ARENA_SEEDS),
final_values=[float("nan")] * len(ARENA_SEEDS),
crashed=[True] * len(ARENA_SEEDS),
trajectories=[[] for _ in ARENA_SEEDS],
)
# Adam baseline arena for normalization (always run for reward stability).
adam_arena = self._ensure_adam_arena()
novelty = ast_novelty_score(self._current_draft, REFERENCE_SOURCES)
# Convergence step: first seed's trajectory, first step where f < 0.01 * f0
convergence_step = self._compute_convergence_step(arena)
reward = compute_optcoder_reward(
arena=arena,
adam_arena=adam_arena,
actions_used_cost=self._budget_spent,
budget_total=BUDGET_TOTAL,
novelty_score=novelty,
convergence_step=convergence_step,
arena_steps=ARENA_STEPS,
)
result = {
"reason": reason,
"my_mean_progress": arena.mean_progress,
"adam_mean_progress": adam_arena.mean_progress,
"adam_tuned_lr": self._adam_tuned_lr,
"speedup_vs_adam": reward.breakdown.get("speedup_vs_adam"),
"crash_fraction": arena.crash_fraction,
"novelty_score": novelty,
"convergence_step": convergence_step,
}
obs = self._make_observation(
last_kind="commit", last_result=result,
done=True, reward=reward.r_total,
)
obs.committed = True
# `final_regret` is reinterpreted (no f_min dependency): Adam-shortfall
# in [0, 1]. 0 = matched or beat Adam's descent; 1 = made zero progress
# while Adam descended normally. Capped at 1.
speedup = reward.breakdown.get("speedup_vs_adam", 0.0)
obs.final_regret = float(max(0.0, min(1.0, 1.0 - speedup)))
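        # e.g. speedup 1.0 (matched Adam's progress) -> final_regret 0.0;
        # speedup 0.25 -> 0.75; any speedup above 1.0 still clamps to 0.0.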
obs.r_optcoder = reward.r_total
obs.r_optcoder_breakdown = reward.breakdown
self._final_obs = obs
return obs
# ---------- Helpers ----------
def _make_observation(self, last_kind: Optional[str], last_result: dict,
done: bool, reward: float) -> LandscapeforgeObservation:
assert self._landscape is not None
return LandscapeforgeObservation(
landscape_description=self._landscape.description,
dim=self._landscape.dim,
structural_hints=self._hints,
baseline_history=self._serialize_baseline_history(),
draft_history=self._serialize_draft_history(),
inspect_requests=list(self._inspect_requests),
current_draft=self._current_draft,
budget_remaining=BUDGET_TOTAL - self._budget_spent,
last_action_kind=last_kind,
last_action_result=last_result,
done=done,
reward=reward,
)
def _serialize_baseline_history(self) -> list[dict]:
        # Surface optimizer name + the full trajectory (per-step dicts are
        # already JSON-friendly).
return [
{"name": b["name"], "trajectory": b["trajectory"]}
for b in self._baseline_history
]
def _serialize_draft_history(self) -> list[dict]:
# For the observation we include code + summary per draft.
return [
{"code": d["code"], "summary": d["summary"], "compile_error": d["compile_error"]}
for d in self._draft_history
]
def _sample_params(self, template: str) -> dict:
rng = self._master_rng
if template == "quadratic":
            # Condition-number cap grows with tier: T0 100, T1 1,000, T2 10,000.
cap = {"T0": 100.0, "T1": 1000.0, "T2": 10_000.0}[self._tier]
return {"cond": float(rng.uniform(1.0, cap))}
if template == "gaussian_mix":
return {
"k": int(rng.integers(2, 6)),
"sigma": float(rng.uniform(0.3, 1.0)),
"spread": float(rng.uniform(1.0, 4.0)),
}
if template == "huber":
return {"delta": float(rng.uniform(0.5, 2.0))}
return {}
def _ensure_adam_arena(self) -> ArenaResult:
"""Build the Adam baseline, FAIRLY — LR is tuned per landscape before
running the arena. The tuning uses a short 30-step sweep on a dedicated
seed (not one of the arena seeds) to avoid overfitting.
Cached per episode in `_adam_arena_cache`. Tuned LR is stored in
`_adam_tuned_lr` for logging / demo surfacing.
"""
if self._adam_arena_cache is not None:
return self._adam_arena_cache
assert self._landscape is not None
try:
# Tune LR on seed 0 (not in ARENA_SEEDS), 30-step sweep.
tune_rng = np.random.default_rng(0)
tune_x0 = tune_rng.normal(0.0, 0.5, size=self._landscape.dim)
best_lr = tune_adam_lr(
f=self._landscape.f, grad=self._landscape.grad,
x0=tune_x0, sweep_steps=30,
)
self._adam_tuned_lr = best_lr
adam_opt = compile_optimizer(_adam_source(best_lr), dim=self._landscape.dim)
self._adam_arena_cache = run_arena(
adam_opt, self._landscape,
seeds=ARENA_SEEDS, steps=ARENA_STEPS,
)
        except Exception:
            # Tuning or compilation failed: fall back to a zero-progress,
            # all-crashed baseline so reward normalization stays well-defined.
            self._adam_tuned_lr = None
self._adam_arena_cache = ArenaResult(
initial_values=[1.0] * len(ARENA_SEEDS),
final_values=[1.0] * len(ARENA_SEEDS),
crashed=[True] * len(ARENA_SEEDS),
trajectories=[[] for _ in ARENA_SEEDS],
)
return self._adam_arena_cache
    def _compute_convergence_step(self, arena: ArenaResult) -> Optional[int]:
"""First step on first seed where f < 1% of initial f."""
if not arena.trajectories or not arena.trajectories[0]:
return None
        traj = arena.trajectories[0]
        f0 = traj[0]["f"]
        if f0 is None or f0 <= 0:
            return None
        threshold = 0.01 * f0
        for t, snap in enumerate(traj):
            if snap["f"] is not None and snap["f"] < threshold:
                return t
return None
# ---------- Tier advancement API (used by trainer, not agent) ----------
def advance_tier(self, new_tier: str) -> None:
if new_tier not in TIER_MENU:
raise ValueError(f"Unknown tier {new_tier}")
self._tier = new_tier
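
# Trainer-side curriculum sketch (illustrative; the promotion rule and the
# rolling_mean_speedup_vs_adam statistic are the trainer's choice, not part
# of this env):
#
#     if rolling_mean_speedup_vs_adam > 0.8:
#         env.advance_tier("T1")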