landscapeforge / arena.py
mnawfal29's picture
Upload folder using huggingface_hub
901a0ed verified
"""Arena β€” run a compiled OptCoder submission on a landscape and score it.
Two entry points:
1. **`run_arena(opt, landscape, seeds, steps)`** β€” Phase-D full evaluation.
Run the committed optimizer for ``steps`` iterations from each seed's fresh
random init, collect per-seed trajectories, and return an ``ArenaResult``.
Used at `commit` time to produce the terminal reward.
2. **`auto_test_draft(opt, landscape, seed, steps)`** β€” lightweight per-draft
test. Single fixed seed, 20 steps by default. Result is a compact summary
surfaced to the LLM in the next observation so it can decide whether to
refine or commit.
Returned summary from ``auto_test_draft`` has these fields (consumed by the
LLM via the observation, and by the demo UI):
converged True if final_f < 10% of initial_f (a real convergence signal,
not just "code didn't raise").
diverged True if code raised a SandboxError (NaN, shape mismatch,
timeout, Python exception) OR if final_f blew up to an
astronomical value while the optimizer returned finite arrays
(a.k.a. "silent divergence" β€” e.g. LR too high, f grows
geometrically but no step() call errors).
error The SandboxError message if one was raised, else None.
initial_f f(x0) β€” the value before any step was taken.
final_f f(x_N) β€” the value at step N. This is the single most
important diagnostic; an agent looking at final_f=5e13 knows
its optimizer exploded even though the code compiled.
min_f min over all visited points (catches transient improvement
that was then lost).
step_of_min index (0..N-1) of the step at which min_f was achieved.
A draft with ``diverged=True`` always scores ``r_eval_failures > 0`` when
committed β€” the ``compute_optcoder_reward`` pipeline treats exploded final_f
the same as a hard crash.
"""
from dataclasses import dataclass
import numpy as np
try:
from .landscapes import Landscape
from .sandbox import CompiledOptimizer, SandboxError
except ImportError: # flat layout (HF Space container)
from landscapes import Landscape # type: ignore
from sandbox import CompiledOptimizer, SandboxError # type: ignore
@dataclass
class ArenaResult:
    """Per-seed outcome of a full arena evaluation (see ``run_arena``)."""
    initial_values: list[float]     # per-seed f(x_0)
    final_values: list[float]       # per-seed f(x_N); NaN if the seed crashed
    crashed: list[bool]             # per-seed crash flag
    trajectories: list[list[dict]]  # per-seed trajectories (may be empty)

    @property
    def mean_progress(self) -> float:
        """Mean descent ``f_initial - f_final`` averaged over all seeds.

        Positive = optimizer descended; 0 = stayed put; negative = uphill.
        A crashed seed (or one with a non-finite final value) contributes
        0 progress — the conservative choice.
        """
        per_seed = [
            0.0 if bad or not np.isfinite(end) else start - end
            for start, end, bad in zip(self.initial_values,
                                       self.final_values, self.crashed)
        ]
        return float(np.mean(per_seed)) if per_seed else 0.0

    @property
    def mean_initial_scale(self) -> float:
        """|mean initial f|; used to establish a denominator floor when Adam
        itself makes near-zero progress (rare but possible on plateaus)."""
        magnitudes = [abs(v) for v in self.initial_values if np.isfinite(v)]
        if not magnitudes:
            return 1.0
        return float(np.mean(magnitudes))

    @property
    def crash_fraction(self) -> float:
        """Fraction of seeds that crashed; 0.0 when there are no seeds."""
        if not self.crashed:
            return 0.0
        return float(np.mean(self.crashed))

    @property
    def robustness(self) -> float:
        """1 - std/|mean| over finite final values, clamped to [0, 1].

        High = consistent results across seeds. Fewer than two finite
        finals gives 0 (no spread to measure); a near-zero mean is treated
        as perfectly robust only when the spread is also near zero.
        """
        finite = [v for v in self.final_values if np.isfinite(v)]
        if len(finite) < 2:
            return 0.0
        center = np.mean(finite)
        spread = np.std(finite)
        if abs(center) < 1e-9:
            return 1.0 if spread < 1e-6 else 0.0
        return float(np.clip(1.0 - spread / (abs(center) + 1e-9), 0.0, 1.0))
def run_arena(optimizer: CompiledOptimizer, ls: Landscape,
              seeds: list[int], steps: int = 200,
              init_scale: float = 0.5) -> ArenaResult:
    """Run the compiled optimizer from fresh seeds; capture per-run metrics.

    Does NOT depend on `ls.f_min` — per-seed progress is `f_initial - f_final`,
    which is observable regardless of whether the global minimum is known.
    """
    result = ArenaResult(initial_values=[], final_values=[],
                         crashed=[], trajectories=[])
    for seed in seeds:
        generator = np.random.default_rng(seed)
        point = generator.normal(0.0, init_scale, size=ls.dim)
        result.initial_values.append(float(ls.f(point)))
        history: list[dict] = []
        failed = False
        try:
            for step_idx in range(steps):
                value = float(ls.f(point))
                gradient = np.asarray(ls.grad(point), dtype=float)
                history.append({"t": step_idx, "x": point.tolist(), "f": value})
                point = optimizer.step(point, value, gradient)
        except SandboxError:
            # Any sandbox failure (NaN, shape mismatch, timeout, exception)
            # marks this seed as crashed; final value becomes NaN below.
            failed = True
        result.final_values.append(float("nan") if failed else float(ls.f(point)))
        result.crashed.append(failed)
        result.trajectories.append(history)
    return result
#: final_f above this (absolute value) is treated as silent divergence even
#: if every step() call returned finite values. Picked so genuinely bad
#: convergence on stiff landscapes (f around 1e6) still counts as a valid
#: run, but runaway growth (f around 1e10+) gets flagged.
DIVERGENCE_F_THRESHOLD = 1e10
def auto_test_draft(optimizer: CompiledOptimizer, ls: Landscape,
seed: int = 0, steps: int = 20, init_scale: float = 0.5) -> dict:
"""Single-seed quick test used at draft() time.
Runs the optimizer for ``steps`` steps from a fixed seed, returning a
summary (for the LLM) + full per-step detail (for ``inspect`` follow-ups).
Silent-divergence detection: if the code raises, we flag diverged=True as
usual. But we ALSO flag diverged when the optimizer returned finite arrays
yet the landscape's f(x) blew past ``DIVERGENCE_F_THRESHOLD`` β€” the common
"LR too high, f grows geometrically, no errors" case.
"""
rng = np.random.default_rng(seed)
x = rng.normal(0.0, init_scale, size=ls.dim)
x0 = x.copy()
detail: list[dict] = []
diverged = False
err: str | None = None
try:
for t in range(steps):
fv = float(ls.f(x))
g = np.asarray(ls.grad(x), dtype=float)
gn = float(np.linalg.norm(g))
prev_x = x.copy()
x = optimizer.step(x, fv, g)
update_norm = float(np.linalg.norm(x - prev_x))
step_size = update_norm / (gn + 1e-12)
detail.append({
"t": t, "x": x.tolist(), "f": float(ls.f(x)),
"grad_norm": gn, "update_norm": update_norm, "step_size_eff": step_size,
})
# Silent-divergence guard β€” f exploded even though step() didn't raise
if abs(detail[-1]["f"]) > DIVERGENCE_F_THRESHOLD:
diverged = True
err = (f"silent divergence: |f| = {detail[-1]['f']:.3g} "
f"> {DIVERGENCE_F_THRESHOLD:.0e} at step {t}")
break
except SandboxError as e:
diverged = True
err = str(e)
if diverged or not detail:
summary = {
"converged": False, "diverged": True, "error": err,
"final_f": None, "initial_f": float(ls.f(x0)),
"step_of_min": None, "min_f": None,
}
else:
fs = [d["f"] for d in detail]
step_of_min = int(np.argmin(fs))
summary = {
"converged": bool(fs[-1] < 0.1 * ls.f(x0)),
"diverged": False, "error": None,
"final_f": fs[-1], "initial_f": float(ls.f(x0)),
"step_of_min": step_of_min, "min_f": min(fs),
}
return {"summary": summary, "detail": detail}