landscapeforge / arena.py
mnawfal29's picture
Upload folder using huggingface_hub
901a0ed verified
"""Arena β€” run a compiled OptCoder submission on a landscape and score it.
Two entry points:
1. **`run_arena(opt, landscape, seeds, steps)`** β€” Phase-D full evaluation.
Run the committed optimizer for ``steps`` iterations from each seed's fresh
random init, collect per-seed trajectories, and return an ``ArenaResult``.
Used at `commit` time to produce the terminal reward.
2. **`auto_test_draft(opt, landscape, seed, steps)`** β€” lightweight per-draft
test. Single fixed seed, 20 steps by default. Result is a compact summary
surfaced to the LLM in the next observation so it can decide whether to
refine or commit.
Returned summary from ``auto_test_draft`` has these fields (consumed by the
LLM via the observation, and by the demo UI):
converged True if final_f < 10% of initial_f (a real convergence signal,
not just "code didn't raise").
diverged True if code raised a SandboxError (NaN, shape mismatch,
timeout, Python exception) OR if final_f blew up to an
astronomical value while the optimizer returned finite arrays
(a.k.a. "silent divergence" β€” e.g. LR too high, f grows
geometrically but no step() call errors).
error The SandboxError message if one was raised, else None.
initial_f f(x0) β€” the value before any step was taken.
final_f f(x_N) β€” the value at step N. This is the single most
important diagnostic; an agent looking at final_f=5e13 knows
its optimizer exploded even though the code compiled.
min_f min over all visited points (catches transient improvement
that was then lost).
step_of_min index (0..N-1) of the step at which min_f was achieved.
A draft with ``diverged=True`` always scores ``r_eval_failures > 0`` when
committed β€” the ``compute_optcoder_reward`` pipeline treats exploded final_f
the same as a hard crash.
"""
from dataclasses import dataclass
import numpy as np
try:
from .landscapes import Landscape
from .sandbox import CompiledOptimizer, SandboxError
except ImportError: # flat layout (HF Space container)
from landscapes import Landscape # type: ignore
from sandbox import CompiledOptimizer, SandboxError # type: ignore
@dataclass
class ArenaResult:
    """Per-seed outcome of a full arena evaluation (see ``run_arena``)."""
    initial_values: list[float]     # per-seed f(x_0)
    final_values: list[float]       # per-seed f(x_N); NaN if the seed crashed
    crashed: list[bool]             # per-seed crash flag
    trajectories: list[list[dict]]  # per-seed trajectories (may be empty)

    @property
    def mean_progress(self) -> float:
        """Mean descent ``f_initial - f_final`` averaged over all seeds.

        Positive = optimizer descended; 0 = stayed put; negative = uphill.
        A crashed seed (or one with a non-finite final value) contributes
        0 progress — the conservative choice.
        """
        per_seed = [
            0.0 if bad or not np.isfinite(end) else start - end
            for start, end, bad in zip(self.initial_values,
                                       self.final_values, self.crashed)
        ]
        return float(np.mean(per_seed)) if per_seed else 0.0

    @property
    def mean_initial_scale(self) -> float:
        """|mean initial f|; used to establish a denominator floor when Adam
        itself makes near-zero progress (rare but possible on plateaus)."""
        magnitudes = [abs(v) for v in self.initial_values if np.isfinite(v)]
        if not magnitudes:
            return 1.0
        return float(np.mean(magnitudes))

    @property
    def crash_fraction(self) -> float:
        """Fraction of seeds that crashed; 0.0 when there are no seeds."""
        if not self.crashed:
            return 0.0
        return float(np.mean(self.crashed))

    @property
    def robustness(self) -> float:
        """1 - std/|mean| over finite final values, clamped to [0, 1].

        High = consistent results across seeds. Fewer than two finite
        finals gives 0 (no spread to measure); a near-zero mean is treated
        as perfectly robust only when the spread is also near zero.
        """
        finite = [v for v in self.final_values if np.isfinite(v)]
        if len(finite) < 2:
            return 0.0
        center = np.mean(finite)
        spread = np.std(finite)
        if abs(center) < 1e-9:
            return 1.0 if spread < 1e-6 else 0.0
        return float(np.clip(1.0 - spread / (abs(center) + 1e-9), 0.0, 1.0))
def run_arena(optimizer: CompiledOptimizer, ls: Landscape,
              seeds: list[int], steps: int = 200,
              init_scale: float = 0.5) -> ArenaResult:
    """Run the compiled optimizer from fresh seeds; capture per-run metrics.

    Does NOT depend on `ls.f_min` — per-seed progress is `f_initial - f_final`,
    which is observable regardless of whether the global minimum is known.
    """
    result = ArenaResult(initial_values=[], final_values=[],
                         crashed=[], trajectories=[])
    for seed in seeds:
        generator = np.random.default_rng(seed)
        point = generator.normal(0.0, init_scale, size=ls.dim)
        result.initial_values.append(float(ls.f(point)))
        history: list[dict] = []
        failed = False
        try:
            for step_idx in range(steps):
                value = float(ls.f(point))
                gradient = np.asarray(ls.grad(point), dtype=float)
                history.append({"t": step_idx, "x": point.tolist(), "f": value})
                point = optimizer.step(point, value, gradient)
        except SandboxError:
            # Any sandbox failure (NaN, shape mismatch, timeout, exception)
            # marks this seed as crashed; final value becomes NaN below.
            failed = True
        result.final_values.append(float("nan") if failed else float(ls.f(point)))
        result.crashed.append(failed)
        result.trajectories.append(history)
    return result
#: final_f above this (absolute value) is treated as silent divergence even
#: if every step() call returned finite values. Picked so genuinely bad
#: convergence on stiff landscapes (f around 1e6) still counts as a valid
#: run, but runaway growth (f around 1e10+) gets flagged.
DIVERGENCE_F_THRESHOLD = 1e10
def auto_test_draft(optimizer: CompiledOptimizer, ls: Landscape,
seed: int = 0, steps: int = 20, init_scale: float = 0.5) -> dict:
"""Single-seed quick test used at draft() time.
Runs the optimizer for ``steps`` steps from a fixed seed, returning a
summary (for the LLM) + full per-step detail (for ``inspect`` follow-ups).
Silent-divergence detection: if the code raises, we flag diverged=True as
usual. But we ALSO flag diverged when the optimizer returned finite arrays
yet the landscape's f(x) blew past ``DIVERGENCE_F_THRESHOLD`` β€” the common
"LR too high, f grows geometrically, no errors" case.
"""
rng = np.random.default_rng(seed)
x = rng.normal(0.0, init_scale, size=ls.dim)
x0 = x.copy()
detail: list[dict] = []
diverged = False
err: str | None = None
try:
for t in range(steps):
fv = float(ls.f(x))
g = np.asarray(ls.grad(x), dtype=float)
gn = float(np.linalg.norm(g))
prev_x = x.copy()
x = optimizer.step(x, fv, g)
update_norm = float(np.linalg.norm(x - prev_x))
step_size = update_norm / (gn + 1e-12)
detail.append({
"t": t, "x": x.tolist(), "f": float(ls.f(x)),
"grad_norm": gn, "update_norm": update_norm, "step_size_eff": step_size,
})
# Silent-divergence guard β€” f exploded even though step() didn't raise
if abs(detail[-1]["f"]) > DIVERGENCE_F_THRESHOLD:
diverged = True
err = (f"silent divergence: |f| = {detail[-1]['f']:.3g} "
f"> {DIVERGENCE_F_THRESHOLD:.0e} at step {t}")
break
except SandboxError as e:
diverged = True
err = str(e)
if diverged or not detail:
summary = {
"converged": False, "diverged": True, "error": err,
"final_f": None, "initial_f": float(ls.f(x0)),
"step_of_min": None, "min_f": None,
}
else:
fs = [d["f"] for d in detail]
step_of_min = int(np.argmin(fs))
summary = {
"converged": bool(fs[-1] < 0.1 * ls.f(x0)),
"diverged": False, "error": None,
"final_f": fs[-1], "initial_f": float(ls.f(x0)),
"step_of_min": step_of_min, "min_f": min(fs),
}
return {"summary": summary, "detail": detail}