"""Arena — run a compiled OptCoder submission on a landscape and score it. Two entry points: 1. **`run_arena(opt, landscape, seeds, steps)`** — Phase-D full evaluation. Run the committed optimizer for ``steps`` iterations from each seed's fresh random init, collect per-seed trajectories, and return an ``ArenaResult``. Used at `commit` time to produce the terminal reward. 2. **`auto_test_draft(opt, landscape, seed, steps)`** — lightweight per-draft test. Single fixed seed, 20 steps by default. Result is a compact summary surfaced to the LLM in the next observation so it can decide whether to refine or commit. Returned summary from ``auto_test_draft`` has these fields (consumed by the LLM via the observation, and by the demo UI): converged True if final_f < 1% of initial_f (a real convergence signal, not just "code didn't raise"). diverged True if code raised a SandboxError (NaN, shape mismatch, timeout, Python exception) OR if final_f blew up to an astronomical value while the optimizer returned finite arrays (a.k.a. "silent divergence" — e.g. LR too high, f grows geometrically but no step() call errors). error The SandboxError message if one was raised, else None. initial_f f(x0) — the value before any step was taken. final_f f(x_N) — the value at step N. This is the single most important diagnostic; an agent looking at final_f=5e13 knows its optimizer exploded even though the code compiled. min_f min over all visited points (catches transient improvement that was then lost). step_of_min index (0..N-1) of the step at which min_f was achieved. A draft with ``diverged=True`` always scores ``r_eval_failures > 0`` when committed — the ``compute_optcoder_reward`` pipeline treats exploded final_f the same as a hard crash. """ from dataclasses import dataclass import numpy as np try: from .landscapes import Landscape from .sandbox import CompiledOptimizer, SandboxError except ImportError: # flat layout (HF Space container) from landscapes import Landscape # type: ignore from sandbox import CompiledOptimizer, SandboxError # type: ignore @dataclass class ArenaResult: initial_values: list[float] # per-seed f(x_0) final_values: list[float] # per-seed f(x_N); NaN if crashed crashed: list[bool] # per-seed trajectories: list[list[dict]] # per-seed trajectories (may be empty) @property def mean_progress(self) -> float: """Mean descent: f_initial - f_final, averaged across non-crashed seeds. Positive = optimizer descended; 0 = stayed put; negative = went uphill. Crashed seeds count as 0 progress (conservative). """ prog: list[float] = [] for init, fin, crashed in zip(self.initial_values, self.final_values, self.crashed): if crashed or not np.isfinite(fin): prog.append(0.0) else: prog.append(init - fin) return float(np.mean(prog)) if prog else 0.0 @property def mean_initial_scale(self) -> float: """|mean initial f|; used to establish a denominator floor when Adam itself makes near-zero progress (rare but possible on plateaus).""" vals = [abs(v) for v in self.initial_values if np.isfinite(v)] return float(np.mean(vals)) if vals else 1.0 @property def crash_fraction(self) -> float: return float(np.mean(self.crashed)) if self.crashed else 0.0 @property def robustness(self) -> float: """1 - std/|mean|, clamped to [0, 1]. High = consistent across seeds.""" vals = [v for v in self.final_values if np.isfinite(v)] if len(vals) < 2: return 0.0 m = np.mean(vals) s = np.std(vals) if abs(m) < 1e-9: return 1.0 if s < 1e-6 else 0.0 return float(np.clip(1.0 - s / (abs(m) + 1e-9), 0.0, 1.0)) def run_arena(optimizer: CompiledOptimizer, ls: Landscape, seeds: list[int], steps: int = 200, init_scale: float = 0.5) -> ArenaResult: """Run the compiled optimizer from fresh seeds; capture per-run metrics. Does NOT depend on `ls.f_min` — per-seed progress is `f_initial - f_final`, which is observable regardless of whether the global minimum is known. """ initials, finals, crashed, trajs = [], [], [], [] for seed in seeds: rng = np.random.default_rng(seed) x = rng.normal(0.0, init_scale, size=ls.dim) f0 = float(ls.f(x)) initials.append(f0) traj: list[dict] = [] did_crash = False try: for t in range(steps): fv = float(ls.f(x)) g = np.asarray(ls.grad(x), dtype=float) traj.append({"t": t, "x": x.tolist(), "f": fv}) x = optimizer.step(x, fv, g) except SandboxError: did_crash = True if did_crash: finals.append(float("nan")) else: finals.append(float(ls.f(x))) crashed.append(did_crash) trajs.append(traj) return ArenaResult( initial_values=initials, final_values=finals, crashed=crashed, trajectories=trajs, ) #: final_f above this (absolute value) is treated as silent divergence even #: if every step() call returned finite values. Picked so genuinely bad #: convergence on stiff landscapes (f around 1e6) still counts as a valid #: run, but runaway growth (f around 1e10+) gets flagged. DIVERGENCE_F_THRESHOLD = 1e10 def auto_test_draft(optimizer: CompiledOptimizer, ls: Landscape, seed: int = 0, steps: int = 20, init_scale: float = 0.5) -> dict: """Single-seed quick test used at draft() time. Runs the optimizer for ``steps`` steps from a fixed seed, returning a summary (for the LLM) + full per-step detail (for ``inspect`` follow-ups). Silent-divergence detection: if the code raises, we flag diverged=True as usual. But we ALSO flag diverged when the optimizer returned finite arrays yet the landscape's f(x) blew past ``DIVERGENCE_F_THRESHOLD`` — the common "LR too high, f grows geometrically, no errors" case. """ rng = np.random.default_rng(seed) x = rng.normal(0.0, init_scale, size=ls.dim) x0 = x.copy() detail: list[dict] = [] diverged = False err: str | None = None try: for t in range(steps): fv = float(ls.f(x)) g = np.asarray(ls.grad(x), dtype=float) gn = float(np.linalg.norm(g)) prev_x = x.copy() x = optimizer.step(x, fv, g) update_norm = float(np.linalg.norm(x - prev_x)) step_size = update_norm / (gn + 1e-12) detail.append({ "t": t, "x": x.tolist(), "f": float(ls.f(x)), "grad_norm": gn, "update_norm": update_norm, "step_size_eff": step_size, }) # Silent-divergence guard — f exploded even though step() didn't raise if abs(detail[-1]["f"]) > DIVERGENCE_F_THRESHOLD: diverged = True err = (f"silent divergence: |f| = {detail[-1]['f']:.3g} " f"> {DIVERGENCE_F_THRESHOLD:.0e} at step {t}") break except SandboxError as e: diverged = True err = str(e) if diverged or not detail: summary = { "converged": False, "diverged": True, "error": err, "final_f": None, "initial_f": float(ls.f(x0)), "step_of_min": None, "min_f": None, } else: fs = [d["f"] for d in detail] step_of_min = int(np.argmin(fs)) summary = { "converged": bool(fs[-1] < 0.1 * ls.f(x0)), "diverged": False, "error": None, "final_f": fs[-1], "initial_f": float(ls.f(x0)), "step_of_min": step_of_min, "min_f": min(fs), } return {"summary": summary, "detail": detail}