| """Arena β run a compiled OptCoder submission on a landscape and score it. | |
| Two entry points: | |
| 1. **`run_arena(opt, landscape, seeds, steps)`** β Phase-D full evaluation. | |
| Run the committed optimizer for ``steps`` iterations from each seed's fresh | |
| random init, collect per-seed trajectories, and return an ``ArenaResult``. | |
| Used at `commit` time to produce the terminal reward. | |
| 2. **`auto_test_draft(opt, landscape, seed, steps)`** β lightweight per-draft | |
| test. Single fixed seed, 20 steps by default. Result is a compact summary | |
| surfaced to the LLM in the next observation so it can decide whether to | |
| refine or commit. | |
| Returned summary from ``auto_test_draft`` has these fields (consumed by the | |
| LLM via the observation, and by the demo UI): | |
| converged True if final_f < 1% of initial_f (a real convergence signal, | |
| not just "code didn't raise"). | |
| diverged True if code raised a SandboxError (NaN, shape mismatch, | |
| timeout, Python exception) OR if final_f blew up to an | |
| astronomical value while the optimizer returned finite arrays | |
| (a.k.a. "silent divergence" β e.g. LR too high, f grows | |
| geometrically but no step() call errors). | |
| error The SandboxError message if one was raised, else None. | |
| initial_f f(x0) β the value before any step was taken. | |
| final_f f(x_N) β the value at step N. This is the single most | |
| important diagnostic; an agent looking at final_f=5e13 knows | |
| its optimizer exploded even though the code compiled. | |
| min_f min over all visited points (catches transient improvement | |
| that was then lost). | |
| step_of_min index (0..N-1) of the step at which min_f was achieved. | |
| A draft with ``diverged=True`` always scores ``r_eval_failures > 0`` when | |
| committed β the ``compute_optcoder_reward`` pipeline treats exploded final_f | |
| the same as a hard crash. | |
| """ | |

from dataclasses import dataclass

import numpy as np

try:
    from .landscapes import Landscape
    from .sandbox import CompiledOptimizer, SandboxError
except ImportError:  # flat layout (HF Space container)
    from landscapes import Landscape  # type: ignore
    from sandbox import CompiledOptimizer, SandboxError  # type: ignore


@dataclass
class ArenaResult:
    initial_values: list[float]     # per-seed f(x_0)
    final_values: list[float]       # per-seed f(x_N); NaN if crashed
    crashed: list[bool]             # per-seed
    trajectories: list[list[dict]]  # per-seed trajectories (may be empty)

    def mean_progress(self) -> float:
        """Mean descent: f_initial - f_final, averaged across all seeds.

        Positive = optimizer descended; 0 = stayed put; negative = went
        uphill. Crashed or non-finite seeds count as 0 progress
        (conservative).
        """
        prog: list[float] = []
        for init, fin, crashed in zip(self.initial_values, self.final_values,
                                      self.crashed):
            if crashed or not np.isfinite(fin):
                prog.append(0.0)
            else:
                prog.append(init - fin)
        return float(np.mean(prog)) if prog else 0.0

    def mean_initial_scale(self) -> float:
        """|mean initial f|; used to establish a denominator floor when Adam
        itself makes near-zero progress (rare but possible on plateaus)."""
        vals = [abs(v) for v in self.initial_values if np.isfinite(v)]
        return float(np.mean(vals)) if vals else 1.0

    def crash_fraction(self) -> float:
        return float(np.mean(self.crashed)) if self.crashed else 0.0

    def robustness(self) -> float:
        """1 - std/|mean|, clamped to [0, 1]. High = consistent across seeds."""
        vals = [v for v in self.final_values if np.isfinite(v)]
        if len(vals) < 2:
            return 0.0
        m = np.mean(vals)
        s = np.std(vals)
        if abs(m) < 1e-9:
            return 1.0 if s < 1e-6 else 0.0
        return float(np.clip(1.0 - s / (abs(m) + 1e-9), 0.0, 1.0))
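
    # robustness() worked example (hypothetical numbers, illustration only):
    #   final_values = [1.0, 1.1, 0.9] -> mean 1.0, std ~0.082
    #       -> robustness ~0.92 (consistent across seeds)
    #   final_values = [1.0, 10.0, 0.1] -> mean 3.7, std ~4.5
    #       -> 1 - 4.5/3.7 is negative -> clipped to 0.0 (seed-dependent)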


def run_arena(optimizer: CompiledOptimizer, ls: Landscape,
              seeds: list[int], steps: int = 200,
              init_scale: float = 0.5) -> ArenaResult:
    """Run the compiled optimizer from fresh seeds; capture per-run metrics.

    Does NOT depend on `ls.f_min`: per-seed progress is
    `f_initial - f_final`, which is observable regardless of whether the
    global minimum is known.
    """
    initials, finals, crashed, trajs = [], [], [], []
    for seed in seeds:
        rng = np.random.default_rng(seed)
        x = rng.normal(0.0, init_scale, size=ls.dim)
        f0 = float(ls.f(x))
        initials.append(f0)
        traj: list[dict] = []
        did_crash = False
        try:
            for t in range(steps):
                fv = float(ls.f(x))
                g = np.asarray(ls.grad(x), dtype=float)
                traj.append({"t": t, "x": x.tolist(), "f": fv})
                x = optimizer.step(x, fv, g)
        except SandboxError:
            did_crash = True
        if did_crash:
            finals.append(float("nan"))
        else:
            finals.append(float(ls.f(x)))
        crashed.append(did_crash)
        trajs.append(traj)
    return ArenaResult(
        initial_values=initials,
        final_values=finals,
        crashed=crashed,
        trajectories=trajs,
    )


#: final_f above this (absolute value) is treated as silent divergence even
#: if every step() call returned finite values. Picked so genuinely bad
#: convergence on stiff landscapes (f around 1e6) still counts as a valid
#: run, but runaway growth (f around 1e10+) gets flagged.
DIVERGENCE_F_THRESHOLD = 1e10
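
# Back-of-envelope illustration (hypothetical numbers, not from the reward
# pipeline): plain gradient descent on f(x) = x**2 updates
# x' = x - lr * 2x = (1 - 2*lr) * x, so any lr > 1 diverges. At lr = 2 the
# iterate triples in magnitude each step, f grows ~9x per step, and from
# f0 ~ 1 it crosses the 1e10 threshold in about 11 steps - comfortably
# inside the 20-step auto_test_draft window.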


def auto_test_draft(optimizer: CompiledOptimizer, ls: Landscape,
                    seed: int = 0, steps: int = 20,
                    init_scale: float = 0.5) -> dict:
    """Single-seed quick test used at draft() time.

    Runs the optimizer for ``steps`` steps from a fixed seed, returning a
    summary (for the LLM) plus full per-step detail (for ``inspect``
    follow-ups).

    Silent-divergence detection: if the code raises, we flag diverged=True
    as usual. But we ALSO flag diverged when the optimizer returned finite
    arrays yet the landscape's f(x) blew past ``DIVERGENCE_F_THRESHOLD``:
    the common "LR too high, f grows geometrically, no errors" case.
    """
    rng = np.random.default_rng(seed)
    x = rng.normal(0.0, init_scale, size=ls.dim)
    x0 = x.copy()
    detail: list[dict] = []
    diverged = False
    err: str | None = None
    try:
        for t in range(steps):
            fv = float(ls.f(x))
            g = np.asarray(ls.grad(x), dtype=float)
            gn = float(np.linalg.norm(g))
            prev_x = x.copy()
            x = optimizer.step(x, fv, g)
            update_norm = float(np.linalg.norm(x - prev_x))
            step_size = update_norm / (gn + 1e-12)
            detail.append({
                "t": t, "x": x.tolist(), "f": float(ls.f(x)),
                "grad_norm": gn, "update_norm": update_norm,
                "step_size_eff": step_size,
            })
            # Silent-divergence guard: f exploded even though step() didn't
            # raise.
            if abs(detail[-1]["f"]) > DIVERGENCE_F_THRESHOLD:
                diverged = True
                err = (f"silent divergence: |f| = {detail[-1]['f']:.3g} "
                       f"> {DIVERGENCE_F_THRESHOLD:.0e} at step {t}")
                break
    except SandboxError as e:
        diverged = True
        err = str(e)
    if diverged or not detail:
        summary = {
            "converged": False, "diverged": True, "error": err,
            "final_f": None, "initial_f": float(ls.f(x0)),
            "step_of_min": None, "min_f": None,
        }
    else:
        fs = [d["f"] for d in detail]
        step_of_min = int(np.argmin(fs))
        summary = {
            "converged": bool(fs[-1] < 0.1 * ls.f(x0)),
            "diverged": False, "error": None,
            "final_f": fs[-1], "initial_f": float(ls.f(x0)),
            "step_of_min": step_of_min, "min_f": min(fs),
        }
    return {"summary": summary, "detail": detail}