"""Arena: run a compiled OptCoder submission on a landscape and score it.
Two entry points:
1. **`run_arena(opt, landscape, seeds, steps)`** -- Phase-D full evaluation.
   Runs the committed optimizer for ``steps`` iterations from each seed's
   fresh random init, collects per-seed trajectories, and returns an
   ``ArenaResult``. Used at `commit` time to produce the terminal reward.
2. **`auto_test_draft(opt, landscape, seed, steps)`** -- lightweight per-draft
   test: a single fixed seed, 20 steps by default. The result is a compact
   summary surfaced to the LLM in the next observation so it can decide
   whether to refine or commit.
The summary returned by ``auto_test_draft`` has these fields (consumed by the
LLM via the observation, and by the demo UI):

    converged    True if final_f < 10% of initial_f (a real convergence
                 signal, not just "code didn't raise").
    diverged     True if the code raised a SandboxError (NaN, shape mismatch,
                 timeout, Python exception) OR if final_f blew up to an
                 astronomical value while the optimizer kept returning finite
                 arrays (a.k.a. "silent divergence": LR too high, f grows
                 geometrically, but no step() call ever errors).
    error        The SandboxError message if one was raised, else None.
    initial_f    f(x0), the value before any step was taken.
    final_f      f(x_N), the value at step N. This is the single most
                 important diagnostic: an agent seeing final_f=5e13 knows its
                 optimizer exploded even though the code compiled.
    min_f        Minimum over all visited points (catches transient
                 improvement that was then lost).
    step_of_min  Index (0..N-1) of the step at which min_f was achieved.

A draft with ``diverged=True`` always scores ``r_eval_failures > 0`` when
committed: the ``compute_optcoder_reward`` pipeline treats an exploded
final_f the same as a hard crash.
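
A minimal usage sketch (illustrative; ``opt`` is a ``CompiledOptimizer`` and
``ls`` a ``Landscape`` from the sibling modules)::

    report = auto_test_draft(opt, ls, seed=0, steps=20)
    if not report["summary"]["diverged"]:
        result = run_arena(opt, ls, seeds=[0, 1, 2], steps=200)
        print(result.mean_progress, result.crash_fraction)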
"""
from dataclasses import dataclass
import numpy as np
try:
from .landscapes import Landscape
from .sandbox import CompiledOptimizer, SandboxError
except ImportError: # flat layout (HF Space container)
from landscapes import Landscape # type: ignore
from sandbox import CompiledOptimizer, SandboxError # type: ignore
@dataclass
class ArenaResult:
initial_values: list[float] # per-seed f(x_0)
final_values: list[float] # per-seed f(x_N); NaN if crashed
crashed: list[bool] # per-seed
trajectories: list[list[dict]] # per-seed trajectories (may be empty)
@property
def mean_progress(self) -> float:
"""Mean descent: f_initial - f_final, averaged across non-crashed seeds.
Positive = optimizer descended; 0 = stayed put; negative = went uphill.
Crashed seeds count as 0 progress (conservative).
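
        Illustrative doctest (synthetic values, not from a real run):

        >>> r = ArenaResult(initial_values=[4.0, 4.0],
        ...                 final_values=[1.0, float("nan")],
        ...                 crashed=[False, True],
        ...                 trajectories=[[], []])
        >>> r.mean_progress  # (4 - 1) for seed 0, 0.0 for the crashed seed
        1.5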
"""
prog: list[float] = []
for init, fin, crashed in zip(self.initial_values, self.final_values,
self.crashed):
if crashed or not np.isfinite(fin):
prog.append(0.0)
else:
prog.append(init - fin)
return float(np.mean(prog)) if prog else 0.0
@property
def mean_initial_scale(self) -> float:
"""|mean initial f|; used to establish a denominator floor when Adam
itself makes near-zero progress (rare but possible on plateaus)."""
vals = [abs(v) for v in self.initial_values if np.isfinite(v)]
return float(np.mean(vals)) if vals else 1.0
@property
def crash_fraction(self) -> float:
return float(np.mean(self.crashed)) if self.crashed else 0.0
@property
def robustness(self) -> float:
"""1 - std/|mean|, clamped to [0, 1]. High = consistent across seeds."""
vals = [v for v in self.final_values if np.isfinite(v)]
if len(vals) < 2:
return 0.0
m = np.mean(vals)
s = np.std(vals)
if abs(m) < 1e-9:
return 1.0 if s < 1e-6 else 0.0
return float(np.clip(1.0 - s / (abs(m) + 1e-9), 0.0, 1.0))
def run_arena(optimizer: CompiledOptimizer, ls: Landscape,
seeds: list[int], steps: int = 200,
init_scale: float = 0.5) -> ArenaResult:
"""Run the compiled optimizer from fresh seeds; capture per-run metrics.
    Does NOT depend on `ls.f_min`: per-seed progress is `f_initial - f_final`,
which is observable regardless of whether the global minimum is known.
"""
initials, finals, crashed, trajs = [], [], [], []
for seed in seeds:
rng = np.random.default_rng(seed)
x = rng.normal(0.0, init_scale, size=ls.dim)
f0 = float(ls.f(x))
initials.append(f0)
traj: list[dict] = []
did_crash = False
try:
for t in range(steps):
fv = float(ls.f(x))
g = np.asarray(ls.grad(x), dtype=float)
traj.append({"t": t, "x": x.tolist(), "f": fv})
x = optimizer.step(x, fv, g)
except SandboxError:
did_crash = True
if did_crash:
finals.append(float("nan"))
else:
finals.append(float(ls.f(x)))
crashed.append(did_crash)
trajs.append(traj)
return ArenaResult(
initial_values=initials,
final_values=finals,
crashed=crashed,
trajectories=trajs,
)
#: final_f above this (absolute value) is treated as silent divergence even
#: if every step() call returned finite values. Picked so genuinely bad
#: convergence on stiff landscapes (f around 1e6) still counts as a valid
#: run, but runaway growth (f around 1e10+) gets flagged.
DIVERGENCE_F_THRESHOLD = 1e10
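
# Worked example of "silent divergence" (illustrative): plain gradient descent
# with lr = 1.0 on f(x) = 50 * x**2 has grad = 100 * x, so each step maps
# x -> -99 * x and f multiplies by 99**2 ~ 9.8e3; starting from f = 1 it
# crosses the 1e10 threshold within three steps, well inside a 20-step test.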
def auto_test_draft(optimizer: CompiledOptimizer, ls: Landscape,
seed: int = 0, steps: int = 20, init_scale: float = 0.5) -> dict:
"""Single-seed quick test used at draft() time.
Runs the optimizer for ``steps`` steps from a fixed seed, returning a
summary (for the LLM) + full per-step detail (for ``inspect`` follow-ups).
Silent-divergence detection: if the code raises, we flag diverged=True as
usual. But we ALSO flag diverged when the optimizer returned finite arrays
    yet the landscape's f(x) blew past ``DIVERGENCE_F_THRESHOLD`` -- the common
    "LR too high, f grows geometrically, no errors" case.
"""
rng = np.random.default_rng(seed)
x = rng.normal(0.0, init_scale, size=ls.dim)
x0 = x.copy()
detail: list[dict] = []
diverged = False
err: str | None = None
try:
for t in range(steps):
fv = float(ls.f(x))
g = np.asarray(ls.grad(x), dtype=float)
gn = float(np.linalg.norm(g))
prev_x = x.copy()
x = optimizer.step(x, fv, g)
update_norm = float(np.linalg.norm(x - prev_x))
step_size = update_norm / (gn + 1e-12)
detail.append({
"t": t, "x": x.tolist(), "f": float(ls.f(x)),
"grad_norm": gn, "update_norm": update_norm, "step_size_eff": step_size,
})
            # Silent-divergence guard -- f exploded even though step() didn't raise
if abs(detail[-1]["f"]) > DIVERGENCE_F_THRESHOLD:
diverged = True
err = (f"silent divergence: |f| = {detail[-1]['f']:.3g} "
f"> {DIVERGENCE_F_THRESHOLD:.0e} at step {t}")
break
except SandboxError as e:
diverged = True
err = str(e)
    if diverged or not detail:
        summary = {
            "converged": False, "diverged": True, "error": err,
            # On silent divergence the exploded value IS the diagnostic
            # (see module docstring), so surface it instead of None.
            "final_f": detail[-1]["f"] if detail else None,
            "initial_f": float(ls.f(x0)),
            "step_of_min": None, "min_f": None,
        }
    else:
        fs = [d["f"] for d in detail]
        f0 = float(ls.f(x0))  # evaluate once; reused for both fields below
        step_of_min = int(np.argmin(fs))
        summary = {
            "converged": bool(fs[-1] < 0.1 * f0),  # final f below 10% of f(x0)
            "diverged": False, "error": None,
            "final_f": fs[-1], "initial_f": f0,
            "step_of_min": step_of_min, "min_f": min(fs),
        }
return {"summary": summary, "detail": detail}
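

if __name__ == "__main__":
    # Minimal smoke test with duck-typed stand-ins (illustrative only; real
    # runs use Landscape / CompiledOptimizer from the sibling modules).
    class _Quad:
        """Convex quadratic f(x) = ||x||^2 -- the easiest landscape."""
        dim = 2

        def f(self, x):
            return float(np.sum(np.asarray(x) ** 2))

        def grad(self, x):
            return 2.0 * np.asarray(x)

    class _GD:
        """Plain gradient descent with a fixed, conservative learning rate."""

        def step(self, x, fv, g):
            return x - 0.1 * g

    print(auto_test_draft(_GD(), _Quad(), steps=20)["summary"])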