Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- arena.py +56 -6
- frontend/src/components/TopBar.jsx +10 -17
- server/api_routes.py +37 -5
arena.py
CHANGED
|
@@ -1,7 +1,39 @@
|
|
| 1 |
-
"""
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
"""
|
| 6 |
|
| 7 |
from dataclasses import dataclass
|
|
@@ -102,12 +134,24 @@ def run_arena(optimizer: CompiledOptimizer, ls: Landscape,
|
|
| 102 |
)
|
| 103 |
|
| 104 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
def auto_test_draft(optimizer: CompiledOptimizer, ls: Landscape,
|
| 106 |
seed: int = 0, steps: int = 20, init_scale: float = 0.5) -> dict:
|
| 107 |
"""Single-seed quick test used at draft() time.
|
| 108 |
|
| 109 |
-
|
| 110 |
-
per-step
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
"""
|
| 112 |
rng = np.random.default_rng(seed)
|
| 113 |
x = rng.normal(0.0, init_scale, size=ls.dim)
|
|
@@ -128,6 +172,12 @@ def auto_test_draft(optimizer: CompiledOptimizer, ls: Landscape,
|
|
| 128 |
"t": t, "x": x.tolist(), "f": float(ls.f(x)),
|
| 129 |
"grad_norm": gn, "update_norm": update_norm, "step_size_eff": step_size,
|
| 130 |
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
except SandboxError as e:
|
| 132 |
diverged = True
|
| 133 |
err = str(e)
|
|
|
|
| 1 |
+
"""Arena — run a compiled OptCoder submission on a landscape and score it.
|
| 2 |
+
|
| 3 |
+
Two entry points:
|
| 4 |
+
|
| 5 |
+
1. **`run_arena(opt, landscape, seeds, steps)`** — Phase-D full evaluation.
|
| 6 |
+
Run the committed optimizer for ``steps`` iterations from each seed's fresh
|
| 7 |
+
random init, collect per-seed trajectories, and return an ``ArenaResult``.
|
| 8 |
+
Used at `commit` time to produce the terminal reward.
|
| 9 |
+
|
| 10 |
+
2. **`auto_test_draft(opt, landscape, seed, steps)`** — lightweight per-draft
|
| 11 |
+
test. Single fixed seed, 20 steps by default. Result is a compact summary
|
| 12 |
+
surfaced to the LLM in the next observation so it can decide whether to
|
| 13 |
+
refine or commit.
|
| 14 |
+
|
| 15 |
+
Returned summary from ``auto_test_draft`` has these fields (consumed by the
|
| 16 |
+
LLM via the observation, and by the demo UI):
|
| 17 |
+
|
| 18 |
+
converged True if final_f < 1% of initial_f (a real convergence signal,
|
| 19 |
+
not just "code didn't raise").
|
| 20 |
+
diverged True if code raised a SandboxError (NaN, shape mismatch,
|
| 21 |
+
timeout, Python exception) OR if final_f blew up to an
|
| 22 |
+
astronomical value while the optimizer returned finite arrays
|
| 23 |
+
(a.k.a. "silent divergence" — e.g. LR too high, f grows
|
| 24 |
+
geometrically but no step() call errors).
|
| 25 |
+
error The SandboxError message if one was raised, else None.
|
| 26 |
+
initial_f f(x0) — the value before any step was taken.
|
| 27 |
+
final_f f(x_N) — the value at step N. This is the single most
|
| 28 |
+
important diagnostic; an agent looking at final_f=5e13 knows
|
| 29 |
+
its optimizer exploded even though the code compiled.
|
| 30 |
+
min_f min over all visited points (catches transient improvement
|
| 31 |
+
that was then lost).
|
| 32 |
+
step_of_min index (0..N-1) of the step at which min_f was achieved.
|
| 33 |
+
|
| 34 |
+
A draft with ``diverged=True`` always scores ``r_eval_failures > 0`` when
|
| 35 |
+
committed — the ``compute_optcoder_reward`` pipeline treats exploded final_f
|
| 36 |
+
the same as a hard crash.
|
| 37 |
"""
|
| 38 |
|
| 39 |
from dataclasses import dataclass
|
|
|
|
| 134 |
)
|
| 135 |
|
| 136 |
|
| 137 |
+
#: final_f above this (absolute value) is treated as silent divergence even
|
| 138 |
+
#: if every step() call returned finite values. Picked so genuinely bad
|
| 139 |
+
#: convergence on stiff landscapes (f around 1e6) still counts as a valid
|
| 140 |
+
#: run, but runaway growth (f around 1e10+) gets flagged.
|
| 141 |
+
DIVERGENCE_F_THRESHOLD = 1e10
|
| 142 |
+
|
| 143 |
+
|
| 144 |
def auto_test_draft(optimizer: CompiledOptimizer, ls: Landscape,
|
| 145 |
seed: int = 0, steps: int = 20, init_scale: float = 0.5) -> dict:
|
| 146 |
"""Single-seed quick test used at draft() time.
|
| 147 |
|
| 148 |
+
Runs the optimizer for ``steps`` steps from a fixed seed, returning a
|
| 149 |
+
summary (for the LLM) + full per-step detail (for ``inspect`` follow-ups).
|
| 150 |
+
|
| 151 |
+
Silent-divergence detection: if the code raises, we flag diverged=True as
|
| 152 |
+
usual. But we ALSO flag diverged when the optimizer returned finite arrays
|
| 153 |
+
yet the landscape's f(x) blew past ``DIVERGENCE_F_THRESHOLD`` — the common
|
| 154 |
+
"LR too high, f grows geometrically, no errors" case.
|
| 155 |
"""
|
| 156 |
rng = np.random.default_rng(seed)
|
| 157 |
x = rng.normal(0.0, init_scale, size=ls.dim)
|
|
|
|
| 172 |
"t": t, "x": x.tolist(), "f": float(ls.f(x)),
|
| 173 |
"grad_norm": gn, "update_norm": update_norm, "step_size_eff": step_size,
|
| 174 |
})
|
| 175 |
+
# Silent-divergence guard — f exploded even though step() didn't raise
|
| 176 |
+
if abs(detail[-1]["f"]) > DIVERGENCE_F_THRESHOLD:
|
| 177 |
+
diverged = True
|
| 178 |
+
err = (f"silent divergence: |f| = {detail[-1]['f']:.3g} "
|
| 179 |
+
f"> {DIVERGENCE_F_THRESHOLD:.0e} at step {t}")
|
| 180 |
+
break
|
| 181 |
except SandboxError as e:
|
| 182 |
diverged = True
|
| 183 |
err = str(e)
|
frontend/src/components/TopBar.jsx
CHANGED
|
@@ -15,23 +15,16 @@ export function TopBar() {
|
|
| 15 |
</div>
|
| 16 |
</div>
|
| 17 |
<nav className="flex items-center gap-1">
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
className="text-sm text-muted hover:text-ink px-3 py-1.5 rounded-md
|
| 29 |
-
hover:bg-surface border border-transparent
|
| 30 |
-
hover:border-border transition-colors"
|
| 31 |
-
>
|
| 32 |
-
{label}
|
| 33 |
-
</a>
|
| 34 |
-
))}
|
| 35 |
</nav>
|
| 36 |
</header>
|
| 37 |
)
|
|
|
|
| 15 |
</div>
|
| 16 |
</div>
|
| 17 |
<nav className="flex items-center gap-1">
|
| 18 |
+
<a
|
| 19 |
+
href="https://huggingface.co/spaces/mnawfal29/landscapeforge"
|
| 20 |
+
target="_blank"
|
| 21 |
+
rel="noreferrer"
|
| 22 |
+
className="text-sm text-muted hover:text-ink px-3 py-1.5 rounded-md
|
| 23 |
+
hover:bg-surface border border-transparent
|
| 24 |
+
hover:border-border transition-colors"
|
| 25 |
+
>
|
| 26 |
+
Space
|
| 27 |
+
</a>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
</nav>
|
| 29 |
</header>
|
| 30 |
)
|
server/api_routes.py
CHANGED
|
@@ -375,6 +375,37 @@ def api_arena(req: ArenaReq):
|
|
| 375 |
title=f"{req.template} — your optimizer vs tuned Adam")
|
| 376 |
|
| 377 |
bk = reward.breakdown
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 378 |
return {
|
| 379 |
"contour": contour or _empty_fig(f"{req.template} · dim={dim}\nContour is 2-D only"),
|
| 380 |
"progress": _bar_fig(
|
|
@@ -388,12 +419,13 @@ def api_arena(req: ArenaReq):
|
|
| 388 |
"summary_md": (
|
| 389 |
f"<h3>Results</h3>"
|
| 390 |
f"<ul>"
|
| 391 |
-
f"<li>
|
| 392 |
-
f"<li>Tuned Adam
|
| 393 |
-
f" (lr=<code>{best_lr:g}</code>)</li>"
|
| 394 |
-
f"<li>Speedup vs Adam: <code>{bk.get('speedup_vs_adam', 0):.3g}×</code></li>"
|
| 395 |
f"<li>Your crash fraction: <code>{user_arena.crash_fraction:.0%}</code></li>"
|
| 396 |
-
f"<li><strong>Total reward: <code>{reward.r_total:+.3f}</code></strong>
|
|
|
|
|
|
|
|
|
|
| 397 |
f"</ul>"
|
| 398 |
),
|
| 399 |
}
|
|
|
|
| 375 |
title=f"{req.template} — your optimizer vs tuned Adam")
|
| 376 |
|
| 377 |
bk = reward.breakdown
|
| 378 |
+
speedup = bk.get("speedup_vs_adam", 0.0)
|
| 379 |
+
|
| 380 |
+
# Narrate the reward decomposition so users aren't confused when reward
|
| 381 |
+
# is positive despite speedup≈1× (r_convergence + r_robustness contribute
|
| 382 |
+
# independently of beating Adam; see §9.1 of LANDSCAPEFORGE_DESIGN.md).
|
| 383 |
+
parts = []
|
| 384 |
+
if abs(bk["r_regret"] * 1.0) > 0.01:
|
| 385 |
+
parts.append(f"regret {bk['r_regret']*1.0:+.3f}")
|
| 386 |
+
if abs(bk["r_convergence"] * 0.3) > 0.01:
|
| 387 |
+
parts.append(f"convergence {bk['r_convergence']*0.3:+.3f}")
|
| 388 |
+
if abs(bk["r_robustness"] * 0.3) > 0.01:
|
| 389 |
+
parts.append(f"robustness {bk['r_robustness']*0.3:+.3f}")
|
| 390 |
+
if abs(bk["r_novelty"] * 0.1) > 0.01:
|
| 391 |
+
parts.append(f"novelty {bk['r_novelty']*0.1:+.3f}")
|
| 392 |
+
if abs(bk["r_budget"] * 0.05) > 0.01:
|
| 393 |
+
parts.append(f"budget {-bk['r_budget']*0.05:+.3f}")
|
| 394 |
+
if abs(bk["r_eval_failures"] * 0.5) > 0.01:
|
| 395 |
+
parts.append(f"eval {-bk['r_eval_failures']*0.5:+.3f}")
|
| 396 |
+
|
| 397 |
+
# Speedup phrasing — avoid nonsense like "0.00×" when diverged
|
| 398 |
+
my_p, adam_p = user_arena.mean_progress, adam_arena.mean_progress
|
| 399 |
+
if my_p < 0:
|
| 400 |
+
speedup_line = "your optimizer <strong>diverged</strong> (f moved uphill)"
|
| 401 |
+
elif adam_p <= 0:
|
| 402 |
+
speedup_line = (f"Adam made no progress on this landscape; "
|
| 403 |
+
f"your progress: <code>{my_p:.3g}</code>")
|
| 404 |
+
else:
|
| 405 |
+
speedup_line = (f"Speedup vs Adam: <code>{speedup:.3g}×</code> "
|
| 406 |
+
f"(your descent <code>{my_p:.3g}</code>, Adam's "
|
| 407 |
+
f"<code>{adam_p:.3g}</code>)")
|
| 408 |
+
|
| 409 |
return {
|
| 410 |
"contour": contour or _empty_fig(f"{req.template} · dim={dim}\nContour is 2-D only"),
|
| 411 |
"progress": _bar_fig(
|
|
|
|
| 419 |
"summary_md": (
|
| 420 |
f"<h3>Results</h3>"
|
| 421 |
f"<ul>"
|
| 422 |
+
f"<li>{speedup_line}</li>"
|
| 423 |
+
f"<li>Tuned Adam LR: <code>{best_lr:g}</code></li>"
|
|
|
|
|
|
|
| 424 |
f"<li>Your crash fraction: <code>{user_arena.crash_fraction:.0%}</code></li>"
|
| 425 |
+
f"<li><strong>Total reward: <code>{reward.r_total:+.3f}</code></strong>"
|
| 426 |
+
+ (f"<span style='color:#b5ada0'> "
|
| 427 |
+
f"= {' + '.join(parts)}</span>" if parts else "")
|
| 428 |
+
+ "</li>"
|
| 429 |
f"</ul>"
|
| 430 |
),
|
| 431 |
}
|