mnawfal29 committed on
Commit
901a0ed
·
verified ·
1 Parent(s): 4a535ea

Upload folder using huggingface_hub

Browse files
arena.py CHANGED
@@ -1,7 +1,39 @@
1
- """Phase-D runner: run a compiled optimizer for N steps from K fresh seeds.
2
-
3
- Computes per-run final regret and aggregate stats used by the reward module.
4
- Also handles auto-test during draft actions (single fixed seed, fewer steps).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  """
6
 
7
  from dataclasses import dataclass
@@ -102,12 +134,24 @@ def run_arena(optimizer: CompiledOptimizer, ls: Landscape,
102
  )
103
 
104
 
 
 
 
 
 
 
 
105
  def auto_test_draft(optimizer: CompiledOptimizer, ls: Landscape,
106
  seed: int = 0, steps: int = 20, init_scale: float = 0.5) -> dict:
107
  """Single-seed quick test used at draft() time.
108
 
109
- Returns a lightweight summary (not the full trajectory) plus a detailed
110
- per-step record that `inspect` can later dig into.
 
 
 
 
 
111
  """
112
  rng = np.random.default_rng(seed)
113
  x = rng.normal(0.0, init_scale, size=ls.dim)
@@ -128,6 +172,12 @@ def auto_test_draft(optimizer: CompiledOptimizer, ls: Landscape,
128
  "t": t, "x": x.tolist(), "f": float(ls.f(x)),
129
  "grad_norm": gn, "update_norm": update_norm, "step_size_eff": step_size,
130
  })
 
 
 
 
 
 
131
  except SandboxError as e:
132
  diverged = True
133
  err = str(e)
 
1
+ """Arena run a compiled OptCoder submission on a landscape and score it.
2
+
3
+ Two entry points:
4
+
5
+ 1. **`run_arena(opt, landscape, seeds, steps)`** — Phase-D full evaluation.
6
+ Run the committed optimizer for ``steps`` iterations from each seed's fresh
7
+ random init, collect per-seed trajectories, and return an ``ArenaResult``.
8
+ Used at `commit` time to produce the terminal reward.
9
+
10
+ 2. **`auto_test_draft(opt, landscape, seed, steps)`** — lightweight per-draft
11
+ test. Single fixed seed, 20 steps by default. Result is a compact summary
12
+ surfaced to the LLM in the next observation so it can decide whether to
13
+ refine or commit.
14
+
15
+ Returned summary from ``auto_test_draft`` has these fields (consumed by the
16
+ LLM via the observation, and by the demo UI):
17
+
18
+ converged True if final_f < 1% of initial_f (a real convergence signal,
19
+ not just "code didn't raise").
20
+ diverged True if code raised a SandboxError (NaN, shape mismatch,
21
+ timeout, Python exception) OR if final_f blew up to an
22
+ astronomical value while the optimizer returned finite arrays
23
+ (a.k.a. "silent divergence" — e.g. LR too high, f grows
24
+ geometrically but no step() call errors).
25
+ error The SandboxError message if one was raised, else None.
26
+ initial_f f(x0) — the value before any step was taken.
27
+ final_f f(x_N) — the value at step N. This is the single most
28
+ important diagnostic; an agent looking at final_f=5e13 knows
29
+ its optimizer exploded even though the code compiled.
30
+ min_f min over all visited points (catches transient improvement
31
+ that was then lost).
32
+ step_of_min index (0..N-1) of the step at which min_f was achieved.
33
+
34
+ A draft with ``diverged=True`` always scores ``r_eval_failures > 0`` when
35
+ committed — the ``compute_optcoder_reward`` pipeline treats exploded final_f
36
+ the same as a hard crash.
37
  """
38
 
39
  from dataclasses import dataclass
 
134
  )
135
 
136
 
137
+ #: final_f above this (absolute value) is treated as silent divergence even
138
+ #: if every step() call returned finite values. Picked so genuinely bad
139
+ #: convergence on stiff landscapes (f around 1e6) still counts as a valid
140
+ #: run, but runaway growth (f around 1e10+) gets flagged.
141
+ DIVERGENCE_F_THRESHOLD = 1e10
142
+
143
+
144
  def auto_test_draft(optimizer: CompiledOptimizer, ls: Landscape,
145
  seed: int = 0, steps: int = 20, init_scale: float = 0.5) -> dict:
146
  """Single-seed quick test used at draft() time.
147
 
148
+ Runs the optimizer for ``steps`` steps from a fixed seed, returning a
149
+ summary (for the LLM) + full per-step detail (for ``inspect`` follow-ups).
150
+
151
+ Silent-divergence detection: if the code raises, we flag diverged=True as
152
+ usual. But we ALSO flag diverged when the optimizer returned finite arrays
153
+ yet the landscape's f(x) blew past ``DIVERGENCE_F_THRESHOLD`` — the common
154
+ "LR too high, f grows geometrically, no errors" case.
155
  """
156
  rng = np.random.default_rng(seed)
157
  x = rng.normal(0.0, init_scale, size=ls.dim)
 
172
  "t": t, "x": x.tolist(), "f": float(ls.f(x)),
173
  "grad_norm": gn, "update_norm": update_norm, "step_size_eff": step_size,
174
  })
175
+ # Silent-divergence guard — f exploded even though step() didn't raise
176
+ if abs(detail[-1]["f"]) > DIVERGENCE_F_THRESHOLD:
177
+ diverged = True
178
+ err = (f"silent divergence: |f| = {detail[-1]['f']:.3g} "
179
+ f"> {DIVERGENCE_F_THRESHOLD:.0e} at step {t}")
180
+ break
181
  except SandboxError as e:
182
  diverged = True
183
  err = str(e)
frontend/src/components/TopBar.jsx CHANGED
@@ -15,23 +15,16 @@ export function TopBar() {
15
  </div>
16
  </div>
17
  <nav className="flex items-center gap-1">
18
- {[
19
- ['Space', 'https://huggingface.co/spaces/mnawfal29/landscapeforge'],
20
- ['API schema', '/schema'],
21
- ['OpenAPI', '/openapi.json'],
22
- ].map(([label, href]) => (
23
- <a
24
- key={label}
25
- href={href}
26
- target="_blank"
27
- rel="noreferrer"
28
- className="text-sm text-muted hover:text-ink px-3 py-1.5 rounded-md
29
- hover:bg-surface border border-transparent
30
- hover:border-border transition-colors"
31
- >
32
- {label}
33
- </a>
34
- ))}
35
  </nav>
36
  </header>
37
  )
 
15
  </div>
16
  </div>
17
  <nav className="flex items-center gap-1">
18
+ <a
19
+ href="https://huggingface.co/spaces/mnawfal29/landscapeforge"
20
+ target="_blank"
21
+ rel="noreferrer"
22
+ className="text-sm text-muted hover:text-ink px-3 py-1.5 rounded-md
23
+ hover:bg-surface border border-transparent
24
+ hover:border-border transition-colors"
25
+ >
26
+ Space
27
+ </a>
 
 
 
 
 
 
 
28
  </nav>
29
  </header>
30
  )
server/api_routes.py CHANGED
@@ -375,6 +375,37 @@ def api_arena(req: ArenaReq):
375
  title=f"{req.template} — your optimizer vs tuned Adam")
376
 
377
  bk = reward.breakdown
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
378
  return {
379
  "contour": contour or _empty_fig(f"{req.template} · dim={dim}\nContour is 2-D only"),
380
  "progress": _bar_fig(
@@ -388,12 +419,13 @@ def api_arena(req: ArenaReq):
388
  "summary_md": (
389
  f"<h3>Results</h3>"
390
  f"<ul>"
391
- f"<li>Your mean progress: <code>{user_arena.mean_progress:.4g}</code></li>"
392
- f"<li>Tuned Adam progress: <code>{adam_arena.mean_progress:.4g}</code>"
393
- f" (lr=<code>{best_lr:g}</code>)</li>"
394
- f"<li>Speedup vs Adam: <code>{bk.get('speedup_vs_adam', 0):.3g}×</code></li>"
395
  f"<li>Your crash fraction: <code>{user_arena.crash_fraction:.0%}</code></li>"
396
- f"<li><strong>Total reward: <code>{reward.r_total:+.3f}</code></strong></li>"
 
 
 
397
  f"</ul>"
398
  ),
399
  }
 
375
  title=f"{req.template} — your optimizer vs tuned Adam")
376
 
377
  bk = reward.breakdown
378
+ speedup = bk.get("speedup_vs_adam", 0.0)
379
+
380
+ # Narrate the reward decomposition so users aren't confused when reward
381
+ # is positive despite speedup≈1× (r_convergence + r_robustness contribute
382
+ # independently of beating Adam; see §9.1 of LANDSCAPEFORGE_DESIGN.md).
383
+ parts = []
384
+ if abs(bk["r_regret"] * 1.0) > 0.01:
385
+ parts.append(f"regret {bk['r_regret']*1.0:+.3f}")
386
+ if abs(bk["r_convergence"] * 0.3) > 0.01:
387
+ parts.append(f"convergence {bk['r_convergence']*0.3:+.3f}")
388
+ if abs(bk["r_robustness"] * 0.3) > 0.01:
389
+ parts.append(f"robustness {bk['r_robustness']*0.3:+.3f}")
390
+ if abs(bk["r_novelty"] * 0.1) > 0.01:
391
+ parts.append(f"novelty {bk['r_novelty']*0.1:+.3f}")
392
+ if abs(bk["r_budget"] * 0.05) > 0.01:
393
+ parts.append(f"budget {-bk['r_budget']*0.05:+.3f}")
394
+ if abs(bk["r_eval_failures"] * 0.5) > 0.01:
395
+ parts.append(f"eval {-bk['r_eval_failures']*0.5:+.3f}")
396
+
397
+ # Speedup phrasing — avoid nonsense like "0.00×" when diverged
398
+ my_p, adam_p = user_arena.mean_progress, adam_arena.mean_progress
399
+ if my_p < 0:
400
+ speedup_line = "your optimizer <strong>diverged</strong> (f moved uphill)"
401
+ elif adam_p <= 0:
402
+ speedup_line = (f"Adam made no progress on this landscape; "
403
+ f"your progress: <code>{my_p:.3g}</code>")
404
+ else:
405
+ speedup_line = (f"Speedup vs Adam: <code>{speedup:.3g}×</code> "
406
+ f"(your descent <code>{my_p:.3g}</code>, Adam's "
407
+ f"<code>{adam_p:.3g}</code>)")
408
+
409
  return {
410
  "contour": contour or _empty_fig(f"{req.template} · dim={dim}\nContour is 2-D only"),
411
  "progress": _bar_fig(
 
419
  "summary_md": (
420
  f"<h3>Results</h3>"
421
  f"<ul>"
422
+ f"<li>{speedup_line}</li>"
423
+ f"<li>Tuned Adam LR: <code>{best_lr:g}</code></li>"
 
 
424
  f"<li>Your crash fraction: <code>{user_arena.crash_fraction:.0%}</code></li>"
425
+ f"<li><strong>Total reward: <code>{reward.r_total:+.3f}</code></strong>"
426
+ + (f"<span style='color:#b5ada0'> "
427
+ f"= {' + '.join(parts)}</span>" if parts else "")
428
+ + "</li>"
429
  f"</ul>"
430
  ),
431
  }