Spaces:

mnawfal29
/

landscapeforge

Sleeping

App Files Files Community

mnawfal29 committed 14 days ago

Commit

901a0ed

verified ·

1 Parent(s): 4a535ea

Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

arena.py +56 -6
frontend/src/components/TopBar.jsx +10 -17
server/api_routes.py +37 -5

arena.py CHANGED Viewed

@@ -1,7 +1,39 @@
-"""Phase-D runner: run a compiled optimizer for N steps from K fresh seeds.
-Computes per-run final regret and aggregate stats used by the reward module.
-Also handles auto-test during draft actions (single fixed seed, fewer steps).
 """
 from dataclasses import dataclass
@@ -102,12 +134,24 @@ def run_arena(optimizer: CompiledOptimizer, ls: Landscape,
     )
 def auto_test_draft(optimizer: CompiledOptimizer, ls: Landscape,
                     seed: int = 0, steps: int = 20, init_scale: float = 0.5) -> dict:
     """Single-seed quick test used at draft() time.
-    Returns a lightweight summary (not the full trajectory) plus a detailed
-    per-step record that `inspect` can later dig into.
     """
     rng = np.random.default_rng(seed)
     x = rng.normal(0.0, init_scale, size=ls.dim)
@@ -128,6 +172,12 @@ def auto_test_draft(optimizer: CompiledOptimizer, ls: Landscape,
                 "t": t, "x": x.tolist(), "f": float(ls.f(x)),
                 "grad_norm": gn, "update_norm": update_norm, "step_size_eff": step_size,
             })
     except SandboxError as e:
         diverged = True
         err = str(e)

+"""Arena — run a compiled OptCoder submission on a landscape and score it.
+Two entry points:
+1. **`run_arena(opt, landscape, seeds, steps)`** — Phase-D full evaluation.
+   Run the committed optimizer for ``steps`` iterations from each seed's fresh
+   random init, collect per-seed trajectories, and return an ``ArenaResult``.
+   Used at `commit` time to produce the terminal reward.
+2. **`auto_test_draft(opt, landscape, seed, steps)`** — lightweight per-draft
+   test. Single fixed seed, 20 steps by default. Result is a compact summary
+   surfaced to the LLM in the next observation so it can decide whether to
+   refine or commit.
+Returned summary from ``auto_test_draft`` has these fields (consumed by the
+LLM via the observation, and by the demo UI):
+    converged    True if final_f < 1% of initial_f (a real convergence signal,
+                 not just "code didn't raise").
+    diverged     True if code raised a SandboxError (NaN, shape mismatch,
+                 timeout, Python exception) OR if final_f blew up to an
+                 astronomical value while the optimizer returned finite arrays
+                 (a.k.a. "silent divergence" — e.g. LR too high, f grows
+                 geometrically but no step() call errors).
+    error        The SandboxError message if one was raised, else None.
+    initial_f    f(x0) — the value before any step was taken.
+    final_f      f(x_N) — the value at step N. This is the single most
+                 important diagnostic; an agent looking at final_f=5e13 knows
+                 its optimizer exploded even though the code compiled.
+    min_f        min over all visited points (catches transient improvement
+                 that was then lost).
+    step_of_min  index (0..N-1) of the step at which min_f was achieved.
+A draft with ``diverged=True`` always scores ``r_eval_failures > 0`` when
+committed — the ``compute_optcoder_reward`` pipeline treats exploded final_f
+the same as a hard crash.
 """
 from dataclasses import dataclass
     )
+#: final_f above this (absolute value) is treated as silent divergence even
+#: if every step() call returned finite values. Picked so genuinely bad
+#: convergence on stiff landscapes (f around 1e6) still counts as a valid
+#: run, but runaway growth (f around 1e10+) gets flagged.
+DIVERGENCE_F_THRESHOLD = 1e10
 def auto_test_draft(optimizer: CompiledOptimizer, ls: Landscape,
                     seed: int = 0, steps: int = 20, init_scale: float = 0.5) -> dict:
     """Single-seed quick test used at draft() time.
+    Runs the optimizer for ``steps`` steps from a fixed seed, returning a
+    summary (for the LLM) + full per-step detail (for ``inspect`` follow-ups).
+    Silent-divergence detection: if the code raises, we flag diverged=True as
+    usual. But we ALSO flag diverged when the optimizer returned finite arrays
+    yet the landscape's f(x) blew past ``DIVERGENCE_F_THRESHOLD`` — the common
+    "LR too high, f grows geometrically, no errors" case.
     """
     rng = np.random.default_rng(seed)
     x = rng.normal(0.0, init_scale, size=ls.dim)
                 "t": t, "x": x.tolist(), "f": float(ls.f(x)),
                 "grad_norm": gn, "update_norm": update_norm, "step_size_eff": step_size,
             })
+            # Silent-divergence guard — f exploded even though step() didn't raise
+            if abs(detail[-1]["f"]) > DIVERGENCE_F_THRESHOLD:
+                diverged = True
+                err = (f"silent divergence: |f| = {detail[-1]['f']:.3g} "
+                        f"> {DIVERGENCE_F_THRESHOLD:.0e} at step {t}")
+                break
     except SandboxError as e:
         diverged = True
         err = str(e)

frontend/src/components/TopBar.jsx CHANGED Viewed

@@ -15,23 +15,16 @@ export function TopBar() {
         </div>
       </div>
       <nav className="flex items-center gap-1">
-        {[
-          ['Space', 'https://huggingface.co/spaces/mnawfal29/landscapeforge'],
-          ['API schema', '/schema'],
-          ['OpenAPI', '/openapi.json'],
-        ].map(([label, href]) => (
-          <a
-            key={label}
-            href={href}
-            target="_blank"
-            rel="noreferrer"
-            className="text-sm text-muted hover:text-ink px-3 py-1.5 rounded-md
-                       hover:bg-surface border border-transparent
-                       hover:border-border transition-colors"
-          >
-            {label}
-          </a>
-        ))}
       </nav>
     </header>
   )

         </div>
       </div>
       <nav className="flex items-center gap-1">
+        <a
+          href="https://huggingface.co/spaces/mnawfal29/landscapeforge"
+          target="_blank"
+          rel="noreferrer"
+          className="text-sm text-muted hover:text-ink px-3 py-1.5 rounded-md
+                     hover:bg-surface border border-transparent
+                     hover:border-border transition-colors"
+        >
+          Space
+        </a>
       </nav>
     </header>
   )

server/api_routes.py CHANGED Viewed

@@ -375,6 +375,37 @@ def api_arena(req: ArenaReq):
                                  title=f"{req.template} — your optimizer vs tuned Adam")
     bk = reward.breakdown
     return {
         "contour":   contour or _empty_fig(f"{req.template} · dim={dim}\nContour is 2-D only"),
         "progress":  _bar_fig(
@@ -388,12 +419,13 @@ def api_arena(req: ArenaReq):
         "summary_md": (
             f"<h3>Results</h3>"
             f"<ul>"
-            f"<li>Your mean progress: <code>{user_arena.mean_progress:.4g}</code></li>"
-            f"<li>Tuned Adam progress: <code>{adam_arena.mean_progress:.4g}</code>"
-            f" (lr=<code>{best_lr:g}</code>)</li>"
-            f"<li>Speedup vs Adam: <code>{bk.get('speedup_vs_adam', 0):.3g}×</code></li>"
             f"<li>Your crash fraction: <code>{user_arena.crash_fraction:.0%}</code></li>"
-            f"<li><strong>Total reward: <code>{reward.r_total:+.3f}</code></strong></li>"
             f"</ul>"
         ),
     }

                                  title=f"{req.template} — your optimizer vs tuned Adam")
     bk = reward.breakdown
+    speedup = bk.get("speedup_vs_adam", 0.0)
+    # Narrate the reward decomposition so users aren't confused when reward
+    # is positive despite speedup≈1× (r_convergence + r_robustness contribute
+    # independently of beating Adam; see §9.1 of LANDSCAPEFORGE_DESIGN.md).
+    parts = []
+    if abs(bk["r_regret"] * 1.0) > 0.01:
+        parts.append(f"regret {bk['r_regret']*1.0:+.3f}")
+    if abs(bk["r_convergence"] * 0.3) > 0.01:
+        parts.append(f"convergence {bk['r_convergence']*0.3:+.3f}")
+    if abs(bk["r_robustness"] * 0.3) > 0.01:
+        parts.append(f"robustness {bk['r_robustness']*0.3:+.3f}")
+    if abs(bk["r_novelty"] * 0.1) > 0.01:
+        parts.append(f"novelty {bk['r_novelty']*0.1:+.3f}")
+    if abs(bk["r_budget"] * 0.05) > 0.01:
+        parts.append(f"budget {-bk['r_budget']*0.05:+.3f}")
+    if abs(bk["r_eval_failures"] * 0.5) > 0.01:
+        parts.append(f"eval {-bk['r_eval_failures']*0.5:+.3f}")
+    # Speedup phrasing — avoid nonsense like "0.00×" when diverged
+    my_p, adam_p = user_arena.mean_progress, adam_arena.mean_progress
+    if my_p < 0:
+        speedup_line = "your optimizer <strong>diverged</strong> (f moved uphill)"
+    elif adam_p <= 0:
+        speedup_line = (f"Adam made no progress on this landscape; "
+                         f"your progress: <code>{my_p:.3g}</code>")
+    else:
+        speedup_line = (f"Speedup vs Adam: <code>{speedup:.3g}×</code> "
+                         f"(your descent <code>{my_p:.3g}</code>, Adam's "
+                         f"<code>{adam_p:.3g}</code>)")
     return {
         "contour":   contour or _empty_fig(f"{req.template} · dim={dim}\nContour is 2-D only"),
         "progress":  _bar_fig(
         "summary_md": (
             f"<h3>Results</h3>"
             f"<ul>"
+            f"<li>{speedup_line}</li>"
+            f"<li>Tuned Adam LR: <code>{best_lr:g}</code></li>"
             f"<li>Your crash fraction: <code>{user_arena.crash_fraction:.0%}</code></li>"
+            f"<li><strong>Total reward: <code>{reward.r_total:+.3f}</code></strong>"
+            + (f"<span style='color:#b5ada0'> "
+               f"= {' + '.join(parts)}</span>" if parts else "")
+            + "</li>"
             f"</ul>"
         ),
     }