File size: 8,212 Bytes
901a0ed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b0b140b
 
 
 
 
 
962ad43
 
 
 
 
 
b0b140b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
901a0ed
 
 
 
 
 
 
b0b140b
 
 
 
901a0ed
 
 
 
 
 
 
b0b140b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
901a0ed
 
 
 
 
 
b0b140b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
"""Arena β€” run a compiled OptCoder submission on a landscape and score it.

Two entry points:

1. **`run_arena(opt, landscape, seeds, steps)`** β€” Phase-D full evaluation.
   Run the committed optimizer for ``steps`` iterations from each seed's fresh
   random init, collect per-seed trajectories, and return an ``ArenaResult``.
   Used at `commit` time to produce the terminal reward.

2. **`auto_test_draft(opt, landscape, seed, steps)`** β€” lightweight per-draft
   test. Single fixed seed, 20 steps by default. Result is a compact summary
   surfaced to the LLM in the next observation so it can decide whether to
   refine or commit.

Returned summary from ``auto_test_draft`` has these fields (consumed by the
LLM via the observation, and by the demo UI):

    converged    True if final_f < 10% of initial_f (a real convergence
                 signal, not just "code didn't raise").
    diverged     True if code raised a SandboxError (NaN, shape mismatch,
                 timeout, Python exception) OR if final_f blew up to an
                 astronomical value while the optimizer returned finite arrays
                 (a.k.a. "silent divergence" β€” e.g. LR too high, f grows
                 geometrically but no step() call errors).
    error        The SandboxError message if one was raised, else None.
    initial_f    f(x0) β€” the value before any step was taken.
    final_f      f(x_N) β€” the value at step N. This is the single most
                 important diagnostic; an agent looking at final_f=5e13 knows
                 its optimizer exploded even though the code compiled.
    min_f        min over all visited points (catches transient improvement
                 that was then lost).
    step_of_min  index (0..N-1) of the step at which min_f was achieved.

A draft with ``diverged=True`` always scores ``r_eval_failures > 0`` when
committed β€” the ``compute_optcoder_reward`` pipeline treats exploded final_f
the same as a hard crash.
"""

from dataclasses import dataclass

import numpy as np

try:
    from .landscapes import Landscape
    from .sandbox import CompiledOptimizer, SandboxError
except ImportError:  # flat layout (HF Space container)
    from landscapes import Landscape                       # type: ignore
    from sandbox import CompiledOptimizer, SandboxError    # type: ignore


@dataclass
class ArenaResult:
    """Per-seed outcomes of one full arena run of a compiled optimizer."""

    initial_values: list[float]   # per-seed f(x_0)
    final_values: list[float]     # per-seed f(x_N); NaN if crashed
    crashed: list[bool]           # per-seed crash flags
    trajectories: list[list[dict]]  # per-seed trajectories (may be empty)

    @property
    def mean_progress(self) -> float:
        """Mean descent f_initial - f_final over all seeds.

        Positive = optimizer descended; 0 = stayed put; negative = uphill.
        A crashed seed, or one whose final value is non-finite, contributes
        0 progress — the conservative choice.
        """
        deltas = [
            0.0 if bad or not np.isfinite(f_end) else f_start - f_end
            for f_start, f_end, bad in zip(self.initial_values,
                                           self.final_values, self.crashed)
        ]
        return float(np.mean(deltas)) if deltas else 0.0

    @property
    def mean_initial_scale(self) -> float:
        """Mean |initial f| over finite seeds; 1.0 when none are finite.

        Serves as a denominator floor for reward computation when the
        baseline makes near-zero progress (rare, e.g. on plateaus).
        """
        magnitudes = [abs(v) for v in self.initial_values if np.isfinite(v)]
        if not magnitudes:
            return 1.0
        return float(np.mean(magnitudes))

    @property
    def crash_fraction(self) -> float:
        """Fraction of seeds that crashed; 0.0 when there are no seeds."""
        if not self.crashed:
            return 0.0
        return float(np.mean(self.crashed))

    @property
    def robustness(self) -> float:
        """Cross-seed consistency: 1 - std/|mean|, clamped to [0, 1].

        Computed over finite final values only; needs at least two of them.
        High = the optimizer lands at similar values regardless of seed.
        """
        finite = [v for v in self.final_values if np.isfinite(v)]
        if len(finite) < 2:
            return 0.0
        center = np.mean(finite)
        spread = np.std(finite)
        if abs(center) < 1e-9:
            # Near-zero mean makes the ratio ill-conditioned; fall back to
            # judging the spread directly.
            return 1.0 if spread < 1e-6 else 0.0
        return float(np.clip(1.0 - spread / (abs(center) + 1e-9), 0.0, 1.0))


def run_arena(optimizer: CompiledOptimizer, ls: Landscape,
              seeds: list[int], steps: int = 200,
              init_scale: float = 0.5) -> ArenaResult:
    """Run the compiled optimizer from a fresh random init for every seed.

    Each seed draws x_0 ~ N(0, init_scale) in ``ls.dim`` dimensions, then
    iterates ``x <- optimizer.step(x, f(x), grad(x))`` for ``steps`` rounds
    while recording the trajectory. A ``SandboxError`` marks that seed as
    crashed (final value NaN) without aborting the remaining seeds.

    Deliberately independent of ``ls.f_min``: per-seed progress is
    ``f_initial - f_final``, which is observable whether or not the global
    minimum is known.
    """
    initial_values: list[float] = []
    final_values: list[float] = []
    crash_flags: list[bool] = []
    trajectories: list[list[dict]] = []

    for seed in seeds:
        rng = np.random.default_rng(seed)
        point = rng.normal(0.0, init_scale, size=ls.dim)
        initial_values.append(float(ls.f(point)))

        history: list[dict] = []
        ok = True
        try:
            for step_idx in range(steps):
                value = float(ls.f(point))
                gradient = np.asarray(ls.grad(point), dtype=float)
                # State is logged *before* the step, so history[t] is the
                # point the optimizer saw at iteration t.
                history.append({"t": step_idx, "x": point.tolist(), "f": value})
                point = optimizer.step(point, value, gradient)
        except SandboxError:
            ok = False

        final_values.append(float(ls.f(point)) if ok else float("nan"))
        crash_flags.append(not ok)
        trajectories.append(history)

    return ArenaResult(
        initial_values=initial_values,
        final_values=final_values,
        crashed=crash_flags,
        trajectories=trajectories,
    )


#: final_f above this (absolute value) is treated as silent divergence even
#: if every step() call returned finite values. Picked so genuinely bad
#: convergence on stiff landscapes (f around 1e6) still counts as a valid
#: run, but runaway growth (f around 1e10+) gets flagged.
DIVERGENCE_F_THRESHOLD = 1e10


def auto_test_draft(optimizer: "CompiledOptimizer", ls: "Landscape",
                    seed: int = 0, steps: int = 20, init_scale: float = 0.5) -> dict:
    """Single-seed quick test used at draft() time.

    Runs the optimizer for ``steps`` steps from a fixed seed, returning a
    dict ``{"summary": ..., "detail": ...}`` — a compact summary (for the
    LLM) plus full per-step detail (for ``inspect`` follow-ups).

    Silent-divergence detection: if the code raises, we flag diverged=True as
    usual. But we ALSO flag diverged when the optimizer returned finite arrays
    yet the landscape's f(x) blew past ``DIVERGENCE_F_THRESHOLD`` — the common
    "LR too high, f grows geometrically, no errors" case.

    A diverged run still reports ``final_f`` / ``min_f`` / ``step_of_min``
    from whatever steps completed, so the caller can see *how far* the run
    exploded; they are ``None`` only when not a single step finished.
    """
    rng = np.random.default_rng(seed)
    x = rng.normal(0.0, init_scale, size=ls.dim)
    x0 = x.copy()
    initial_f = float(ls.f(x0))  # computed once; reused by both summary branches
    detail: list[dict] = []
    diverged = False
    err: str | None = None
    try:
        for t in range(steps):
            fv = float(ls.f(x))
            g = np.asarray(ls.grad(x), dtype=float)
            gn = float(np.linalg.norm(g))
            prev_x = x.copy()
            x = optimizer.step(x, fv, g)
            update_norm = float(np.linalg.norm(x - prev_x))
            # Effective step size: displacement per unit of gradient norm
            # (epsilon guards against a zero gradient).
            step_size = update_norm / (gn + 1e-12)
            detail.append({
                "t": t, "x": x.tolist(), "f": float(ls.f(x)),
                "grad_norm": gn, "update_norm": update_norm, "step_size_eff": step_size,
            })
            # Silent-divergence guard — f exploded even though step() didn't raise
            if abs(detail[-1]["f"]) > DIVERGENCE_F_THRESHOLD:
                diverged = True
                err = (f"silent divergence: |f| = {detail[-1]['f']:.3g} "
                        f"> {DIVERGENCE_F_THRESHOLD:.0e} at step {t}")
                break
    except SandboxError as e:
        diverged = True
        err = str(e)

    fs = [d["f"] for d in detail]
    if diverged or not fs:
        # Bug fix: final_f/min_f/step_of_min used to be None on every
        # diverged run, hiding *how badly* it exploded even though detail
        # held the numbers. Report completed steps; None only when none ran.
        summary = {
            "converged": False, "diverged": True, "error": err,
            "final_f": fs[-1] if fs else None, "initial_f": initial_f,
            "step_of_min": int(np.argmin(fs)) if fs else None,
            "min_f": min(fs) if fs else None,
        }
    else:
        summary = {
            # Converged = final value fell below 10% of the starting value.
            "converged": bool(fs[-1] < 0.1 * initial_f),
            "diverged": False, "error": None,
            "final_f": fs[-1], "initial_f": initial_f,
            "step_of_min": int(np.argmin(fs)), "min_f": min(fs),
        }
    return {"summary": summary, "detail": detail}