"""LandscapeForge OpenEnv environment — OptCoder REPL (Phase C).

For v1 we ship OptCoder-only: LandscapeForge is a fixed template picker
controlled by the env itself (uniform random over the tier menu). The agent
acting through OpenEnv is OptCoder.

Each `reset()` samples a new landscape from the current tier. Each `step()`
executes one OptCoder action (run_baseline / draft / inspect / commit),
mutates env state, and returns an Observation reflecting the new state.
Episode ends when OptCoder commits or budget is exhausted.
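
Illustrative driver loop (a sketch, not a contract: the exact
`LandscapeforgeAction` constructor arguments are assumptions inferred from
the fields this module reads, namely `kind`, `baseline_name`, and `code`;
`my_optimizer_source` is a placeholder for a draft's source string):

    env = LandscapeforgeEnvironment(tier="T0", seed=0)
    obs = env.reset()
    obs = env.step(LandscapeforgeAction(kind="run_baseline", baseline_name="sgd"))
    obs = env.step(LandscapeforgeAction(kind="draft", code=my_optimizer_source))
    obs = env.step(LandscapeforgeAction(kind="commit"))
    assert obs.done and obs.committed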
"""

from __future__ import annotations

from typing import Optional
from uuid import uuid4

import numpy as np
from openenv.core.env_server.interfaces import Environment
from openenv.core.env_server.types import State

try:
    from ..models import (
        ACTION_COSTS,
        LandscapeforgeAction,
        LandscapeforgeObservation,
    )
    from ..landscapes import (
        TIER_MENU,
        Landscape,
        build_landscape,
        structural_hints,
    )
    from ..reference_optimizers import run_baseline as run_reference_baseline
    from ..reference_optimizers import tune_adam_lr
    from ..sandbox import SandboxError, compile_optimizer
    from ..arena import ArenaResult, auto_test_draft, run_arena
    from ..rewards import ast_novelty_score, compute_optcoder_reward, compute_step_reward
except ImportError:
    # Running from repo root or package layout quirks
    from models import (                                    # type: ignore
        ACTION_COSTS,
        LandscapeforgeAction,
        LandscapeforgeObservation,
    )
    from landscapes import (                                 # type: ignore
        TIER_MENU,
        Landscape,
        build_landscape,
        structural_hints,
    )
    from reference_optimizers import run_baseline as run_reference_baseline  # type: ignore
    from reference_optimizers import tune_adam_lr  # type: ignore
    from sandbox import SandboxError, compile_optimizer     # type: ignore
    from arena import ArenaResult, auto_test_draft, run_arena  # type: ignore
    from rewards import ast_novelty_score, compute_optcoder_reward, compute_step_reward  # type: ignore


BUDGET_TOTAL = 12
ARENA_SEEDS = [101, 202, 303, 404, 505, 606, 707, 808, 909, 1010]
ARENA_STEPS = 200
BASELINE_STEPS = 30    # env-controlled; agent does not choose
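# Note: BUDGET_TOTAL is denominated in the per-action costs from ACTION_COSTS
# (defined in models.py), not in raw step() calls. An episode ends on commit,
# when spend reaches BUDGET_TOTAL, or when a non-commit action would exceed it
# (see step() below).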


# Reference source blobs for AST novelty comparison (short pseudo-
# implementations, used in this module only via ast_novelty_score, never
# executed). Kept minimal: enough to detect "this commit is basically Adam".
_REF_SGD = """
class Optimizer:
    def __init__(self, dim): self.lr = 0.01
    def step(self, x, f, g): return x - self.lr * g
""".strip()

def _adam_source(lr: float) -> str:
    """Adam reference implementation parameterized by LR.

    Used by `_ensure_adam_arena` after LR tuning — the baseline is
    Adam-at-best-LR-for-this-landscape, not Adam-at-fixed-default.
    """
    return f"""
class Optimizer:
    def __init__(self, dim):
        self.lr = {lr}
        self.b1 = 0.9
        self.b2 = 0.999
        self.eps = 1e-8
        self.m = np.zeros(dim)
        self.v = np.zeros(dim)
        self.t = 0
    def step(self, x, f_val, g):
        self.t += 1
        self.m = self.b1*self.m + (1-self.b1)*g
        self.v = self.b2*self.v + (1-self.b2)*g*g
        mh = self.m/(1-self.b1**self.t)
        vh = self.v/(1-self.b2**self.t)
        return x - self.lr * mh / (np.sqrt(vh) + self.eps)
""".strip()


# Frozen default-LR source used only for AST-novelty comparison (so r_novelty
# measures "structurally different from Adam" regardless of the tuned LR).
_REF_ADAM = _adam_source(0.001)

_REF_MOMENTUM = """
class Optimizer:
    def __init__(self, dim):
        import numpy as np
        self.lr=0.01; self.beta=0.9; self.v = np.zeros(dim)
    def step(self, x, f, g):
        self.v = self.beta*self.v - self.lr*g
        return x + self.v
""".strip()

REFERENCE_SOURCES = [_REF_SGD, _REF_ADAM, _REF_MOMENTUM]
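# Contract these blobs illustrate (the authoritative check is
# sandbox.compile_optimizer, so treat this as a summary, not a spec): drafts
# define a class named `Optimizer` with `__init__(self, dim)` and
# `step(self, x, f_val, g)` returning the next iterate.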


class LandscapeforgeEnvironment(Environment):
    """OptCoder-facing OpenEnv environment.

    LandscapeForge is internal (template picker) in v1.
    """

    SUPPORTS_CONCURRENT_SESSIONS: bool = True

    def __init__(self, tier: str = "T0", seed: int = 0):
        self._initial_tier = tier
        self._master_rng = np.random.default_rng(seed)
        self._reset_count = 0
        self._tier = tier
        self._state = State(episode_id=str(uuid4()), step_count=0)
        # Populated by reset()
        self._landscape: Optional[Landscape] = None
        self._hints: dict = {}
        self._baseline_history: list[dict] = []
        self._draft_history: list[dict] = []
        self._draft_details: list[list[dict]] = []     # per-draft per-step detail
        self._inspect_requests: list[dict] = []
        self._current_draft: Optional[str] = None
        self._budget_spent: int = 0
        self._committed: bool = False
        self._final_obs: Optional[LandscapeforgeObservation] = None
        # Cache Adam's full arena result per episode (computed lazily, for
        # reward normalization via progress-based r_regret). The baseline is
        # Adam-at-tuned-LR — per-landscape LR is selected via a short sweep.
        self._adam_arena_cache: Optional[ArenaResult] = None
        self._adam_tuned_lr: Optional[float] = None
        # Stepwise feedback log (PBS delta + compile penalty). This is shown to
        # the LLM in the observation so it can course-correct mid-episode, but
        # NEVER added to the training scalar — final reward is purely terminal
        # arena reward (§9.1) for robustness against reward hacking.
        self._step_feedback_log: list[dict] = []
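        # Each logged entry is {"turn": <int>, "action_kind": <str>,
        # **breakdown}; the breakdown fields (PBS delta, compile penalty)
        # come from rewards.compute_step_reward (see step() below).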

    # ---------- OpenEnv API ----------

    def reset(self) -> LandscapeforgeObservation:
        self._reset_count += 1
        self._state = State(episode_id=str(uuid4()), step_count=0)

        # Pick a landscape from the current tier's menu.
        menu = TIER_MENU[self._tier]
        template = str(self._master_rng.choice(menu))
        dim = int(self._master_rng.integers(2, 6))   # small dims for v1
        params = self._sample_params(template)
        self._landscape = build_landscape(
            template=template, dim=dim, params=params,
            rng=np.random.default_rng(int(self._master_rng.integers(0, 2**31))),
        )
        self._hints = structural_hints(
            self._landscape,
            rng=np.random.default_rng(int(self._master_rng.integers(0, 2**31))),
        )

        # Wipe REPL state
        self._baseline_history = []
        self._draft_history = []
        self._draft_details = []
        self._inspect_requests = []
        self._current_draft = None
        self._budget_spent = 0
        self._committed = False
        self._final_obs = None
        self._adam_arena_cache = None
        self._adam_tuned_lr = None
        self._step_feedback_log = []

        return self._make_observation(
            last_kind=None, last_result={"reset": True}, done=False, reward=0.0,
        )

    def step(self, action: LandscapeforgeAction) -> LandscapeforgeObservation:  # type: ignore[override]
        if self._landscape is None:
            raise RuntimeError("step() called before reset()")
        if self._committed:
            # Episode already done; return terminal obs.
            assert self._final_obs is not None
            return self._final_obs

        self._state.step_count += 1
        # Validate the kind before the cost lookup, so an unknown action kind
        # raises a clear ValueError instead of a bare KeyError from the dict.
        if action.kind not in ACTION_COSTS:
            raise ValueError(f"Unknown action kind: {action.kind}")
        cost = ACTION_COSTS[action.kind]
        # Check the budget before executing: a non-commit action that would
        # exceed it ends the episode via forced commit instead of running.
        if self._budget_spent + cost > BUDGET_TOTAL and action.kind != "commit":
            return self._force_commit(reason="budget_exhausted")

        self._budget_spent += cost

        # Snapshot draft history for PBS computation
        prev_draft_history_snapshot = list(self._draft_history)

        if action.kind == "run_baseline":
            result = self._do_run_baseline(action)
        elif action.kind == "draft":
            result = self._do_draft(action)
        elif action.kind == "inspect":
            result = self._do_inspect(action)
        elif action.kind == "commit":
            return self._do_commit()
        else:
            raise ValueError(f"Unknown action kind: {action.kind}")

        # Compute stepwise FEEDBACK (NOT reward). Signals the LLM can use to
        # course-correct mid-episode — exposed through last_action_result.
        # Explicitly NOT summed into training reward; terminal arena reward
        # is the only signal GRPO sees (robust against reward hacking).
        step_feedback = compute_step_reward(
            prev_draft_history=prev_draft_history_snapshot,
            new_draft_history=self._draft_history,
            action_kind=action.kind,
            action_result=result,
        )
        if step_feedback["breakdown"]:
            entry = {
                "turn": self._state.step_count,
                "action_kind": action.kind,
                **step_feedback["breakdown"],
            }
            self._step_feedback_log.append(entry)
            # Surface on this turn's action result so the LLM sees it immediately.
            result = {**result, "feedback": step_feedback["breakdown"]}

        # Check if budget now exhausted; if so, auto-commit.
        if self._budget_spent >= BUDGET_TOTAL:
            return self._force_commit(reason="budget_exhausted")

        return self._make_observation(
            last_kind=action.kind, last_result=result,
            done=False, reward=0.0,    # no reward on non-terminal steps
        )

    @property
    def state(self) -> State:
        return self._state

    # ---------- Action handlers ----------

    def _do_run_baseline(self, action: LandscapeforgeAction) -> dict:
        assert self._landscape is not None
        # Fixed init AND fixed step count for baseline comparability across
        # episodes and rollouts (important for GRPO group-relative advantages).
        rng = np.random.default_rng(42)
        x0 = rng.normal(0.0, 0.5, size=self._landscape.dim)
        result = run_reference_baseline(
            name=action.baseline_name, f=self._landscape.f, grad=self._landscape.grad,
            x0=x0, steps=BASELINE_STEPS,
        )
        self._baseline_history.append(result)
        return {
            "baseline_index": len(self._baseline_history) - 1,
            "name": result["name"],
            "n_steps": len(result["trajectory"]),
            "final_f": (result["trajectory"][-1]["f"]
                        if result["trajectory"] and result["trajectory"][-1]["f"] is not None
                        else None),
        }

    def _do_draft(self, action: LandscapeforgeAction) -> dict:
        assert self._landscape is not None
        code = action.code or ""
        self._current_draft = code
        try:
            opt = compile_optimizer(code, dim=self._landscape.dim)
        except SandboxError as e:
            # Record failed draft; still counts toward history for inspect.
            self._draft_history.append({
                "code": code,
                "compile_error": str(e),
                "summary": {"converged": False, "diverged": True, "error": str(e),
                            "final_f": None, "step_of_min": None, "min_f": None},
            })
            self._draft_details.append([])
            return {"draft_index": len(self._draft_history) - 1,
                    "compile_error": str(e), "summary": None}

        test = auto_test_draft(opt, self._landscape, seed=0, steps=20)
        self._draft_history.append({
            "code": code,
            "compile_error": None,
            "summary": test["summary"],
        })
        self._draft_details.append(test["detail"])
        return {"draft_index": len(self._draft_history) - 1,
                "compile_error": None, "summary": test["summary"]}

    def _do_inspect(self, action: LandscapeforgeAction) -> dict:
        idx = action.draft_idx
        if idx is None or idx < 0 or idx >= len(self._draft_details):
            return {"error": f"draft_idx {idx} out of range (have {len(self._draft_details)} drafts)"}
        detail = self._draft_details[idx]
        # Clamp the requested window; treat a missing bound as "from the
        # start" / "to the end" in case the action model leaves it unset.
        start = max(0, action.step_range_start if action.step_range_start is not None else 0)
        end = min(action.step_range_end if action.step_range_end is not None else len(detail),
                  len(detail))
        sliced = detail[start:end]
        record = {
            "draft_idx": idx,
            "step_range": [start, end],
            "detail": sliced,
        }
        self._inspect_requests.append(record)
        return {"draft_idx": idx, "step_range": [start, end], "n_steps": len(sliced)}

    def _do_commit(self) -> LandscapeforgeObservation:
        return self._finalize_episode(reason="commit")

    def _force_commit(self, reason: str) -> LandscapeforgeObservation:
        return self._finalize_episode(reason=reason)

    # ---------- Episode finalization ----------

    def _finalize_episode(self, reason: str) -> LandscapeforgeObservation:
        assert self._landscape is not None
        self._committed = True

        # If no draft was ever produced, finalize with a worst-case result.
        if not self._current_draft:
            result = {
                "reason": reason,
                "no_draft": True,
                "final_regret": 1.0,
            }
            r_total = -1.0
            breakdown = {"no_draft": 1.0}
            obs = self._make_observation(
                last_kind="commit", last_result=result,
                done=True, reward=r_total,
            )
            obs.committed = True
            obs.final_regret = 1.0
            obs.r_optcoder = r_total
            obs.r_optcoder_breakdown = breakdown
            self._final_obs = obs
            return obs

        # Full Phase-D arena eval
        try:
            opt = compile_optimizer(self._current_draft, dim=self._landscape.dim)
            arena = run_arena(opt, self._landscape, seeds=ARENA_SEEDS, steps=ARENA_STEPS)
        except SandboxError:
            # Committed code fails to compile -> worst-case result
            arena = ArenaResult(
                initial_values=[1.0] * len(ARENA_SEEDS),
                final_values=[float("nan")] * len(ARENA_SEEDS),
                crashed=[True] * len(ARENA_SEEDS),
                trajectories=[[] for _ in ARENA_SEEDS],
            )

        # Adam baseline arena for normalization (always run for reward stability).
        adam_arena = self._ensure_adam_arena()

        novelty = ast_novelty_score(self._current_draft, REFERENCE_SOURCES)
        # Convergence step: first seed's trajectory, first step where f < 0.01 * f0
        convergence_step = self._compute_convergence_step(arena)

        reward = compute_optcoder_reward(
            arena=arena,
            adam_arena=adam_arena,
            actions_used_cost=self._budget_spent,
            budget_total=BUDGET_TOTAL,
            novelty_score=novelty,
            convergence_step=convergence_step,
            arena_steps=ARENA_STEPS,
        )

        result = {
            "reason": reason,
            "my_mean_progress": arena.mean_progress,
            "adam_mean_progress": adam_arena.mean_progress,
            "adam_tuned_lr": self._adam_tuned_lr,
            "speedup_vs_adam": reward.breakdown.get("speedup_vs_adam"),
            "crash_fraction": arena.crash_fraction,
            "novelty_score": novelty,
            "convergence_step": convergence_step,
        }

        obs = self._make_observation(
            last_kind="commit", last_result=result,
            done=True, reward=reward.r_total,
        )
        obs.committed = True
        # `final_regret` is reinterpreted so it needs no f_min: it is the
        # shortfall versus Adam, clipped to [0, 1]. 0 means the commit matched
        # or beat Adam's descent; 1 means it made zero progress while Adam
        # descended normally.
        speedup = reward.breakdown.get("speedup_vs_adam", 0.0)
        obs.final_regret = float(max(0.0, min(1.0, 1.0 - speedup)))
        obs.r_optcoder = reward.r_total
        obs.r_optcoder_breakdown = reward.breakdown
        self._final_obs = obs
        return obs

    # ---------- Helpers ----------

    def _make_observation(self, last_kind: Optional[str], last_result: dict,
                          done: bool, reward: float) -> LandscapeforgeObservation:
        assert self._landscape is not None
        return LandscapeforgeObservation(
            landscape_description=self._landscape.description,
            dim=self._landscape.dim,
            structural_hints=self._hints,
            baseline_history=self._serialize_baseline_history(),
            draft_history=self._serialize_draft_history(),
            inspect_requests=list(self._inspect_requests),
            current_draft=self._current_draft,
            budget_remaining=BUDGET_TOTAL - self._budget_spent,
            last_action_kind=last_kind,
            last_action_result=last_result,
            done=done,
            reward=reward,
        )

    def _serialize_baseline_history(self) -> list[dict]:
        # Serialize each baseline run as its name plus the full per-step
        # trajectory (no trimming is done here; runs are only BASELINE_STEPS
        # long).
        return [
            {"name": b["name"], "trajectory": b["trajectory"]}
            for b in self._baseline_history
        ]

    def _serialize_draft_history(self) -> list[dict]:
        # For the observation we include code + summary per draft.
        return [
            {"code": d["code"], "summary": d["summary"], "compile_error": d["compile_error"]}
            for d in self._draft_history
        ]

    def _sample_params(self, template: str) -> dict:
        rng = self._master_rng
        if template == "quadratic":
            # Condition-number cap grows with tier: T0 up to 100, T1 up to
            # 1000, and T2 or any later tier up to 10,000 (a KeyError here
            # would otherwise break tiers beyond T2).
            cap = {"T0": 100.0, "T1": 1000.0}.get(self._tier, 10_000.0)
            return {"cond": float(rng.uniform(1.0, cap))}
        if template == "gaussian_mix":
            return {
                "k": int(rng.integers(2, 6)),
                "sigma": float(rng.uniform(0.3, 1.0)),
                "spread": float(rng.uniform(1.0, 4.0)),
            }
        if template == "huber":
            return {"delta": float(rng.uniform(0.5, 2.0))}
        return {}

    def _ensure_adam_arena(self) -> ArenaResult:
        """Build the Adam baseline, FAIRLY — LR is tuned per landscape before
        running the arena. The tuning uses a short 30-step sweep on a dedicated
        seed (not one of the arena seeds) to avoid overfitting.

        Cached per episode in `_adam_arena_cache`. Tuned LR is stored in
        `_adam_tuned_lr` for logging / demo surfacing.
        """
        if self._adam_arena_cache is not None:
            return self._adam_arena_cache
        assert self._landscape is not None
        try:
            # Tune LR on seed 0 (not in ARENA_SEEDS), 30-step sweep.
            tune_rng = np.random.default_rng(0)
            tune_x0 = tune_rng.normal(0.0, 0.5, size=self._landscape.dim)
            best_lr = tune_adam_lr(
                f=self._landscape.f, grad=self._landscape.grad,
                x0=tune_x0, sweep_steps=30,
            )
            self._adam_tuned_lr = best_lr

            adam_opt = compile_optimizer(_adam_source(best_lr), dim=self._landscape.dim)
            self._adam_arena_cache = run_arena(
                adam_opt, self._landscape,
                seeds=ARENA_SEEDS, steps=ARENA_STEPS,
            )
        except Exception:
            self._adam_tuned_lr = None
            self._adam_arena_cache = ArenaResult(
                initial_values=[1.0] * len(ARENA_SEEDS),
                final_values=[1.0] * len(ARENA_SEEDS),
                crashed=[True] * len(ARENA_SEEDS),
                trajectories=[[] for _ in ARENA_SEEDS],
            )
        return self._adam_arena_cache

    def _compute_convergence_step(self, arena: ArenaResult) -> Optional[int]:
        """First step on the first seed where f drops below 1% of initial f."""
        if not arena.trajectories or not arena.trajectories[0]:
            return None
        traj = arena.trajectories[0]
        f0 = traj[0]["f"]
        if f0 <= 0:
            return None
        threshold = 0.01 * f0
        for t, snap in enumerate(traj):
            if snap["f"] < threshold:
                return t
        return None

    # ---------- Tier advancement API (used by trainer, not agent) ----------

    def advance_tier(self, new_tier: str) -> None:
        if new_tier not in TIER_MENU:
            raise ValueError(f"Unknown tier {new_tier}")
        self._tier = new_tier
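
# Illustrative trainer-side curriculum hook (a sketch: the promotion
# criterion and `rolling_mean_regret` are hypothetical, not part of this
# environment's API):
#
#     env = LandscapeforgeEnvironment(tier="T0", seed=0)
#     ... run episodes, tracking obs.final_regret ...
#     if rolling_mean_regret < 0.2:
#         env.advance_tier("T1")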