File size: 5,453 Bytes
b0b140b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
"""End-to-end smoke test: scripted episode, in-process, no server.

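Runs: run_baseline(adam) -> draft(Adam-ish) -> inspect -> draft(SGD+momentum)
      -> commit, and verifies the env threads state correctly and produces a
      finite reward.

Also covers two failure paths: a draft that does not compile followed by a
commit, and repeated drafts until the budget runs out and the env auto-commits.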
"""

from __future__ import annotations

import math
import sys
from pathlib import Path

# Allow running directly: `python tests/test_episode.py`
sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent))

from landscapeforge.models import LandscapeforgeAction              # type: ignore
from landscapeforge.server.landscapeforge_environment import (       # type: ignore
    LandscapeforgeEnvironment,
)


# Optimizer source submitted as the `code` of "draft" actions below.
# Defines an `Optimizer` class with `__init__(dim)` and `step(x, f_val, grad)`
# implementing Adam with bias-corrected first/second moment estimates
# (lr=1e-3, b1=0.9, b2=0.999, eps=1e-8). The string is passed to the env
# verbatim, so its contents must not be reformatted.
ADAM_CODE = """
import numpy as np

class Optimizer:
    def __init__(self, dim):
        self.lr = 1e-3
        self.b1 = 0.9
        self.b2 = 0.999
        self.eps = 1e-8
        self.m = np.zeros(dim)
        self.v = np.zeros(dim)
        self.t = 0

    def step(self, x, f_val, grad):
        self.t += 1
        self.m = self.b1 * self.m + (1 - self.b1) * grad
        self.v = self.b2 * self.v + (1 - self.b2) * grad * grad
        m_hat = self.m / (1 - self.b1 ** self.t)
        v_hat = self.v / (1 - self.b2 ** self.t)
        return x - self.lr * m_hat / (np.sqrt(v_hat) + self.eps)
"""

# Alternative optimizer source submitted as the second draft in
# `scripted_episode`. Same `Optimizer` interface (`__init__(dim)`,
# `step(x, f_val, grad)`), implementing SGD with momentum: a velocity
# `v = beta * v - lr * grad` (lr=0.05, beta=0.9) that is added to `x`.
# The string is passed to the env verbatim, so its contents must not be
# reformatted.
SGDM_CODE = """
import numpy as np

class Optimizer:
    def __init__(self, dim):
        self.lr = 0.05
        self.beta = 0.9
        self.v = np.zeros(dim)

    def step(self, x, f_val, grad):
        self.v = self.beta * self.v - self.lr * grad
        return x + self.v
"""


def scripted_episode() -> None:
    """Drive one full scripted episode and sanity-check the final observation.

    Sequence: reset -> run_baseline(adam) -> draft(ADAM_CODE) -> inspect
    draft 0 over steps 10-20 -> draft(SGDM_CODE) -> commit. Every observation
    is printed so the run can be eyeballed when executed as a script.

    Raises:
        AssertionError: if the episode is not done after the commit, if the
            reward is missing or not finite, or if the final regret or the
            reward breakdown is missing.
    """
    env = LandscapeforgeEnvironment(tier="T0", seed=42)
    obs = env.reset()
    print(f"[reset] landscape: {obs.landscape_description}")
    print(f"        dim={obs.dim}, hints={obs.structural_hints}")
    print(f"        budget={obs.budget_remaining}")

    # 1. Run Adam baseline to see what it does.
    obs = env.step(LandscapeforgeAction(
        kind="run_baseline", baseline_name="adam",
    ))
    print(f"\n[run_baseline adam] result={obs.last_action_result}")
    print(f"                    budget_remaining={obs.budget_remaining}")

    # 2. Submit an Adam draft.
    obs = env.step(LandscapeforgeAction(kind="draft", code=ADAM_CODE))
    print(f"\n[draft adam] compile_error={obs.last_action_result.get('compile_error')}")
    print(f"             summary={obs.last_action_result.get('summary')}")
    print(f"             budget_remaining={obs.budget_remaining}")

    # 3. Inspect the first draft.
    obs = env.step(LandscapeforgeAction(
        kind="inspect", draft_idx=0, step_range_start=10, step_range_end=20,
    ))
    print(f"\n[inspect 0 steps 10-20] result={obs.last_action_result}")
    print(f"                        budget_remaining={obs.budget_remaining}")

    # 4. Submit an SGD+momentum alternative.
    obs = env.step(LandscapeforgeAction(kind="draft", code=SGDM_CODE))
    print(f"\n[draft sgdm] compile_error={obs.last_action_result.get('compile_error')}")
    print(f"             summary={obs.last_action_result.get('summary')}")
    print(f"             budget_remaining={obs.budget_remaining}")

    # 5. Commit.
    obs = env.step(LandscapeforgeAction(kind="commit"))
    print("\n[commit]")
    print(f"  done={obs.done}")
    print(f"  reward={obs.reward}")
    print(f"  final_regret={obs.final_regret}")
    print(f"  r_optcoder_breakdown={obs.r_optcoder_breakdown}")
    print(f"  last_action_result={obs.last_action_result}")

    # Sanity checks
    assert obs.done is True, "should be done after commit"
    assert obs.reward is not None, "reward must be produced"
    # `is not None` alone lets NaN/inf through; the module docstring promises
    # a finite reward, so check that explicitly.
    assert math.isfinite(obs.reward), f"reward must be finite, got {obs.reward!r}"
    assert obs.final_regret is not None, "final_regret must be produced"
    assert obs.r_optcoder_breakdown, "breakdown must be populated"
    print("\n✓ scripted_episode PASSED")


def episode_with_broken_code() -> None:
    """Submitting code that fails to compile should not crash the env.

    Drafts a source string that is not parsable Python, checks that the env
    reports a compile error without ending the episode, then commits and
    checks that the episode ends with a reward.

    Raises:
        AssertionError: if no compile error is reported, if the episode ends
            before the commit, or if the commit does not end the episode with
            a reward.
    """
    env = LandscapeforgeEnvironment(tier="T0", seed=7)
    env.reset()

    # Intentional syntax error. The payload must be genuinely unparsable: a
    # phrase such as `this is not python` is a valid `is not` comparison that
    # compiles and only fails at run time with NameError.
    obs = env.step(LandscapeforgeAction(
        kind="draft", code="class Optimizer(:\n    pass\n",
    ))
    print(f"\n[broken draft] compile_error={obs.last_action_result.get('compile_error')}")
    assert obs.last_action_result.get("compile_error") is not None
    assert obs.done is False

    # Commit with bad code — should produce worst-case regret, not crash
    obs = env.step(LandscapeforgeAction(kind="commit"))
    print(f"[broken commit] reward={obs.reward}, final_regret={obs.final_regret}")
    assert obs.done is True
    assert obs.reward is not None
    print("\n✓ episode_with_broken_code PASSED")


def budget_exhaustion() -> None:
    """Keep drafting until the env ends the episode on its own.

    Submits up to ten ADAM_CODE drafts. The first observation flagged as done
    must carry the reason "budget_exhausted"; if no observation is done after
    ten drafts the scenario fails.

    Raises:
        AssertionError: if the episode ends for a different reason, or never
            ends within ten drafts.
    """
    env = LandscapeforgeEnvironment(tier="T0", seed=3)
    env.reset()

    max_drafts = 10
    for drafts_submitted in range(1, max_drafts + 1):
        obs = env.step(LandscapeforgeAction(kind="draft", code=ADAM_CODE))
        if not obs.done:
            continue

        # The env ended the episode by itself; check why.
        reason = obs.last_action_result.get("reason")
        print(f"\n[budget_exhaustion] auto-committed after {drafts_submitted} drafts")
        print(f"                    reason={reason}")
        assert reason == "budget_exhausted"
        print("\n✓ budget_exhaustion PASSED")
        return

    raise AssertionError("Budget never exhausted — shouldn't happen with draft cost 2, budget 12")


if __name__ == "__main__":
    # Script entry point: run each scenario in order. A failed assertion
    # propagates and aborts the run before the final message is printed.
    for scenario in (scripted_episode, episode_with_broken_code, budget_exhaustion):
        scenario()
    print("\nAll tests passed.")