Spaces:

mnawfal29
/

landscapeforge

Sleeping

File size: 5,453 Bytes

b0b140b

"""End-to-end smoke test: scripted episode, in-process, no server.

Runs: run_baseline(adam) -> draft(Adam-ish) -> inspect -> draft(SGD+momentum)
      -> commit, and verifies the env threads state correctly and produces a
      finite reward.
"""

from __future__ import annotations

import math
import sys
from pathlib import Path

# Allow running directly: `python tests/test_episode.py`
sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent))

from landscapeforge.models import LandscapeforgeAction              # type: ignore
from landscapeforge.server.landscapeforge_environment import (       # type: ignore
    LandscapeforgeEnvironment,
)


# Source text of an Adam-style optimizer, passed verbatim as the `code` payload
# of "draft" actions in the tests below. The text defines an `Optimizer` class
# whose constructor takes `dim` and whose `step(x, f_val, grad)` returns the
# updated `x`; `f_val` is accepted but not used by this update rule.
# The update keeps exponential moving averages of the gradient (`m`) and of its
# elementwise square (`v`), and divides each by `1 - beta ** t` before use.
# NOTE(review): the `Optimizer(dim)` / `step(x, f_val, grad)` shape is presumably
# the contract the environment expects from drafts — confirm against the
# environment's draft loader.
ADAM_CODE = """
import numpy as np

class Optimizer:
    def __init__(self, dim):
        self.lr = 1e-3
        self.b1 = 0.9
        self.b2 = 0.999
        self.eps = 1e-8
        self.m = np.zeros(dim)
        self.v = np.zeros(dim)
        self.t = 0

    def step(self, x, f_val, grad):
        self.t += 1
        self.m = self.b1 * self.m + (1 - self.b1) * grad
        self.v = self.b2 * self.v + (1 - self.b2) * grad * grad
        m_hat = self.m / (1 - self.b1 ** self.t)
        v_hat = self.v / (1 - self.b2 ** self.t)
        return x - self.lr * m_hat / (np.sqrt(v_hat) + self.eps)
"""

# Source text of an SGD-with-momentum optimizer, passed verbatim as the `code`
# payload of the second "draft" action in `scripted_episode`. It defines an
# `Optimizer` class with the same `__init__(dim)` / `step(x, f_val, grad)`
# shape as ADAM_CODE; `f_val` is accepted but not used.
# Each step updates the velocity as `beta * v - lr * grad` and returns `x + v`.
SGDM_CODE = """
import numpy as np

class Optimizer:
    def __init__(self, dim):
        self.lr = 0.05
        self.beta = 0.9
        self.v = np.zeros(dim)

    def step(self, x, f_val, grad):
        self.v = self.beta * self.v - self.lr * grad
        return x + self.v
"""


def scripted_episode() -> None:
    """Drive one full scripted episode and validate the terminal observation.

    The sequence is: reset -> run_baseline("adam") -> draft(ADAM_CODE)
    -> inspect draft 0 (steps 10-20) -> draft(SGDM_CODE) -> commit.
    The observation returned by every step is printed so a failing run can be
    diagnosed from the log alone.

    Raises:
        AssertionError: if either known-good draft reports a compile error, or
            if the observation returned by the commit is not done, has a
            missing or non-finite reward, has no final regret, or has an empty
            reward breakdown.
    """
    env = LandscapeforgeEnvironment(tier="T0", seed=42)
    obs = env.reset()
    print(f"[reset] landscape: {obs.landscape_description}")
    print(f"        dim={obs.dim}, hints={obs.structural_hints}")
    print(f"        budget={obs.budget_remaining}")

    # 1. Run Adam baseline to see what it does.
    obs = env.step(LandscapeforgeAction(
        kind="run_baseline", baseline_name="adam",
    ))
    print(f"\n[run_baseline adam] result={obs.last_action_result}")
    print(f"                    budget_remaining={obs.budget_remaining}")

    # 2. Submit an Adam draft.
    obs = env.step(LandscapeforgeAction(kind="draft", code=ADAM_CODE))
    adam_error = obs.last_action_result.get('compile_error')
    print(f"\n[draft adam] compile_error={adam_error}")
    print(f"             summary={obs.last_action_result.get('summary')}")
    print(f"             budget_remaining={obs.budget_remaining}")
    # Without this check the episode would "pass" even if the draft never ran.
    assert not adam_error, f"ADAM_CODE should compile, got: {adam_error}"

    # 3. Inspect the first draft.
    obs = env.step(LandscapeforgeAction(
        kind="inspect", draft_idx=0, step_range_start=10, step_range_end=20,
    ))
    print(f"\n[inspect 0 steps 10-20] result={obs.last_action_result}")
    print(f"                        budget_remaining={obs.budget_remaining}")

    # 4. Submit an SGD+momentum alternative.
    obs = env.step(LandscapeforgeAction(kind="draft", code=SGDM_CODE))
    sgdm_error = obs.last_action_result.get('compile_error')
    print(f"\n[draft sgdm] compile_error={sgdm_error}")
    print(f"             summary={obs.last_action_result.get('summary')}")
    print(f"             budget_remaining={obs.budget_remaining}")
    assert not sgdm_error, f"SGDM_CODE should compile, got: {sgdm_error}"

    # 5. Commit.
    obs = env.step(LandscapeforgeAction(kind="commit"))
    print("\n[commit]")
    print(f"  done={obs.done}")
    print(f"  reward={obs.reward}")
    print(f"  final_regret={obs.final_regret}")
    print(f"  r_optcoder_breakdown={obs.r_optcoder_breakdown}")
    print(f"  last_action_result={obs.last_action_result}")

    # Sanity checks
    assert obs.done is True, "should be done after commit"
    assert obs.reward is not None, "reward must be produced"
    # "is not None" alone would let NaN/inf through; the episode is only
    # considered healthy when the reward is a finite number.
    assert math.isfinite(obs.reward), f"reward must be finite, got {obs.reward!r}"
    assert obs.final_regret is not None, "final_regret must be produced"
    assert obs.r_optcoder_breakdown, "breakdown must be populated"
    print("\n✓ scripted_episode PASSED")


def episode_with_broken_code() -> None:
    """Check that an uncompilable draft is reported as an error, not raised.

    Submits a draft whose code is not valid Python, expects the observation to
    carry a non-None ``compile_error`` while the episode stays open, then
    commits and expects a terminal observation that still carries a reward.

    Raises:
        AssertionError: if any of those expectations does not hold.
    """
    environment = LandscapeforgeEnvironment(tier="T0", seed=7)
    environment.reset()

    # Deliberately invalid source text.
    bad_draft = LandscapeforgeAction(kind="draft", code="this is not python")
    draft_obs = environment.step(bad_draft)
    compile_error = draft_obs.last_action_result.get("compile_error")
    print(f"\n[broken draft] compile_error={compile_error}")
    assert compile_error is not None
    assert draft_obs.done is False

    # Committing with only a broken draft must still finish the episode
    # and yield a reward instead of raising.
    final_obs = environment.step(LandscapeforgeAction(kind="commit"))
    print(f"[broken commit] reward={final_obs.reward}, final_regret={final_obs.final_regret}")
    assert final_obs.done is True
    assert final_obs.reward is not None
    print("\n✓ episode_with_broken_code PASSED")


def budget_exhaustion() -> None:
    """Check that repeated drafts eventually end the episode on their own.

    Submits ADAM_CODE as a draft up to ten times. As soon as an observation
    reports ``done``, its ``reason`` must equal ``"budget_exhausted"``.

    Raises:
        AssertionError: if the reason differs, or if the episode is still open
            after all ten drafts.
    """
    env = LandscapeforgeEnvironment(tier="T0", seed=3)
    env.reset()

    for drafts_sent in range(1, 11):
        result = env.step(LandscapeforgeAction(kind="draft", code=ADAM_CODE))
        if not result.done:
            continue

        # The episode closed without an explicit commit action.
        print(f"\n[budget_exhaustion] auto-committed after {drafts_sent} drafts")
        reason = result.last_action_result.get("reason")
        print(f"                    reason={reason}")
        assert reason == "budget_exhausted"
        print("\n✓ budget_exhaustion PASSED")
        return

    raise AssertionError("Budget never exhausted — shouldn't happen with draft cost 2, budget 12")


if __name__ == "__main__":
    # Run every check in order; the first failing assertion aborts the script
    # before the success banner is printed.
    for check in (scripted_episode, episode_with_broken_code, budget_exhaustion):
        check()
    print("\nAll tests passed.")