File size: 7,021 Bytes
c452421
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
"""Pre-submission validation script.

Validates all requirements:
  - openenv.yaml exists and is valid
  - All 7 tasks are defined
  - reset() / step() / state() work correctly
  - Graders produce scores in [0.0, 1.0]
  - Baseline is reproducible
  - Native OpenEnv adapter reset/step/state works when dependencies are installed
  - Typed models validate
"""

import sys
import traceback
from typing import List, Tuple

from src.environment import IncidentResponseEnv
from src.models import Action, ActionType
from src.scenarios import SCENARIOS
from src.tasks import get_all_tasks
from baseline.inference import run_all_tasks


def _check(name: str, fn) -> Tuple[bool, str]:
    try:
        result = fn()
        return True, result or "OK"
    except Exception as exc:
        return False, f"FAILED: {exc}\n{traceback.format_exc()}"


def validate() -> bool:
    """Run every pre-submission check, print a report, and return the verdict.

    Each check is a small closure executed through ``_check``, which turns
    any exception into a failed entry, so one failing check does not stop
    the remaining ones from running.

    Returns:
        True when every check passed, False otherwise.
    """
    checks: List[Tuple[str, bool, str]] = []
    env = IncidentResponseEnv()

    # 1. openenv.yaml exists
    def check_yaml():
        try:
            import yaml
        except ImportError:
            # The fallback must live inside the check: _check() catches every
            # Exception, so an ImportError would never reach the caller.
            import os
            if not os.path.exists("openenv.yaml"):
                raise FileNotFoundError("openenv.yaml not found")
            return "File exists (yaml not installed for full check)"
        with open("openenv.yaml", encoding="utf-8") as f:
            data = yaml.safe_load(f)
        assert data["name"] == "sentinel-oversight-command"
        assert len(data["tasks"]) >= 3
        return f"Found {len(data['tasks'])} tasks"

    ok, msg = _check("openenv.yaml", check_yaml)
    checks.append(("openenv.yaml valid", ok, msg))

    # 2. Tasks defined
    def check_tasks():
        tasks = get_all_tasks()
        assert len(tasks) >= 7
        for t in tasks:
            assert t.difficulty in ("easy", "medium", "hard", "expert")
        return f"{len(tasks)} tasks defined"
    ok, msg = _check("Tasks", check_tasks)
    checks.append(("7 tasks defined", ok, msg))

    # 3. reset() for all tasks
    def check_reset():
        for task_id in SCENARIOS:
            obs = env.reset(task_id)
            assert obs.step_number == 0
            assert len(obs.alerts) > 0
        return "All tasks reset successfully"
    ok, msg = _check("reset()", check_reset)
    checks.append(("reset() works", ok, msg))

    # 4. step() returns correct types
    def check_step():
        env.reset("severity_classification")
        result = env.step(Action(
            action_type=ActionType.INVESTIGATE,
            target="postgres-primary",
        ))
        assert hasattr(result, "observation")
        assert hasattr(result, "reward")
        assert hasattr(result, "done")
        assert hasattr(result, "info")
        assert -1.0 <= result.reward.value <= 1.0
        return "Step returns correct StepResult"
    ok, msg = _check("step()", check_step)
    checks.append(("step() returns StepResult", ok, msg))

    # 5. state() returns correct type
    def check_state():
        env.reset("severity_classification")
        env.step(Action(action_type=ActionType.INVESTIGATE, target="user-service"))
        state = env.state()
        assert state.step_number == 1
        assert state.task_id == "severity_classification"
        return "State snapshot correct"
    ok, msg = _check("state()", check_state)
    checks.append(("state() works", ok, msg))

    # 6. Graders in [0.0, 1.0]
    def check_graders():
        for task_id in SCENARIOS:
            env.reset(task_id)
            svc = SCENARIOS[task_id].available_services[0]
            env.step(Action(action_type=ActionType.INVESTIGATE, target=svc))
            result = env.grade()
            assert 0.0 <= result.score <= 1.0, f"{task_id}: {result.score}"
        return "All graders in [0.0, 1.0]"
    ok, msg = _check("Graders", check_graders)
    checks.append(("Graders score [0.0-1.0]", ok, msg))

    # 7. Baseline reproducible
    def check_baseline():
        r1 = list(run_all_tasks(env_instance=env, mode="rules"))
        r2 = list(run_all_tasks(env_instance=env, mode="rules"))
        # zip() stops at the shorter run, so a length mismatch would
        # otherwise go unnoticed.
        assert len(r1) == len(r2), (
            f"Non-reproducible: run lengths differ ({len(r1)} vs {len(r2)})"
        )
        for a, b in zip(r1, r2):
            assert a["score"] == b["score"], f"Non-reproducible: {a['task_id']}"
        scores = [r["score"] for r in r1]
        return f"Baseline scores: {[f'{s:.4f}' for s in scores]}"
    ok, msg = _check("Baseline", check_baseline)
    checks.append(("Baseline reproducible", ok, msg))

    # 8. SENTINEL environment (if available)
    def check_sentinel():
        try:
            from sentinel.environment import SentinelEnv
            sent_env = SentinelEnv()
            sent_tasks = ["basic_oversight", "fleet_monitoring_conflict", "adversarial_worker", "multi_crisis_command"]
            for task_id in sent_tasks:
                obs = sent_env.reset(task_id, variant_seed=0)
                assert hasattr(obs, "step_number")
                assert hasattr(obs, "proposed_action")
                grade = sent_env.grade()
                assert 0.0 <= grade.score <= 1.0
            return f"SENTINEL: {len(sent_tasks)} tasks validated"
        except ImportError:
            # An ImportError anywhere in this check is reported as a pass
            # with an "optional" note rather than as a failure.
            return "SENTINEL not installed (optional)"
    ok, msg = _check("SENTINEL", check_sentinel)
    checks.append(("SENTINEL environment", ok, msg))

    # 9. Native OpenEnv adapter (skips only when local OpenEnv deps are absent)
    def check_native_openenv():
        try:
            import dotenv  # noqa: F401
            import openenv  # noqa: F401
        except ImportError as exc:
            return f"Skipped locally: missing OpenEnv dependency ({exc})"

        from server.openenv_native import SentinelNativeAction, SentinelNativeEnvironment

        native_env = SentinelNativeEnvironment()
        obs = native_env.reset(task_id="basic_oversight", seed=1)
        assert obs.task_id == "basic_oversight"
        assert obs.proposed_action, "Native reset did not expose a proposal"
        result = native_env.step(
            SentinelNativeAction(
                action="APPROVE",
                explanation="Validation smoke test; policy correctness is checked separately.",
            )
        )
        assert isinstance(result.reward, float)
        state = native_env.state
        assert state.task_id == "basic_oversight"
        assert state.step_count == 1
        assert isinstance(state.latest_proposal, dict)
        return "Native OpenEnv reset/step/state smoke passed"

    ok, msg = _check("Native OpenEnv", check_native_openenv)
    checks.append(("Native OpenEnv adapter", ok, msg))

    # Print results
    print("\n" + "=" * 60)
    print("OpenEnv Pre-Submission Validation")
    print("=" * 60)

    all_pass = True
    for name, passed, detail in checks:
        status = "PASS" if passed else "FAIL"
        # Only the first 80 characters of the detail are shown per check.
        print(f"  [{status}] {name}: {detail[:80]}")
        if not passed:
            all_pass = False

    print("=" * 60)
    if all_pass:
        print("ALL CHECKS PASSED")
    else:
        print("SOME CHECKS FAILED - fix before submitting")
    print("=" * 60)

    return all_pass


if __name__ == "__main__":
    # Exit status 0 signals that every check passed; 1 signals a failure.
    exit_code = 0 if validate() else 1
    sys.exit(exit_code)