siddeshwar-kagatikar committed
Commit 3eeb606 · Parent: fde79db

Add automated validation gate
.github/workflows/validation.yml ADDED
@@ -0,0 +1,32 @@
+name: Validation
+
+on:
+  push:
+  pull_request:
+  workflow_dispatch:
+
+jobs:
+  validate:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install project
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install -e .[dev]
+
+      - name: Run test suite
+        run: python -m pytest -q
+
+      - name: Run validation gate
+        run: python scripts/validate_release.py
+
+      - name: Build Docker image
+        run: docker build -t osint-openenv-validation .
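
The workflow chains three gates that can also be run locally before pushing. A minimal sketch in Python (the commands and image tag are copied from the workflow above; the script itself is illustrative and not part of this commit):

```python
# Illustrative local mirror of the CI gate: run the same three steps the
# workflow runs, in order, stopping at the first failure.
import subprocess
import sys

STEPS = [
    ["python", "-m", "pytest", "-q"],                            # test suite
    ["python", "scripts/validate_release.py"],                   # validation gate
    ["docker", "build", "-t", "osint-openenv-validation", "."],  # image build
]

for step in STEPS:
    print(">>", " ".join(step))
    if subprocess.run(step).returncode != 0:
        sys.exit(1)
print("all gates passed")
```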
README.md CHANGED
@@ -109,6 +109,12 @@ Run tests:
 python -m pytest -q
 ```
 
+Run the automated release gate:
+
+```bash
+python scripts/validate_release.py
+```
+
 ## Usage
 
 Run one demo episode:
@@ -174,6 +180,29 @@ The FastAPI app serves:
 - `/api/environment`: environment metadata
 - `/healthz`: health check
 
+## Automated Validation
+
+The repository includes a pass/fail validation gate for the core delivery requirements:
+
+- Hugging Face Space readiness
+- OpenEnv spec compliance
+- reproducible baseline behavior
+- at least 3 fixed tasks with working graders
+- Docker image build in CI
+
+Local gate:
+
+```bash
+python scripts/validate_release.py
+```
+
+CI gate:
+
+- `.github/workflows/validation.yml`
+- runs `pytest`
+- runs the validation script
+- runs `docker build`
+
 ## Baseline Scores
 
 The fixed-level benchmark was expanded from the earlier 15-question set to a 30-question set with a larger seeded graph, so older benchmark artifacts should be treated as legacy and regenerated on the new dataset before using them as reference scores.
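
Callers that want per-check detail rather than a single exit code can invoke the suite in-process. A sketch assuming only the `run_validation_suite()` API added in this commit (the loop and status labels are illustrative):

```python
# Illustrative in-process use of the gate: print one PASS/FAIL line per check,
# then exit with the same 0/1 convention as scripts/validate_release.py.
from osint_env.validation import run_validation_suite

report = run_validation_suite()
for check in report["checks"]:
    status = "PASS" if check["passed"] else "FAIL"
    print(f"{status}  {check['name']}")
raise SystemExit(0 if report["passed"] else 1)
```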
scripts/validate_release.py ADDED
@@ -0,0 +1,21 @@
+from __future__ import annotations
+
+import json
+import sys
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parents[1]
+if str(ROOT) not in sys.path:
+    sys.path.insert(0, str(ROOT))
+
+from osint_env.validation import run_validation_suite
+
+
+def main() -> int:
+    result = run_validation_suite()
+    print(json.dumps(result, indent=2, sort_keys=True))
+    return 0 if result["passed"] else 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
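
The script's contract is simple: a JSON report on stdout and exit code 0 on pass, 1 on fail, so external tooling can consume it without importing the package. A hypothetical consumer (not part of the commit):

```python
# Illustrative consumer of scripts/validate_release.py: parse the JSON report
# from stdout and list any failing checks by name.
import json
import subprocess

proc = subprocess.run(
    ["python", "scripts/validate_release.py"],
    capture_output=True,
    text=True,
)
report = json.loads(proc.stdout)
failed = [check["name"] for check in report["checks"] if not check["passed"]]
print("exit code:", proc.returncode)
print("failed checks:", failed or "none")
```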
src/osint_env/validation.py ADDED
@@ -0,0 +1,258 @@
+from __future__ import annotations
+
+import json
+import tempfile
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from types import SimpleNamespace
+from typing import Any
+
+from fastapi.testclient import TestClient
+
+from server import app
+from osint_env.baselines.openai_runner import OpenAIBaselineConfig, OpenAIBaselineRunner, build_action_tools
+from osint_env.config import clone_environment_config, load_seeding_config, load_shared_config
+from osint_env.env.environment import OSINTEnvironment
+from osint_env.env.openenv_compat import Env
+from osint_env.env.reward import compute_answer_reward
+
+
+README_PATH = Path("README.md")
+DOCKERFILE_PATH = Path("Dockerfile")
+SHARED_CONFIG_PATH = "datasets/fixed_levels/shared_config_fixed_levels.json"
+SEED_FILE_PATH = "datasets/fixed_levels/seed_fixed_levels.json"
+
+
+@dataclass(slots=True)
+class ValidationResult:
+    name: str
+    passed: bool
+    details: dict[str, Any]
+
+
+def _build_environment() -> OSINTEnvironment:
+    shared = load_shared_config(SHARED_CONFIG_PATH)
+    env_cfg = clone_environment_config(shared.environment)
+    env_cfg.seeding = load_seeding_config(SEED_FILE_PATH)
+    env_cfg.llm.provider = "mock"
+    return OSINTEnvironment(env_cfg)
+
+
+def check_hf_space_readiness() -> ValidationResult:
+    text = README_PATH.read_text(encoding="utf-8")
+    has_sdk = "sdk: docker" in text
+    has_port = "app_port: 7860" in text
+    has_openenv_tag = "- openenv" in text
+    client = TestClient(app)
+    health = client.get("/healthz")
+    dashboard = client.get("/api/environment")
+    passed = all(
+        [
+            README_PATH.exists(),
+            DOCKERFILE_PATH.exists(),
+            has_sdk,
+            has_port,
+            has_openenv_tag,
+            health.status_code == 200,
+            dashboard.status_code == 200,
+        ]
+    )
+    return ValidationResult(
+        name="hf_space_readiness",
+        passed=passed,
+        details={
+            "readme_exists": README_PATH.exists(),
+            "dockerfile_exists": DOCKERFILE_PATH.exists(),
+            "has_sdk_docker": has_sdk,
+            "has_app_port": has_port,
+            "has_openenv_tag": has_openenv_tag,
+            "healthz_status": health.status_code,
+            "environment_status": dashboard.status_code,
+        },
+    )
+
+
+def check_openenv_spec_compliance() -> ValidationResult:
+    env = _build_environment()
+    obs = env.reset()
+    passed = all(
+        [
+            isinstance(env, Env),
+            hasattr(env, "reset"),
+            hasattr(env, "step"),
+            env.name == "OSINTEnvironment",
+            env.state_space == "json-observation",
+            env.action_space == ["CALL_TOOL", "ADD_EDGE", "ANSWER"],
+            env.episode_max_length == env.config.max_steps,
+            isinstance(obs.task, dict),
+            "question" in obs.task,
+        ]
+    )
+    return ValidationResult(
+        name="openenv_spec_compliance",
+        passed=passed,
+        details={
+            "env_class": type(env).__name__,
+            "state_space": env.state_space,
+            "action_space": list(env.action_space),
+            "episode_max_length": env.episode_max_length,
+            "task_keys": sorted(obs.task.keys()),
+        },
+    )
+
+
+class _FakeMessage:
+    def __init__(self, answer: str):
+        self.content = ""
+        self.tool_calls = [
+            SimpleNamespace(
+                id="fake_tool_call_0",
+                function=SimpleNamespace(name="submit_answer", arguments=json.dumps({"answer": answer})),
+            )
+        ]
+
+
+class _FakeCompletion:
+    def __init__(self, answer: str):
+        self.choices = [SimpleNamespace(message=_FakeMessage(answer))]
+        self.usage = SimpleNamespace(prompt_tokens=0, completion_tokens=0, total_tokens=0)
+        self.system_fingerprint = "validation_fp"
+
+
+class _FakeChatCompletions:
+    def create(self, **kwargs: Any) -> _FakeCompletion:
+        messages = list(kwargs.get("messages", []))
+        initial_observation = {}
+        for message in messages:
+            if message.get("role") == "user":
+                try:
+                    initial_observation = json.loads(message.get("content", "{}"))
+                except json.JSONDecodeError:
+                    initial_observation = {}
+                break
+        task_id = ((initial_observation.get("task") or {}).get("task_id")) or ""
+        env = _build_environment()
+        task = next((task for task in env.tasks if task.task_id == task_id), None)
+        answer = task.answer if task is not None else "unknown"
+        return _FakeCompletion(answer)
+
+
+class _FakeOpenAIClient:
+    def __init__(self) -> None:
+        self.chat = SimpleNamespace(completions=_FakeChatCompletions())
+
+
+def _run_fake_baseline_once(output_dir: Path) -> dict[str, Any]:
+    config = OpenAIBaselineConfig(
+        api_key="validation",
+        episodes=3,
+        max_steps=4,
+        append_leaderboard=False,
+        output_path=str(output_dir / "baseline.json"),
+        dashboard_path=str(output_dir / "baseline.html"),
+        leaderboard_path=str(output_dir / "leaderboard.json"),
+        run_name="validation_baseline",
+    )
+    runner = OpenAIBaselineRunner.__new__(OpenAIBaselineRunner)
+    runner.config = config
+    runner.client = _FakeOpenAIClient()
+    runner.tools = build_action_tools()
+    return runner.run()
+
+
+def check_baseline_reproducibility() -> ValidationResult:
+    with tempfile.TemporaryDirectory() as left_dir_name, tempfile.TemporaryDirectory() as right_dir_name:
+        left = _run_fake_baseline_once(Path(left_dir_name))
+        right = _run_fake_baseline_once(Path(right_dir_name))
+
+    left_signature = {
+        "summary": left["summary"],
+        "episodes": [
+            {
+                "task_id": episode["task_id"],
+                "task_answer": episode["task_answer"],
+                "agent_answer": episode["agent_answer"],
+                "success": episode["success"],
+                "steps": episode["steps"],
+            }
+            for episode in left["episodes"]
+        ],
+    }
+    right_signature = {
+        "summary": right["summary"],
+        "episodes": [
+            {
+                "task_id": episode["task_id"],
+                "task_answer": episode["task_answer"],
+                "agent_answer": episode["agent_answer"],
+                "success": episode["success"],
+                "steps": episode["steps"],
+            }
+            for episode in right["episodes"]
+        ],
+    }
+    passed = left_signature == right_signature
+    return ValidationResult(
+        name="baseline_reproducibility",
+        passed=passed,
+        details={
+            "episodes_checked": len(left_signature["episodes"]),
+            "left_signature": left_signature,
+            "right_signature": right_signature,
+        },
+    )
+
+
+def check_task_and_grader_coverage() -> ValidationResult:
+    env = _build_environment()
+    tasks = env.tasks
+    grader_checks: list[dict[str, Any]] = []
+    for task in tasks[:3]:
+        correct = compute_answer_reward(
+            proposed_answer=task.answer,
+            task=task,
+            pred_edges=list(task.supporting_edges),
+            tool_outputs=[],
+            step_count=1,
+            model=env.reward_model,
+        )
+        wrong = compute_answer_reward(
+            proposed_answer="unknown",
+            task=task,
+            pred_edges=[],
+            tool_outputs=[],
+            step_count=1,
+            model=env.reward_model,
+        )
+        grader_checks.append(
+            {
+                "task_id": task.task_id,
+                "support_edges": len(task.supporting_edges),
+                "correct_reward": correct.total,
+                "wrong_reward": wrong.total,
+                "grader_prefers_correct": correct.total > wrong.total,
+            }
+        )
+    passed = len(tasks) >= 3 and all(row["support_edges"] > 0 and row["grader_prefers_correct"] for row in grader_checks)
+    return ValidationResult(
+        name="task_and_grader_coverage",
+        passed=passed,
+        details={
+            "task_count": len(tasks),
+            "grader_checks": grader_checks,
+        },
+    )
+
+
+def run_validation_suite() -> dict[str, Any]:
+    results = [
+        check_hf_space_readiness(),
+        check_openenv_spec_compliance(),
+        check_baseline_reproducibility(),
+        check_task_and_grader_coverage(),
+    ]
+    passed = all(result.passed for result in results)
+    return {
+        "passed": passed,
+        "checks": [asdict(result) for result in results],
+    }
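
New gates follow the same shape: a function returning a `ValidationResult`, appended to the list in `run_validation_suite`. A hypothetical example, assuming it lives in this module next to the existing checks (the seed-file gate below is invented for illustration):

```python
# Hypothetical extra gate (not in this commit): confirm the fixed-level seed
# file parses as JSON and is non-empty before the environment consumes it.
def check_seed_dataset_integrity() -> ValidationResult:
    seed_path = Path(SEED_FILE_PATH)
    try:
        payload = json.loads(seed_path.read_text(encoding="utf-8"))
        parse_ok = True
    except (OSError, json.JSONDecodeError):
        payload, parse_ok = {}, False
    return ValidationResult(
        name="seed_dataset_integrity",
        passed=parse_ok and bool(payload),
        details={"path": str(seed_path), "parse_ok": parse_ok},
    )
# ...then add check_seed_dataset_integrity() to the results list in
# run_validation_suite().
```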
tests/test_validation.py ADDED
@@ -0,0 +1,7 @@
+from osint_env.validation import run_validation_suite
+
+
+def test_validation_suite_passes_repo_gate():
+    result = run_validation_suite()
+    assert result["passed"] is True
+    assert len(result["checks"]) >= 4
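
The committed test only asserts the aggregate flag. A stricter variant (hypothetical, not in the commit) can name the failing gate directly, using the four check names defined in `src/osint_env/validation.py`:

```python
# Hypothetical stricter test: assert each expected gate exists and passes
# individually, so a failure reports the offending check and its details.
from osint_env.validation import run_validation_suite

EXPECTED_CHECKS = {
    "hf_space_readiness",
    "openenv_spec_compliance",
    "baseline_reproducibility",
    "task_and_grader_coverage",
}


def test_each_gate_passes_individually():
    result = run_validation_suite()
    by_name = {check["name"]: check for check in result["checks"]}
    assert EXPECTED_CHECKS <= set(by_name)
    for name in sorted(EXPECTED_CHECKS):
        assert by_name[name]["passed"], by_name[name]["details"]
```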