trretretret committed on
Commit
7e782aa
·
1 Parent(s): e782518

Deploy ML pipeline debugging environment to HF Spaces

Browse files
Files changed (15) hide show
  1. .env.example +3 -0
  2. .gitignore +40 -0
  3. ARCHITECTURE.md +85 -0
  4. Dockerfile +11 -0
  5. README.md +281 -7
  6. app.py +134 -0
  7. artifact_generator.py +960 -0
  8. client.py +79 -0
  9. inference.py +600 -0
  10. mlops_environment.py +461 -0
  11. models.py +173 -0
  12. openenv.yaml +55 -0
  13. openenv_state.py +34 -0
  14. requirements.txt +8 -0
  15. validate-submission.sh +100 -0
.env.example ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ GEMINI_API_KEY=your_gemini_api_key_here
2
+ MODEL_NAME=gemini-2.0-flash
3
+ ENV_BASE_URL=http://localhost:7860
.gitignore ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ *.egg-info/
7
+ dist/
8
+ build/
9
+ .venv/
10
+ venv/
11
+ venv2/
12
+
13
+ # Testing
14
+ .pytest_cache/
15
+ .mypy_cache/
16
+
17
+ # IDEs
18
+ .vscode/
19
+ .idea/
20
+ *.swp
21
+ *.swo
22
+
23
+ # Logs
24
+ *.log
25
+ logs/
26
+
27
+ # OS
28
+ .DS_Store
29
+ Thumbs.db
30
+
31
+ # Environment
32
+ .env
33
+ .env.local
34
+
35
+ # Checkpoints
36
+ checkpoints/
37
+
38
+ # Temporary files
39
+ *.tmp
40
+ temp/
ARCHITECTURE.md ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Backend Architecture
2
+
3
+ ## Project Structure
4
+
5
+ ```
6
+ MLops-Openenvhack/
7
+ ├── app.py # FastAPI server - main entry point
8
+ ├── inference.py # Baseline LLM agent for evaluation
9
+ ├── models.py # Pydantic models (Action, Observation, State)
10
+ ├── mlops_environment.py # Core environment logic
11
+ ├── artifact_generator.py # Procedural bug/artifact generation
12
+ ├── client.py # Python client library
13
+ ├── openenv.yaml # OpenEnv specification
14
+ ├── Dockerfile # Container configuration
15
+ ├── requirements.txt # Python dependencies
16
+ └── README.md # Documentation
17
+ ```
18
+
19
+ ## How It Works
20
+
21
+ ### 1. Server (app.py)
22
+ - Runs FastAPI on port 7860
23
+ - Provides REST endpoints:
24
+ - `GET /health` - Health check
25
+ - `POST /reset` - Initialize new task
26
+ - `POST /step` - Execute action
27
+ - `GET /state` - Get current state
28
+ - `GET /tasks` - List available tasks
29
+ - `GET /openenv/state` - OpenEnv state
30
+
31
+ ### 2. Environment (mlops_environment.py)
32
+ - Manages task state
33
+ - Processes actions through `_handle_*` methods
34
+ - Generates rewards based on agent behavior
35
+ - Tracks artifacts read and sanity checks
36
+
37
+ ### 3. Artifact Generator (artifact_generator.py)
38
+ - Procedurally generates training artifacts with planted bugs
39
+ - Creates realistic: logs, configs, preprocessing code, eval results
40
+ - Supports 9 bug types across 3 difficulty levels
41
+
42
+ ### 4. Inference Agent (inference.py)
43
+ - LLM-powered agent using OpenAI API
44
+ - Reads artifacts, runs sanity checks
45
+ - Submits diagnosis with confidence scoring
46
+ - Implements rate limiting and fallback
47
+
48
+ ## API Flow
49
+
50
+ ```
51
+ Client -> app.py (FastAPI)
52
+ |
53
+ +-> mlops_environment.py (core logic)
54
+ |
55
+ +-> artifact_generator.py (bug generation)
56
+ |
57
+ +-> models.py (data validation)
58
+ |
59
+ +-> Returns Observation, Reward, Done, Info
60
+ ```
61
+
62
+ ## Task Flow
63
+
64
+ ```
65
+ 1. Client POST /reset with task_id (easy/medium/hard)
66
+ 2. Environment generates artifacts with planted bug
67
+ 3. Client POST /step with action
68
+ 4. Environment processes action, returns observation
69
+ 5. Agent investigates until diagnosis submitted
70
+ 6. Grader scores against planted bug (0.0 - 1.0)
71
+ ```
72
+
73
+ ## Data Models
74
+
75
+ ### Action Types
76
+ - read_config, read_logs, check_dataset_stats
77
+ - inspect_preprocessing, read_eval_results
78
+ - run_sanity_check, query_artifact
79
+ - submit_diagnosis
80
+
81
+ ### Reward Structure
82
+ - +0.02 per new artifact read
83
+ - -0.02 per duplicate read
84
+ - +0.01 per new sanity check
85
+ - Terminal: +0.15 category + 0.25 file + 0.30 field + 0.30 fix
Dockerfile ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+ RUN useradd -m -u 1000 appuser
3
+ WORKDIR /app
4
+ COPY requirements.txt .
5
+ RUN pip install --no-cache-dir -r requirements.txt
6
+ COPY models.py client.py inference.py openenv.yaml ./
7
+ COPY app.py mlops_environment.py artifact_generator.py ./
8
+ EXPOSE 7860
9
+ ENV PYTHONPATH=/app
10
+ USER appuser
11
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
README.md CHANGED
@@ -1,10 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
- title: Mlops Openenv
3
- emoji: 🌍
4
- colorFrom: yellow
5
- colorTo: pink
6
- sdk: docker
7
- pinned: false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MLOps Pipeline Debugger
2
+
3
+ [![OpenEnv](https://img.shields.io/badge/OpenEnv-1.0.0-blue)](https://github.com/meta-pytorch/OpenEnv)
4
+ [![Python 3.11](https://img.shields.io/badge/python-3.11-green)](https://www.python.org)
5
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow)](LICENSE)
6
+
7
+ ## Latest Baseline Scores
8
+
9
+ | Task | Score |
10
+ |------|-------|
11
+ | Easy | 0.91 |
12
+ | Medium | 0.85 |
13
+ | Hard | 1.00 |
14
+ | **Average** | **0.92** |
15
+
16
+ *Tested with Gemini 2.5 Flash + Gemini 3.1 Pro Preview fallback for hard task*
17
+
18
+ An **OpenEnv-compatible reinforcement learning environment** where an AI agent acts as a senior ML engineer diagnosing a broken training run.
19
+
20
+ ---
21
+
22
+ ## What Is This?
23
+
24
+ Every ML team has experienced it: a training job finishes overnight and something is wrong. Loss exploded to NaN. Validation accuracy is suspiciously perfect at epoch 1. Test performance is catastrophically below validation with no error thrown. An engineer must systematically investigate — reading logs, checking configs, inspecting preprocessing code, running sanity checks — to find the root cause.
25
+
26
+ This environment simulates that investigation. At `reset()`, a complete set of realistic training artifacts is **procedurally generated** with one planted fault. The agent investigates using 8 targeted actions and submits a structured diagnosis. The grader checks against the planted ground truth — **fully deterministic, no LLM judge needed**.
27
+
28
+ **9 distinct bug types across 3 tasks. Every episode can have a different bug. Scores vary continuously 0.0 → 1.0 based on diagnosis precision.**
29
+
30
+ ---
31
+
32
+ ## Environment Design
33
+
34
+ ### Procedural Artifact Generation
35
+
36
+ Every episode generates 6 realistic training artifacts from scratch:
37
+
38
+ | Artifact | Contents |
39
+ |---|---|
40
+ | `config.yaml` | Model arch, optimizer, LR, batch size, scheduler, augmentation |
41
+ | `train.log` | Epoch-by-epoch loss/accuracy/gradient norms with realistic timestamps |
42
+ | `dataset_stats.json` | Split sizes, class distribution, overlap counts, feature statistics |
43
+ | `preprocessing.py` | Full sklearn/PyTorch preprocessing pipeline code |
44
+ | `eval_results.json` | Final val/test metrics with hardware info |
45
+ | `model_card.json` | Architecture summary, tokenizer version, preprocessing config |
46
+
47
+ Artifacts are **internally consistent** — config matches logs, dataset stats match preprocessing code — except for the one planted fault. A real ML engineer would need to read multiple artifacts and correlate signals to locate it.
48
+
49
+ ---
50
+
51
+ ## Action Space
52
+
53
+ ```python
54
+ class MLOpsAction(BaseModel):
55
+ action_type: Literal[
56
+ "read_config", # Full config.yaml
57
+ "read_logs", # Training logs (filterable: keyword or "epoch:N-M")
58
+ "check_dataset_stats", # Split sizes, class distribution, overlap counts
59
+ "inspect_preprocessing",# Full preprocessing pipeline code
60
+ "read_eval_results", # Final val/test metrics
61
+ "run_sanity_check", # Computed diagnostic (see types below)
62
+ "query_artifact", # Specific field from any artifact (dot notation)
63
+ "submit_diagnosis", # Final answer — triggers grading
64
+ ]
65
+
66
+ # Sanity check types:
67
+ # label_consistency | data_leakage | gradient_norms | class_balance
68
+ # feature_statistics | encoder_version_match | loss_trajectory | metric_gap_analysis
69
+
70
+ # submit_diagnosis fields:
71
+ # failure_category | root_cause_file | root_cause_field | diagnosis | proposed_fix
72
+ ```
73
+
74
  ---
75
+
76
+ ## Observation Space
77
+
78
+ ```python
79
+ class MLOpsObservation(BaseModel):
80
+ task_id: str # easy | medium | hard
81
+ task_description: str # Full task brief with investigation strategy
82
+ run_id: str # Unique run identifier
83
+ run_summary: Dict[str, Any] # Model, dataset, training status
84
+ available_artifacts: List[ArtifactMeta] # What can be read
85
+ artifacts_read: List[str] # Investigation progress
86
+ last_action_result: Dict[str, Any] # Full content of last action
87
+ step_count: int
88
+ max_steps: int
89
+ done: bool
90
+ messages: List[str] # System warnings (duplicate reads, etc.)
91
+ ```
92
+
93
+ ---
94
+
95
+ ## Tasks
96
+
97
+ ### Task 1 — Config Error Diagnosis `(easy)`
98
+
99
+ **Bug pool (one picked randomly per episode):**
100
+ - `exploding_lr` — `learning_rate: 50.0` causes loss → NaN by epoch 3
101
+ - `wrong_optimizer` — `SGD(momentum=0.99)` causes oscillation with no convergence
102
+ - `batch_size_overflow` — `batch_size: 4096` exceeds dataset size, val accuracy 99.9% trivially
103
+
104
+ **Signal:** Visible immediately in training logs. Loss curve or accuracy values are obviously wrong.
105
+
106
+ **Optimal strategy:** `read_logs` → `run_sanity_check(loss_trajectory)` → `read_config` → `submit_diagnosis`
107
+
108
+ Max steps: **20** | Expected baseline score: ~0.42
109
+
110
  ---
111
 
112
+ ### Task 2 — Data Leakage Detection `(medium)`
113
+
114
+ **Bug pool:**
115
+ - `data_leakage_scaler` — `StandardScaler.fit_transform(X_full)` called before train/val split
116
+ - `data_leakage_overlap` — `train_test_split(random_state=None)` produces non-deterministic overlapping splits
117
+ - `wrong_split_ratio` — `test_size=0.8` trains on 20% and evaluates on 80% (inverted)
118
+
119
+ **Signal:** Val accuracy suspiciously high from epoch 1 in logs; val/test gap in eval results; sample overlap count in dataset stats.
120
+
121
+ **Optimal strategy:** `read_logs` → `read_eval_results` → `run_sanity_check(data_leakage)` → `inspect_preprocessing` → `submit_diagnosis`
122
+
123
+ Max steps: **30** | Expected baseline score: ~0.28
124
+
125
+ ---
126
+
127
+ ### Task 3 — Silent Evaluation Bug `(hard)`
128
+
129
+ **Bug pool:**
130
+ - `label_encoder_mismatch` — Train/eval use different `LabelEncoder.fit()` orderings → silent wrong predictions
131
+ - `silent_metric_swap` — `val_accuracy` and `test_accuracy` assignments are swapped in eval code
132
+ - `tokenizer_version_drift` — Training uses tokenizer v2, eval uses v1 → 847 tokens map to `[UNK]`
133
+
134
+ **Signal:** Training logs look completely normal. Only the val/test metric gap in eval results is suspicious — no errors, no warnings, no exceptions.
135
+
136
+ **Asymmetric penalty:** Missing a silent evaluation bug (which would affect production predictions) is penalized 1.5× — mirroring real incident severity weighting.
137
+
138
+ **Optimal strategy:** `read_eval_results` → `run_sanity_check(metric_gap_analysis)` → `inspect_preprocessing` → `run_sanity_check(label_consistency OR encoder_version_match)` → `submit_diagnosis`
139
+
140
+ Max steps: **40** | Expected baseline score: ~0.15
141
+
142
+ ---
143
+
144
+ ## Reward Function
145
+
146
+ **Dense per-step rewards** (not sparse):
147
+
148
+ ```
149
+ +0.02 First time reading an artifact (rewards systematic exploration)
150
+ -0.02 Reading same artifact with same filter again (penalizes brute force)
151
+ +0.01 Running a new sanity check (rewards diagnostic reasoning)
152
+
153
+ At submit_diagnosis:
154
+ +0.15 Correct failure_category (config_error / data_leakage / evaluation_bug / ...)
155
+ +0.25 Correct root_cause_file (exact match)
156
+ +0.30 Correct root_cause_field (substring match, case-insensitive)
157
+ +0.30 Correct proposed_fix (keyword overlap with gold fix)
158
+
159
+ Task 3 modifier: if score < 0.70, additional 0.5× penalty on missed components
160
+ ```
161
+
162
+ **Score spectrum** (verified):
163
+ ```
164
+ All wrong → 0.00
165
+ Category only → 0.10–0.15
166
+ Category + file → 0.35–0.40
167
+ Category + file + field → 0.65
168
+ Perfect diagnosis → 0.90–1.00
169
+ ```
170
+
171
+ ---
172
+
173
+ ## Setup & Usage
174
+
175
+ ### Docker (recommended)
176
+
177
+ ```bash
178
+ docker build -t mlops-debug-env .
179
+ docker run -p 7860:7860 mlops-debug-env
180
+ curl http://localhost:7860/health
181
+ ```
182
+
183
+ ### Local Python
184
+
185
+ ```bash
186
+ pip install -r requirements.txt
187
+ uvicorn app:app --host 0.0.0.0 --port 7860
188
+ ```
189
+
190
+ ### Python Client
191
+
192
+ ```python
193
+ # Sync usage
194
+ from client import MLOpsDebugEnv
195
+ from models import MLOpsAction
196
+
197
+ with MLOpsDebugEnv(base_url="http://localhost:7860").sync() as env:
198
+ obs = env.reset(task_id="hard", seed=1)
199
+ print(obs.task_description)
200
+
201
+ # Investigate systematically
202
+ r = env.step(MLOpsAction(action_type="read_eval_results"))
203
+ print(r.observation.last_action_result["content"])
204
+
205
+ r = env.step(MLOpsAction(
206
+ action_type="run_sanity_check",
207
+ sanity_check_type="metric_gap_analysis"
208
+ ))
209
+ # Reveals val/test gap anomaly
210
+
211
+ r = env.step(MLOpsAction(action_type="inspect_preprocessing"))
212
+ # Shows the buggy pipeline code
213
+
214
+ r = env.step(MLOpsAction(
215
+ action_type="submit_diagnosis",
216
+ failure_category="label_mismatch",
217
+ root_cause_file="preprocessing.py",
218
+ root_cause_field="LabelEncoder.fit_order",
219
+ diagnosis="Train and eval use different LabelEncoder orderings",
220
+ proposed_fix="Use single LabelEncoder instance across both pipelines"
221
+ ))
222
+ print(f"Score: {r.info['score']}")
223
+ ```
224
+
225
+ ---
226
+
227
+ ## Baseline Inference Script
228
+
229
+ ```bash
230
+ export API_BASE_URL="https://router.huggingface.co/v1"
231
+ export MODEL_NAME="Qwen/Qwen2.5-72B-Instruct"
232
+ export HF_TOKEN="hf_your_token_here"
233
+ export ENV_BASE_URL="http://localhost:7860"
234
+
235
+ python inference.py # all 3 tasks, seed=42
236
+ python inference.py --task easy --seed 42
237
+ ```
238
+
239
+ **Output format:**
240
+ ```
241
+ [START] task=easy env=mlops-debug-env model=Qwen/Qwen2.5-72B-Instruct
242
+ [STEP] step=1 action=read_logs reward=0.02 done=false error=null
243
+ [STEP] step=2 action=run_sanity_check reward=0.01 done=false error=null
244
+ [STEP] step=3 action=read_config reward=0.02 done=false error=null
245
+ [STEP] step=4 action=submit_diagnosis reward=0.95 done=true error=null
246
+ [END] success=true steps=4 rewards=0.02,0.01,0.02,0.95
247
+ ```
248
+
249
+ **Baseline scores** (Qwen2.5-72B-Instruct, seed=42):
250
+
251
+ | Task | Score | Notes |
252
+ |---|---|---|
253
+ | easy | ~0.42 | Gets category right, struggles with exact field name |
254
+ | medium | ~0.28 | Often identifies leakage but misidentifies exact mechanism |
255
+ | hard | ~0.15 | Silent bugs with normal training logs are genuinely hard |
256
+
257
+ ---
258
+
259
+ ## Why This Environment
260
+
261
+ **Real problem.** For every ML team at every company, debugging broken training runs is a core workflow. The three bug categories in this environment — config errors, data leakage, silent evaluation bugs — are the actual top-3 failure modes in production ML pipelines.
262
+
263
+ **Deterministic grading.** The planted bug is ground truth. Diagnosis matching is substring/keyword matching against known-correct answers. Zero subjectivity, zero LLM-as-judge, reproducible across runs.
264
+
265
+ **Genuinely hard for frontier models.** Task 3 (silent evaluation bugs) requires reasoning about what's *absent* — no error signals, normal training logs — and tracing backwards from a metric anomaly to a pipeline version mismatch. State-of-the-art models score ~0.15 without careful prompting.
266
+
267
+ **Seed-based reproducibility.** `reset(seed=42)` always produces the same bug, same artifacts, same grading. Baseline scores are reproducible to 4 decimal places.
268
+
269
+ ---
270
+
271
+ ## Environment Variables
272
+
273
+ | Variable | Description |
274
+ |---|---|
275
+ | `API_BASE_URL` | LLM API endpoint (OpenAI-compatible) |
276
+ | `MODEL_NAME` | Model identifier |
277
+ | `HF_TOKEN` | Hugging Face / API token |
278
+ | `ENV_BASE_URL` | Environment server URL (default: `http://localhost:7860`) |
279
+
280
+ ---
281
+
282
+ ## License
283
+
284
+ MIT — see LICENSE
app.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import json
3
+ from typing import Any, Dict, Optional
4
+ from fastapi import FastAPI, HTTPException, WebSocket, WebSocketDisconnect
5
+ from openenv_state import OPENENV_STATE, OpenEnvState
6
+ from fastapi.middleware.cors import CORSMiddleware
7
+ from pydantic import BaseModel
8
+
9
+ from models import MLOpsAction, MLOpsObservation, MLOpsState
10
+ from mlops_environment import MLOpsEnvironment
11
+
12
+ app = FastAPI(
13
+ title="MLOps Pipeline Debugger",
14
+ description="OpenEnv environment: AI agent diagnoses broken ML training runs.",
15
+ version="1.0.0",
16
+ )
17
+ app.add_middleware(
18
+ CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"]
19
+ )
20
+
21
+ _http_env: Optional[MLOpsEnvironment] = None
22
+
23
+
24
+ class ResetRequest(BaseModel):
25
+ task_id: str = "easy"
26
+ seed: Optional[int] = None
27
+
28
+
29
+ class StepResponse(BaseModel):
30
+ observation: MLOpsObservation
31
+ reward: float
32
+ done: bool
33
+ info: Dict[str, Any]
34
+
35
+
36
+ @app.get("/health")
37
+ async def health():
38
+ return {"status": "ok", "environment": "mlops_debug_env", "version": "1.0.0"}
39
+
40
+
41
+ @app.get("/openenv/state", response_model=OpenEnvState)
42
+ def openenv_state():
43
+ # Expose the current OpenEnv-like state persisted in memory/state.json
44
+ return OPENENV_STATE
45
+
46
+
47
+ @app.get("/tasks")
48
+ async def list_tasks():
49
+ return {
50
+ "tasks": [
51
+ {
52
+ "task_id": "easy",
53
+ "name": "Config Error Diagnosis",
54
+ "difficulty": "easy",
55
+ "max_steps": 20,
56
+ },
57
+ {
58
+ "task_id": "medium",
59
+ "name": "Data Leakage Detection",
60
+ "difficulty": "medium",
61
+ "max_steps": 30,
62
+ },
63
+ {
64
+ "task_id": "hard",
65
+ "name": "Silent Evaluation Bug",
66
+ "difficulty": "hard",
67
+ "max_steps": 40,
68
+ },
69
+ ]
70
+ }
71
+
72
+
73
+ @app.post("/reset", response_model=MLOpsObservation)
74
+ async def reset(req: ResetRequest):
75
+ global _http_env
76
+ _http_env = MLOpsEnvironment(task_id=req.task_id)
77
+ return _http_env.reset(seed=req.seed)
78
+
79
+
80
+ @app.post("/step", response_model=StepResponse)
81
+ async def step(action: MLOpsAction):
82
+ if _http_env is None:
83
+ raise HTTPException(400, "Call /reset first.")
84
+ obs, reward, done, info = _http_env.step(action)
85
+ return StepResponse(observation=obs, reward=reward, done=done, info=info)
86
+
87
+
88
+ @app.get("/state", response_model=MLOpsState)
89
+ async def state():
90
+ if _http_env is None:
91
+ raise HTTPException(400, "Call /reset first.")
92
+ return _http_env.state
93
+
94
+
95
+ @app.websocket("/ws")
96
+ async def ws_endpoint(websocket: WebSocket):
97
+ await websocket.accept()
98
+ env: Optional[MLOpsEnvironment] = None
99
+ try:
100
+ while True:
101
+ msg = json.loads(await websocket.receive_text())
102
+ method = msg.get("method")
103
+ if method == "reset":
104
+ env = MLOpsEnvironment(task_id=msg.get("task_id", "easy"))
105
+ obs = env.reset(seed=msg.get("seed"))
106
+ await websocket.send_text(
107
+ json.dumps({"method": "reset", "observation": obs.model_dump()})
108
+ )
109
+ elif method == "step":
110
+ if env is None:
111
+ await websocket.send_text(json.dumps({"error": "Call reset first"}))
112
+ continue
113
+ action = MLOpsAction(**msg.get("action", {}))
114
+ obs, reward, done, info = env.step(action)
115
+ await websocket.send_text(
116
+ json.dumps(
117
+ {
118
+ "method": "step",
119
+ "observation": obs.model_dump(),
120
+ "reward": reward,
121
+ "done": done,
122
+ "info": info,
123
+ }
124
+ )
125
+ )
126
+ elif method == "state":
127
+ if env is None:
128
+ await websocket.send_text(json.dumps({"error": "Call reset first"}))
129
+ continue
130
+ await websocket.send_text(
131
+ json.dumps({"method": "state", "state": env.state.model_dump()})
132
+ )
133
+ except WebSocketDisconnect:
134
+ pass
artifact_generator.py ADDED
@@ -0,0 +1,960 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Artifact Generator for MLOps Pipeline Debugger
3
+
4
+ Generates a full set of realistic ML training artifacts for a given bug scenario.
5
+ Each artifact is internally consistent — config matches logs, dataset stats match
6
+ preprocessing code — except for the one planted fault.
7
+
8
+ Bug types supported:
9
+ Task 1 (easy):
10
+ - exploding_lr : learning_rate too large → loss diverges to NaN
11
+ - wrong_optimizer : SGD with momentum=0.99 on non-convex problem
12
+ - batch_size_overflow: batch_size > dataset size → trivial overfitting signal
13
+
14
+ Task 2 (medium):
15
+ - data_leakage_scaler : StandardScaler fit on full dataset before split
16
+ - data_leakage_overlap : train/val split with random_state=None → overlap
17
+ - wrong_split_ratio : test_size=0.8 inverts the split (trains on 20%, evaluates on 80%)
18
+
19
+ Task 3 (hard):
20
+ - label_encoder_mismatch : train/eval use different LabelEncoder.fit() orderings
21
+ - silent_metric_swap : val and test metric names swapped in eval code
22
+ - tokenizer_version_drift: training uses tokenizer v2, eval uses v1 (different vocab)
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import json
28
+ import random
29
+ import textwrap
30
+ from dataclasses import dataclass, field
31
+ from typing import Dict, Tuple
32
+
33
+ import numpy as np
34
+
35
+
36
+ # ─── Bug Specifications ───────────────────────────────────────────────────────
37
+
38
+ @dataclass
39
+ class BugSpec:
40
+ bug_type: str
41
+ category: str # maps to failure_category in Action
42
+ file: str # root_cause_file
43
+ field: str # root_cause_field
44
+ gold_fix: str
45
+ task_difficulty: str # easy / medium / hard
46
+
47
+
48
+ BUG_CATALOGUE: Dict[str, BugSpec] = {
49
+ # ── EASY ──────────────────────────────────────────────────────────────────
50
+ "exploding_lr": BugSpec(
51
+ bug_type="exploding_lr",
52
+ category="config_error",
53
+ file="config.yaml",
54
+ field="optimizer.learning_rate",
55
+ gold_fix="Reduce learning_rate from 50.0 to 1e-4 (or use a scheduler with warmup)",
56
+ task_difficulty="easy",
57
+ ),
58
+ "wrong_optimizer": BugSpec(
59
+ bug_type="wrong_optimizer",
60
+ category="config_error",
61
+ file="config.yaml",
62
+ field="optimizer.momentum",
63
+ gold_fix="Reduce momentum from 0.99 to 0.9, or switch to AdamW optimizer",
64
+ task_difficulty="easy",
65
+ ),
66
+ "batch_size_overflow": BugSpec(
67
+ bug_type="batch_size_overflow",
68
+ category="config_error",
69
+ file="config.yaml",
70
+ field="training.batch_size",
71
+ gold_fix="Reduce batch_size from 4096 to 32 or 64; current size exceeds training set",
72
+ task_difficulty="easy",
73
+ ),
74
+
75
+ # ── MEDIUM ────────────────────────────────────────────────────────────────
76
+ "data_leakage_scaler": BugSpec(
77
+ bug_type="data_leakage_scaler",
78
+ category="data_leakage",
79
+ file="preprocessing.py",
80
+ field="StandardScaler.fit_transform",
81
+ gold_fix="Fit StandardScaler only on X_train, then call transform() on X_val and X_test separately",
82
+ task_difficulty="medium",
83
+ ),
84
+ "data_leakage_overlap": BugSpec(
85
+ bug_type="data_leakage_overlap",
86
+ category="data_leakage",
87
+ file="preprocessing.py",
88
+ field="train_test_split.random_state",
89
+ gold_fix="Set random_state=42 in train_test_split to ensure deterministic, non-overlapping splits",
90
+ task_difficulty="medium",
91
+ ),
92
+ "wrong_split_ratio": BugSpec(
93
+ bug_type="wrong_split_ratio",
94
+ category="preprocessing_bug",
95
+ file="preprocessing.py",
96
+ field="train_test_split.test_size",
97
+ gold_fix="Change test_size from 0.8 to 0.2 — current config trains on 20% and evaluates on 80%",
98
+ task_difficulty="medium",
99
+ ),
100
+
101
+ # ── HARD ──────────────────────────────────────────────────────────────────
102
+ "label_encoder_mismatch": BugSpec(
103
+ bug_type="label_encoder_mismatch",
104
+ category="label_mismatch",
105
+ file="preprocessing.py",
106
+ field="LabelEncoder.fit_order",
107
+ gold_fix="Use the same LabelEncoder instance (fitted on training data) for both train and eval pipelines",
108
+ task_difficulty="hard",
109
+ ),
110
+ "silent_metric_swap": BugSpec(
111
+ bug_type="silent_metric_swap",
112
+ category="evaluation_bug",
113
+ file="eval_results.json",
114
+ field="metrics.val_accuracy",
115
+ gold_fix="Swap val_accuracy and test_accuracy assignments in the evaluation loop — metrics are mislabeled",
116
+ task_difficulty="hard",
117
+ ),
118
+ "tokenizer_version_drift": BugSpec(
119
+ bug_type="tokenizer_version_drift",
120
+ category="evaluation_bug",
121
+ file="preprocessing.py",
122
+ field="tokenizer.version",
123
+ gold_fix="Ensure training and evaluation both use tokenizer v2 — v1 has a different vocabulary mapping for 847 tokens",
124
+ task_difficulty="hard",
125
+ ),
126
+ }
127
+
128
+ TASK_BUG_POOLS = {
129
+ "easy": ["exploding_lr", "wrong_optimizer", "batch_size_overflow"],
130
+ "medium": ["data_leakage_scaler", "data_leakage_overlap", "wrong_split_ratio"],
131
+ "hard": ["label_encoder_mismatch", "silent_metric_swap", "tokenizer_version_drift"],
132
+ }
133
+
134
+
135
+ # ─── Model / Dataset Configs (variety pool) ───────────────────────────────────
136
+
137
+ MODEL_CONFIGS = [
138
+ {"name": "ResNet-50", "type": "image_classification", "params": "25.6M",
139
+ "dataset": "ImageNet-subset-10k", "num_classes": 10, "input": "224x224 RGB"},
140
+ {"name": "BERT-base-uncased", "type": "text_classification", "params": "110M",
141
+ "dataset": "SST-2", "num_classes": 2, "input": "tokenized sequences, max_len=128"},
142
+ {"name": "EfficientNet-B3", "type": "image_classification", "params": "12.2M",
143
+ "dataset": "CIFAR-100", "num_classes": 100, "input": "300x300 RGB"},
144
+ {"name": "DistilBERT", "type": "sentiment_analysis", "params": "66M",
145
+ "dataset": "IMDB-reviews", "num_classes": 3, "input": "tokenized sequences, max_len=256"},
146
+ {"name": "MobileNetV3-Large", "type": "image_classification", "params": "5.4M",
147
+ "dataset": "Oxford-102-Flowers", "num_classes": 102, "input": "224x224 RGB"},
148
+ ]
149
+
150
+ OPTIMIZERS = ["AdamW", "SGD", "RMSprop", "Adam"]
151
+ SCHEDULERS = ["cosine_annealing", "step_lr", "reduce_on_plateau", "linear_warmup"]
152
+
153
+
154
+ # ─── Artifact Generators ──────────────────────────────────────────────────────
155
+
156
+ class ArtifactGenerator:
157
+ """
158
+ Generates all 6 training artifacts for a given (bug_type, seed) pair.
159
+ All artifacts are internally consistent except for the planted fault.
160
+ """
161
+
162
+ def __init__(self, bug_type: str, seed: int):
163
+ self.bug = BUG_CATALOGUE[bug_type]
164
+ self.seed = seed
165
+ self.rng = random.Random(seed)
166
+ self.np_rng = np.random.RandomState(seed)
167
+
168
+ # Pick a model config deterministically
169
+ self.model_cfg = self.rng.choice(MODEL_CONFIGS)
170
+ self.optimizer_name = self.rng.choice(OPTIMIZERS)
171
+ self.scheduler_name = self.rng.choice(SCHEDULERS)
172
+ self.run_id = f"run_{seed:04d}_{bug_type[:6]}"
173
+
174
+ # Normal hyperparams
175
+ self.lr = self.rng.choice([1e-5, 3e-5, 1e-4, 3e-4])
176
+ self.batch_size = self.rng.choice([16, 32, 64])
177
+ self.epochs = self.rng.randint(8, 20)
178
+ self.weight_decay = self.rng.choice([0.01, 0.001, 1e-4])
179
+ self.momentum = 0.9
180
+ self.train_samples = self.rng.randint(8000, 15000)
181
+ self.val_samples = int(self.train_samples * 0.2)
182
+ self.test_samples = int(self.train_samples * 0.15)
183
+
184
+ def generate_all(self) -> Dict[str, str]:
185
+ return {
186
+ "config.yaml": self._gen_config(),
187
+ "train.log": self._gen_train_log(),
188
+ "dataset_stats.json": self._gen_dataset_stats(),
189
+ "preprocessing.py": self._gen_preprocessing(),
190
+ "eval_results.json": self._gen_eval_results(),
191
+ "model_card.json": self._gen_model_card(),
192
+ }
193
+
194
+ # ── config.yaml ──────────────────────────────────────────────────────────
195
+
196
+ def _gen_config(self) -> str:
197
+ lr = self.lr
198
+ batch_size = self.batch_size
199
+ momentum = self.momentum
200
+
201
+ if self.bug.bug_type == "exploding_lr":
202
+ lr = self.rng.choice([50.0, 10.0, 25.0])
203
+ elif self.bug.bug_type == "wrong_optimizer":
204
+ momentum = 0.99
205
+ self.optimizer_name = "SGD"
206
+ elif self.bug.bug_type == "batch_size_overflow":
207
+ batch_size = self.rng.choice([2048, 4096, 8192])
208
+
209
+ return textwrap.dedent(f"""\
210
+ # Training Configuration
211
+ # Run ID: {self.run_id}
212
+ # Generated: 2024-03-{self.rng.randint(1,28):02d}T{self.rng.randint(0,23):02d}:{self.rng.randint(0,59):02d}:00Z
213
+
214
+ model:
215
+ architecture: {self.model_cfg['name']}
216
+ num_classes: {self.model_cfg['num_classes']}
217
+ pretrained: true
218
+ pretrained_source: "timm/torchvision"
219
+ dropout: {self.rng.choice([0.1, 0.2, 0.3])}
220
+ freeze_backbone_epochs: {self.rng.randint(0, 3)}
221
+
222
+ training:
223
+ epochs: {self.epochs}
224
+ batch_size: {batch_size}
225
+ num_workers: {self.rng.choice([4, 8])}
226
+ pin_memory: true
227
+ mixed_precision: {str(self.rng.choice([True, False])).lower()}
228
+ gradient_clip_norm: {self.rng.choice([1.0, 5.0, "null"])}
229
+ early_stopping_patience: {self.rng.randint(3, 7)}
230
+ seed: {self.seed}
231
+
232
+ optimizer:
233
+ name: {self.optimizer_name}
234
+ learning_rate: {lr}
235
+ weight_decay: {self.weight_decay}
236
+ momentum: {momentum}
237
+ betas: [0.9, 0.999]
238
+
239
+ scheduler:
240
+ name: {self.scheduler_name}
241
+ warmup_epochs: {self.rng.randint(0, 3)}
242
+ min_lr: 1.0e-7
243
+ t_max: {self.epochs}
244
+
245
+ data:
246
+ dataset: {self.model_cfg['dataset']}
247
+ input_size: "{self.model_cfg['input']}"
248
+ train_split: 0.8
249
+ val_split: 0.1
250
+ test_split: 0.1
251
+ augmentation:
252
+ random_crop: true
253
+ horizontal_flip: {str(self.rng.choice([True, False])).lower()}
254
+ color_jitter: {self.rng.choice([0.2, 0.4])}
255
+ normalize_mean: [0.485, 0.456, 0.406]
256
+ normalize_std: [0.229, 0.224, 0.225]
257
+
258
+ logging:
259
+ log_interval: 50
260
+ save_best_only: true
261
+ checkpoint_dir: "./checkpoints/{self.run_id}"
262
+ wandb_project: "mlops-debug-bench"
263
+ """)
264
+
265
+ # ── train.log ────────────────────────────────────────────────────────────
266
+
267
+ def _gen_train_log(self) -> str:
268
+ lines = []
269
+ lines.append(f"[INFO 2024-03-{self.rng.randint(1,28):02d} {self.rng.randint(6,10):02d}:00:00] Starting training run: {self.run_id}")
270
+ lines.append(f"[INFO ] Model: {self.model_cfg['name']} | Params: {self.model_cfg['params']}")
271
+ lines.append(f"[INFO ] Dataset: {self.model_cfg['dataset']} | Train: {self.train_samples:,} | Val: {self.val_samples:,}")
272
+ lines.append(f"[INFO ] Device: cuda:0 | Mixed precision: fp16")
273
+ lines.append(f"[INFO ] Optimizer: {self.optimizer_name} | LR: {self.lr} | Batch: {self.batch_size}")
274
+ lines.append("[INFO ] ─" * 30)
275
+
276
+ bug = self.bug.bug_type
277
+
278
+ if bug == "exploding_lr":
279
+ # Loss explodes rapidly
280
+ loss = 2.302
281
+ for ep in range(1, min(self.epochs + 1, 6)):
282
+ acc = max(0.0, 0.12 - ep * 0.02)
283
+ val_loss = loss * self.rng.uniform(1.1, 1.5)
284
+ val_acc = max(0.0, acc - 0.05)
285
+ lines.append(f"[EPOCH {ep:03d}] train_loss={loss:.4f} train_acc={acc:.4f} "
286
+ f"val_loss={val_loss:.4f} val_acc={val_acc:.4f} "
287
+ f"lr={self.lr:.2e} grad_norm={loss * 18.3:.2f} "
288
+ f"time={self.rng.randint(45,90)}s")
289
+ if ep == 1: lines.append(f"[WARN ] Gradient norm unusually high: {loss * 18.3:.2f} (threshold: 10.0)")
290
+ loss = loss * self.rng.uniform(4.5, 9.0)
291
+ if loss > 1e6:
292
+ lines.append(f"[EPOCH {ep+1:03d}] train_loss=nan train_acc=0.1000 val_loss=nan val_acc=0.1000 "
293
+ f"lr={self.lr:.2e} grad_norm=nan time={self.rng.randint(45,90)}s")
294
+ lines.append(f"[ERROR ] Loss is NaN at epoch {ep+1}, step {self.rng.randint(100,300)}. Training halted.")
295
+ lines.append(f"[ERROR ] Last finite loss: {loss / self.rng.uniform(4,9):.2f}. Gradient explosion detected.")
296
+ break
297
+
298
+ elif bug == "wrong_optimizer":
299
+ # Loss oscillates wildly, never converges
300
+ loss = 2.302
301
+ for ep in range(1, self.epochs + 1):
302
+ delta = self.rng.uniform(-0.8, 1.2)
303
+ loss = max(1.8, loss + delta)
304
+ acc = self.rng.uniform(0.10, 0.25)
305
+ val_loss = loss + self.rng.uniform(-0.3, 0.8)
306
+ val_acc = self.rng.uniform(0.09, 0.22)
307
+ lines.append(f"[EPOCH {ep:03d}] train_loss={loss:.4f} train_acc={acc:.4f} "
308
+ f"val_loss={val_loss:.4f} val_acc={val_acc:.4f} "
309
+ f"lr={self.lr:.2e} grad_norm={self.rng.uniform(8.2, 45.1):.2f} "
310
+ f"time={self.rng.randint(45,90)}s")
311
+ if ep % 3 == 0:
312
+ lines.append(f"[WARN ] Loss oscillation detected over last 3 epochs: {loss+0.4:.3f} → {loss-0.5:.3f} → {loss:.3f}")
313
+
314
+ elif bug == "batch_size_overflow":
315
+ # Val accuracy hits 100% immediately — model memorizes tiny effective dataset
316
+ for ep in range(1, self.epochs + 1):
317
+ train_loss = max(0.001, 2.302 * (0.05 ** ep))
318
+ train_acc = min(1.0, 0.3 + ep * 0.09)
319
+ val_acc = 0.999 if ep >= 2 else 0.85
320
+ val_loss = 0.001 if ep >= 2 else 0.45
321
+ lines.append(f"[EPOCH {ep:03d}] train_loss={train_loss:.4f} train_acc={train_acc:.4f} "
322
+ f"val_loss={val_loss:.4f} val_acc={val_acc:.4f} "
323
+ f"lr={self.lr:.2e} grad_norm={self.rng.uniform(0.1,0.9):.3f} "
324
+ f"time={self.rng.randint(3,8)}s")
325
+ lines.append(f"[WARN ] Effective steps per epoch: {max(1, self.train_samples // 4096)}. Dataset may be smaller than batch size.")
326
+
327
+ elif bug in ("data_leakage_scaler", "data_leakage_overlap", "wrong_split_ratio"):
328
+ # Val accuracy suspiciously high from epoch 1
329
+ for ep in range(1, self.epochs + 1):
330
+ train_loss = max(0.01, 0.45 - ep * 0.02)
331
+ train_acc = min(0.98, 0.72 + ep * 0.015)
332
+ val_acc = min(0.999, 0.984 + self.rng.uniform(-0.002, 0.002)) if ep >= 1 else 0.71
333
+ val_loss = max(0.001, 0.04 - ep * 0.001)
334
+ lines.append(f"[EPOCH {ep:03d}] train_loss={train_loss:.4f} train_acc={train_acc:.4f} "
335
+ f"val_loss={val_loss:.4f} val_acc={val_acc:.4f} "
336
+ f"lr={self.lr:.2e} grad_norm={self.rng.uniform(0.1,1.2):.3f} "
337
+ f"time={self.rng.randint(45,90)}s")
338
+ lines.append(f"[INFO ] Best model saved at epoch 2: val_acc=0.9841")
339
+ lines.append(f"[WARN ] Val accuracy reached 98.4% at epoch 1 — verify no data leakage.")
340
+
341
+ elif bug in ("label_encoder_mismatch", "silent_metric_swap", "tokenizer_version_drift"):
342
+ # Training looks completely normal — the bug is silent
343
+ best_val = 0.0
344
+ for ep in range(1, self.epochs + 1):
345
+ train_loss = max(0.08, 1.8 * (0.72 ** ep) + self.rng.uniform(-0.02, 0.02))
346
+ train_acc = min(0.96, 0.42 + ep * 0.032 + self.rng.uniform(-0.01, 0.01))
347
+ val_loss = train_loss * self.rng.uniform(1.05, 1.15)
348
+ val_acc = train_acc - self.rng.uniform(0.02, 0.06)
349
+ best_val = max(best_val, val_acc)
350
+ lines.append(f"[EPOCH {ep:03d}] train_loss={train_loss:.4f} train_acc={train_acc:.4f} "
351
+ f"val_loss={val_loss:.4f} val_acc={val_acc:.4f} "
352
+ f"lr={self.lr:.2e} grad_norm={self.rng.uniform(0.3, 2.1):.3f} "
353
+ f"time={self.rng.randint(60,120)}s")
354
+ lines.append(f"[INFO ] Training complete. Best val_acc={best_val:.4f} at epoch {self.rng.randint(self.epochs-3, self.epochs)}")
355
+ lines.append(f"[INFO ] Checkpoint saved: ./checkpoints/{self.run_id}/best_model.pt")
356
+
357
+ lines.append("[INFO ] ─" * 30)
358
+ lines.append(f"[INFO ] Run {self.run_id} finished.")
359
+ return "\n".join(lines)
360
+
361
+ # ── dataset_stats.json ───────────────────────────────────────────────────
362
+
363
+ def _gen_dataset_stats(self) -> str:
364
+ n_classes = self.model_cfg["num_classes"]
365
+ train_n = self.train_samples
366
+ val_n = self.val_samples
367
+ test_n = self.test_samples
368
+
369
+ overlap_count = 0
370
+ if self.bug.bug_type == "data_leakage_overlap":
371
+ overlap_count = self.rng.randint(int(val_n * 0.15), int(val_n * 0.30))
372
+ elif self.bug.bug_type == "wrong_split_ratio":
373
+ # Train and test flipped
374
+ train_n, test_n = test_n, train_n
375
+
376
+ # Class distribution (roughly uniform with jitter)
377
+ def class_dist(total, n_cls):
378
+ base = total // n_cls
379
+ counts = {str(i): base + self.rng.randint(-int(base*0.15), int(base*0.15))
380
+ for i in range(min(n_cls, 10))}
381
+ if n_cls > 10:
382
+ counts["..."] = f"{n_cls - 10} more classes"
383
+ return counts
384
+
385
+ stats = {
386
+ "dataset": self.model_cfg["dataset"],
387
+ "num_classes": n_classes,
388
+ "splits": {
389
+ "train": {
390
+ "n_samples": train_n,
391
+ "class_distribution": class_dist(train_n, n_classes),
392
+ },
393
+ "val": {
394
+ "n_samples": val_n,
395
+ "class_distribution": class_dist(val_n, n_classes),
396
+ "overlap_with_train": overlap_count,
397
+ },
398
+ "test": {
399
+ "n_samples": test_n,
400
+ "class_distribution": class_dist(test_n, n_classes),
401
+ },
402
+ },
403
+ "feature_statistics": {
404
+ "mean": round(self.np_rng.uniform(0.45, 0.55), 4),
405
+ "std": round(self.np_rng.uniform(0.22, 0.28), 4),
406
+ "min": 0.0,
407
+ "max": 1.0,
408
+ "null_count": 0,
409
+ },
410
+ "preprocessing_applied": [
411
+ "resize",
412
+ "normalize",
413
+ "label_encode",
414
+ "train_val_test_split",
415
+ ],
416
+ "random_seed_used": self.seed if self.bug.bug_type != "data_leakage_overlap" else None,
417
+ }
418
+ return json.dumps(stats, indent=2)
419
+
420
+ # ── preprocessing.py ─────────────────────────────────────────────────────
421
+
422
+ def _gen_preprocessing(self) -> str:
423
+ bug = self.bug.bug_type
424
+
425
+ if bug == "data_leakage_scaler":
426
+ return textwrap.dedent(f"""\
427
+ \"\"\"
428
+ Data preprocessing pipeline for {self.model_cfg['dataset']}
429
+ Run ID: {self.run_id}
430
+ \"\"\"
431
+ import numpy as np
432
+ import pandas as pd
433
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
434
+ from sklearn.model_selection import train_test_split
435
+
436
+
437
+ def load_raw_data(data_dir: str):
438
+ \"\"\"Load features and labels from disk.\"\"\"
439
+ X = np.load(f"{{data_dir}}/features.npy")
440
+ y = np.load(f"{{data_dir}}/labels.npy")
441
+ return X, y
442
+
443
+
444
+ def preprocess(data_dir: str, seed: int = {self.seed}):
445
+ X, y = load_raw_data(data_dir)
446
+
447
+ # Encode labels
448
+ le = LabelEncoder()
449
+ y_encoded = le.fit_transform(y)
450
+
451
+ # ── BUG: Scaler fit on full dataset BEFORE split ──────────
452
+ scaler = StandardScaler()
453
+ X_normalized = scaler.fit_transform(X) # sees val/test data during fit!
454
+ # ─────────────────────────────────────────────────────────
455
+
456
+ X_train, X_temp, y_train, y_temp = train_test_split(
457
+ X_normalized, y_encoded, test_size=0.2, random_state=seed
458
+ )
459
+ X_val, X_test, y_val, y_test = train_test_split(
460
+ X_temp, y_temp, test_size=0.5, random_state=seed
461
+ )
462
+
463
+ return (X_train, y_train), (X_val, y_val), (X_test, y_test), scaler, le
464
+
465
+
466
+ def get_transforms(split: str):
467
+ \"\"\"Get augmentation transforms for a given split.\"\"\"
468
+ if split == "train":
469
+ return [
470
+ ("random_horizontal_flip", {{"p": 0.5}}),
471
+ ("random_crop", {{"size": 224, "padding": 4}}),
472
+ ("color_jitter", {{"brightness": 0.2, "contrast": 0.2}}),
473
+ ("normalize", {{"mean": [0.485, 0.456, 0.406],
474
+ "std": [0.229, 0.224, 0.225]}}),
475
+ ]
476
+ return [
477
+ ("center_crop", {{"size": 224}}),
478
+ ("normalize", {{"mean": [0.485, 0.456, 0.406],
479
+ "std": [0.229, 0.224, 0.225]}}),
480
+ ]
481
+ """)
482
+
483
+ elif bug == "data_leakage_overlap":
484
+ return textwrap.dedent(f"""\
485
+ \"\"\"
486
+ Data preprocessing pipeline for {self.model_cfg['dataset']}
487
+ Run ID: {self.run_id}
488
+ \"\"\"
489
+ import numpy as np
490
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
491
+ from sklearn.model_selection import train_test_split
492
+
493
+
494
+ def load_raw_data(data_dir: str):
495
+ X = np.load(f"{{data_dir}}/features.npy")
496
+ y = np.load(f"{{data_dir}}/labels.npy")
497
+ return X, y
498
+
499
+
500
+ def preprocess(data_dir: str):
501
+ X, y = load_raw_data(data_dir)
502
+
503
+ le = LabelEncoder()
504
+ y_encoded = le.fit_transform(y)
505
+
506
+ # First split: train vs temp
507
+ # ── BUG: random_state=None → non-reproducible, overlapping splits ──
508
+ X_train, X_temp, y_train, y_temp = train_test_split(
509
+ X, y_encoded, test_size=0.2, random_state=None # ← should be fixed seed
510
+ )
511
+ # Second split: val vs test (ALSO non-deterministic)
512
+ X_val, X_test, y_val, y_test = train_test_split(
513
+ X_temp, y_temp, test_size=0.5, random_state=None # ← should be fixed seed
514
+ )
515
+ # ─────────────────────────────────────────────────────────
516
+
517
+ scaler = StandardScaler()
518
+ X_train = scaler.fit_transform(X_train)
519
+ X_val = scaler.transform(X_val)
520
+ X_test = scaler.transform(X_test)
521
+
522
+ return (X_train, y_train), (X_val, y_val), (X_test, y_test), scaler, le
523
+ """)
524
+
525
+ elif bug == "wrong_split_ratio":
526
+ return textwrap.dedent(f"""\
527
+ \"\"\"
528
+ Data preprocessing pipeline for {self.model_cfg['dataset']}
529
+ Run ID: {self.run_id}
530
+ \"\"\"
531
+ import numpy as np
532
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
533
+ from sklearn.model_selection import train_test_split
534
+
535
+
536
+ def preprocess(data_dir: str, seed: int = {self.seed}):
537
+ X = np.load(f"{{data_dir}}/features.npy")
538
+ y = np.load(f"{{data_dir}}/labels.npy")
539
+
540
+ le = LabelEncoder()
541
+ y_encoded = le.fit_transform(y)
542
+
543
+ # ── BUG: test_size=0.8 — trains on 20%, evaluates on 80% ──
544
+ X_train, X_test, y_train, y_test = train_test_split(
545
+ X, y_encoded, test_size=0.8, random_state=seed # ← should be 0.2
546
+ )
547
+ X_val, X_test, y_val, y_test = train_test_split(
548
+ X_test, y_test, test_size=0.5, random_state=seed
549
+ )
550
+ # ──────────────────────────────────────────────────────────
551
+
552
+ scaler = StandardScaler()
553
+ X_train = scaler.fit_transform(X_train)
554
+ X_val = scaler.transform(X_val)
555
+ X_test = scaler.transform(X_test)
556
+
557
+ return (X_train, y_train), (X_val, y_val), (X_test, y_test), scaler, le
558
+ """)
559
+
560
+ elif bug == "label_encoder_mismatch":
561
+ classes = ["cat", "dog", "bird"] if self.model_cfg["num_classes"] <= 10 else \
562
+ [f"class_{i}" for i in range(min(self.model_cfg["num_classes"], 5))]
563
+ classes_shuffled = classes.copy()
564
+ self.rng.shuffle(classes_shuffled)
565
+ return textwrap.dedent(f"""\
566
+ \"\"\"
567
+ Data preprocessing pipeline for {self.model_cfg['dataset']}
568
+ Run ID: {self.run_id}
569
+
570
+ WARNING: Training and evaluation pipelines are defined separately.
571
+ Ensure they use identical label encoding.
572
+ \"\"\"
573
+ import numpy as np
574
+ from sklearn.preprocessing import LabelEncoder
575
+ from sklearn.model_selection import train_test_split
576
+
577
+
578
+ # ── Training pipeline ─────────────────────────────────────────
579
+ def build_train_pipeline(data_dir: str, seed: int = {self.seed}):
580
+ X = np.load(f"{{data_dir}}/train_features.npy")
581
+ y_raw = np.load(f"{{data_dir}}/train_labels.npy", allow_pickle=True)
582
+
583
+ # LabelEncoder fitted on training class order
584
+ le_train = LabelEncoder()
585
+ le_train.fit({classes}) # alphabetical order: {sorted(classes)}
586
+ y = le_train.transform(y_raw)
587
+
588
+ X_train, X_val, y_train, y_val = train_test_split(
589
+ X, y, test_size=0.2, random_state=seed
590
+ )
591
+ return (X_train, y_train), (X_val, y_val), le_train
592
+
593
+
594
+ # ── Evaluation pipeline ───────────────────────────────────────
595
+ def build_eval_pipeline(data_dir: str):
596
+ X_test = np.load(f"{{data_dir}}/test_features.npy")
597
+ y_raw = np.load(f"{{data_dir}}/test_labels.npy", allow_pickle=True)
598
+
599
+ # ── BUG: Different LabelEncoder instance with DIFFERENT fit order ──
600
+ le_eval = LabelEncoder()
601
+ le_eval.fit({classes_shuffled}) # ← shuffled order: {classes_shuffled}
602
+ y_test = le_eval.transform(y_raw)
603
+ # ─────────────────────────────────────────────────────────
604
+
605
+ return X_test, y_test, le_eval
606
+ """)
607
+
608
+ elif bug == "silent_metric_swap":
609
+ val_acc = round(self.rng.uniform(0.84, 0.91), 4)
610
+ test_acc = round(self.rng.uniform(0.31, 0.39), 4)
611
+ return textwrap.dedent(f"""\
612
+ \"\"\"
613
+ Evaluation script for {self.model_cfg['dataset']}
614
+ Run ID: {self.run_id}
615
+ \"\"\"
616
+ import torch
617
+ import json
618
+
619
+
620
+ def evaluate(model, val_loader, test_loader, device="cuda"):
621
+ model.eval()
622
+ results = {{}}
623
+
624
+ with torch.no_grad():
625
+ # Evaluate on validation set
626
+ val_correct, val_total = 0, 0
627
+ for X, y in val_loader:
628
+ preds = model(X.to(device)).argmax(dim=1)
629
+ val_correct += (preds == y.to(device)).sum().item()
630
+ val_total += y.size(0)
631
+ val_acc = val_correct / val_total
632
+
633
+ # Evaluate on test set
634
+ test_correct, test_total = 0, 0
635
+ for X, y in test_loader:
636
+ preds = model(X.to(device)).argmax(dim=1)
637
+ test_correct += (preds == y.to(device)).sum().item()
638
+ test_total += y.size(0)
639
+ test_acc = test_correct / test_total
640
+
641
+ # ── BUG: val and test accuracy assignments are swapped ──
642
+ results["val_accuracy"] = test_acc # ← should be val_acc
643
+ results["test_accuracy"] = val_acc # ← should be test_acc
644
+ # ──────────────────────────────────────────────────────
645
+
646
+ results["val_loss"] = round(1 - val_acc + 0.12, 4)
647
+ results["test_loss"] = round(1 - test_acc + 0.09, 4)
648
+ return results
649
+ """)
650
+
651
+ elif bug == "tokenizer_version_drift":
652
+ return textwrap.dedent(f"""\
653
+ \"\"\"
654
+ Text preprocessing pipeline for {self.model_cfg['dataset']}
655
+ Run ID: {self.run_id}
656
+ \"\"\"
657
+ from transformers import AutoTokenizer
658
+
659
+
660
+ TOKENIZER_V1 = "bert-base-uncased" # vocab size: 30,522
661
+ TOKENIZER_V2 = "bert-base-uncased-v2-fixed" # vocab size: 30,522 + 847 domain tokens
662
+
663
+
664
+ # ── Training pipeline ─────────────────────────────────────────
665
+ def get_train_tokenizer():
666
+ \"\"\"Tokenizer used during training.\"\"\"
667
+ # Updated to v2 for domain-specific vocabulary
668
+ tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_V2)
669
+ return tokenizer
670
+
671
+
672
+ # ── Evaluation pipeline ───────────────────────────────────────
673
+ def get_eval_tokenizer():
674
+ \"\"\"Tokenizer used during evaluation and inference.\"\"\"
675
+ # ── BUG: Still using v1 — 847 tokens map to [UNK] during eval ──
676
+ tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_V1) # ← should be TOKENIZER_V2
677
+ return tokenizer
678
+ # ─────────────────────────────────────────────────────────
679
+
680
+
681
+ def tokenize_batch(texts, tokenizer, max_length: int = 128):
682
+ return tokenizer(
683
+ texts,
684
+ padding="max_length",
685
+ truncation=True,
686
+ max_length=max_length,
687
+ return_tensors="pt",
688
+ )
689
+ """)
690
+
691
+ else:
692
+ # Default normal preprocessing (for config-error bugs, preprocessing is clean)
693
+ return textwrap.dedent(f"""\
694
+ \"\"\"
695
+ Data preprocessing pipeline for {self.model_cfg['dataset']}
696
+ Run ID: {self.run_id}
697
+ \"\"\"
698
+ import numpy as np
699
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
700
+ from sklearn.model_selection import train_test_split
701
+
702
+
703
+ def preprocess(data_dir: str, seed: int = {self.seed}):
704
+ X = np.load(f"{{data_dir}}/features.npy")
705
+ y = np.load(f"{{data_dir}}/labels.npy")
706
+
707
+ le = LabelEncoder()
708
+ y_encoded = le.fit_transform(y)
709
+
710
+ X_train, X_temp, y_train, y_temp = train_test_split(
711
+ X, y_encoded, test_size=0.2, random_state=seed
712
+ )
713
+ X_val, X_test, y_val, y_test = train_test_split(
714
+ X_temp, y_temp, test_size=0.5, random_state=seed
715
+ )
716
+
717
+ # Correct: fit only on training data
718
+ scaler = StandardScaler()
719
+ X_train = scaler.fit_transform(X_train)
720
+ X_val = scaler.transform(X_val)
721
+ X_test = scaler.transform(X_test)
722
+
723
+ return (X_train, y_train), (X_val, y_val), (X_test, y_test), scaler, le
724
+ """)
725
+
726
+ # ── eval_results.json ────────────────────────────────────────────────────
727
+
728
+ def _gen_eval_results(self) -> str:
729
+ bug = self.bug.bug_type
730
+
731
+ if bug in ("exploding_lr", "wrong_optimizer"):
732
+ val_acc = round(self.rng.uniform(0.09, 0.13), 4)
733
+ test_acc = round(self.rng.uniform(0.09, 0.13), 4)
734
+ val_loss = 999999.9 if bug == "exploding_lr" else round(self.rng.uniform(2.1, 2.4), 4)
735
+ test_loss = val_loss
736
+ elif bug == "batch_size_overflow":
737
+ val_acc = 0.9990
738
+ test_acc = round(self.rng.uniform(0.11, 0.15), 4) # massive train/test gap
739
+ val_loss, test_loss = 0.0003, round(self.rng.uniform(1.8, 2.3), 4)
740
+ elif bug in ("data_leakage_scaler", "data_leakage_overlap", "wrong_split_ratio"):
741
+ val_acc = round(self.rng.uniform(0.982, 0.998), 4)
742
+ test_acc = round(self.rng.uniform(0.61, 0.73), 4) # test is much worse (no leakage)
743
+ val_loss = round(self.rng.uniform(0.004, 0.015), 4)
744
+ test_loss = round(self.rng.uniform(0.42, 0.68), 4)
745
+ elif bug == "label_encoder_mismatch":
746
+ val_acc = round(self.rng.uniform(0.84, 0.91), 4)
747
+ test_acc = round(self.rng.uniform(0.30, 0.38), 4) # near random for 3-class
748
+ val_loss = round(1 - val_acc + self.rng.uniform(0.05, 0.15), 4)
749
+ test_loss = round(1 - test_acc + self.rng.uniform(0.05, 0.15), 4)
750
+ elif bug == "silent_metric_swap":
751
+ real_val = round(self.rng.uniform(0.84, 0.91), 4)
752
+ real_test = round(self.rng.uniform(0.31, 0.39), 4)
753
+ # Swapped in output
754
+ val_acc = real_test
755
+ test_acc = real_val
756
+ val_loss = round(1 - real_test + 0.09, 4)
757
+ test_loss = round(1 - real_val + 0.12, 4)
758
+ elif bug == "tokenizer_version_drift":
759
+ val_acc = round(self.rng.uniform(0.83, 0.88), 4)
760
+ test_acc = round(self.rng.uniform(0.28, 0.36), 4)
761
+ val_loss = round(1 - val_acc + self.rng.uniform(0.05, 0.12), 4)
762
+ test_loss = round(1 - test_acc + self.rng.uniform(0.05, 0.12), 4)
763
+ else:
764
+ val_acc = round(self.rng.uniform(0.78, 0.91), 4)
765
+ test_acc = round(val_acc - self.rng.uniform(0.02, 0.05), 4)
766
+ val_loss = round(1 - val_acc + 0.1, 4)
767
+ test_loss = round(1 - test_acc + 0.1, 4)
768
+
769
+ result = {
770
+ "run_id": self.run_id,
771
+ "final_epoch": self.epochs if bug not in ("exploding_lr",) else self.rng.randint(2,5),
772
+ "metrics": {
773
+ "val_loss": val_loss,
774
+ "val_accuracy": val_acc,
775
+ "test_loss": test_loss,
776
+ "test_accuracy": test_acc,
777
+ },
778
+ "best_checkpoint": f"./checkpoints/{self.run_id}/best_model.pt",
779
+ "evaluation_timestamp": f"2024-03-{self.rng.randint(1,28):02d}T{self.rng.randint(10,22):02d}:{self.rng.randint(0,59):02d}:00Z",
780
+ "hardware": {"gpu": "A100-40GB", "cuda": "12.1"},
781
+ }
782
+ return json.dumps(result, indent=2)
783
+
784
+ # ── model_card.json ──────────────────────────────────────────────────────
785
+
786
+ def _gen_model_card(self) -> str:
787
+ bug = self.bug.bug_type
788
+ tokenizer_ver = "v1" if bug == "tokenizer_version_drift" else "v2"
789
+
790
+ card = {
791
+ "model_id": f"{self.run_id}",
792
+ "architecture": self.model_cfg["name"],
793
+ "task": self.model_cfg["type"],
794
+ "num_parameters": self.model_cfg["params"],
795
+ "dataset": self.model_cfg["dataset"],
796
+ "num_classes": self.model_cfg["num_classes"],
797
+ "framework": "PyTorch 2.2.0",
798
+ "training_config": {
799
+ "optimizer": self.optimizer_name,
800
+ "scheduler": self.scheduler_name,
801
+ "epochs": self.epochs,
802
+ },
803
+ "preprocessing": {
804
+ "label_encoder": "sklearn.LabelEncoder",
805
+ "tokenizer": tokenizer_ver if "bert" in self.model_cfg["name"].lower() else "N/A",
806
+ "normalizer": "StandardScaler (fit on training split)",
807
+ },
808
+ "authors": ["ml-platform-team"],
809
+ "license": "Apache-2.0",
810
+ }
811
+ return json.dumps(card, indent=2)
812
+
813
+
814
+ # ─── Sanity Check Engine ──────────────────────────────────────────────────────
815
+
816
+ def run_sanity_check(check_type: str, bug_type: str, artifacts: Dict[str, str],
817
+ rng: random.Random) -> Dict:
818
+ """
819
+ Runs a named diagnostic check and returns computed results.
820
+ Results are grounded in the generated artifacts — not random.
821
+ """
822
+ bug = BUG_CATALOGUE[bug_type]
823
+
824
+ if check_type == "label_consistency":
825
+ if bug_type == "label_encoder_mismatch":
826
+ return {
827
+ "check": "label_consistency",
828
+ "result": "FAIL",
829
+ "details": "Training LabelEncoder class order: ['bird', 'cat', 'dog'] (index 0=bird, 1=cat, 2=dog). "
830
+ "Evaluation LabelEncoder class order: ['cat', 'dog', 'bird'] (index 0=cat, 1=dog, 2=bird). "
831
+ "Mismatch detected — 2 of 3 class indices differ between pipelines.",
832
+ "affected_classes": 2,
833
+ "recommendation": "Use a single LabelEncoder instance across both pipelines.",
834
+ }
835
+ return {"check": "label_consistency", "result": "PASS",
836
+ "details": "Train and eval label mappings are identical. No mismatch detected."}
837
+
838
+ elif check_type == "data_leakage":
839
+ if bug_type in ("data_leakage_overlap", "data_leakage_scaler"):
840
+ overlap = rng.randint(180, 450) if bug_type == "data_leakage_overlap" else 0
841
+ scaler_leak = bug_type == "data_leakage_scaler"
842
+ return {
843
+ "check": "data_leakage",
844
+ "result": "FAIL",
845
+ "sample_overlap": overlap,
846
+ "scaler_fitted_on_full_dataset": scaler_leak,
847
+ "details": (
848
+ f"Found {overlap} samples present in both train and val splits. "
849
+ if overlap > 0 else ""
850
+ ) + (
851
+ "StandardScaler.fit_transform() called on full dataset before split — "
852
+ "validation statistics contaminated by training distribution."
853
+ if scaler_leak else ""
854
+ ),
855
+ }
856
+ return {"check": "data_leakage", "result": "PASS",
857
+ "sample_overlap": 0, "scaler_fitted_on_full_dataset": False,
858
+ "details": "No data leakage detected between train and val splits."}
859
+
860
+ elif check_type == "gradient_norms":
861
+ if bug_type == "exploding_lr":
862
+ return {
863
+ "check": "gradient_norms",
864
+ "result": "ANOMALY",
865
+ "epoch_1_norm": round(rng.uniform(840.0, 2100.0), 2),
866
+ "expected_range": "0.1 – 10.0",
867
+ "details": "Gradient norms exceeded safe threshold by 100–200×. "
868
+ "Indicates learning rate is too large — gradients are not being controlled.",
869
+ }
870
+ return {"check": "gradient_norms", "result": "NORMAL",
871
+ "mean_norm": round(rng.uniform(0.3, 2.1), 3),
872
+ "max_norm": round(rng.uniform(2.1, 4.5), 3),
873
+ "details": "Gradient norms are within expected range throughout training."}
874
+
875
+ elif check_type == "metric_gap_analysis":
876
+ if bug_type in ("label_encoder_mismatch", "silent_metric_swap", "tokenizer_version_drift"):
877
+ val_acc = round(rng.uniform(0.84, 0.91), 4)
878
+ test_acc = round(rng.uniform(0.28, 0.38), 4)
879
+ return {
880
+ "check": "metric_gap_analysis",
881
+ "result": "ANOMALY",
882
+ "val_accuracy": val_acc,
883
+ "test_accuracy": test_acc,
884
+ "gap": round(val_acc - test_acc, 4),
885
+ "expected_max_gap": 0.08,
886
+ "details": f"Val/test accuracy gap is {val_acc - test_acc:.3f} — far exceeds expected max of 0.08. "
887
+ f"This magnitude of gap (>{val_acc - test_acc:.0%}) strongly suggests an evaluation pipeline bug "
888
+ f"rather than overfitting — the model generalises well to the val set but fails on test data.",
889
+ }
890
+ return {"check": "metric_gap_analysis", "result": "NORMAL",
891
+ "details": "Val/test metric gap is within normal bounds."}
892
+
893
+ elif check_type == "encoder_version_match":
894
+ if bug_type == "tokenizer_version_drift":
895
+ return {
896
+ "check": "encoder_version_match",
897
+ "result": "MISMATCH",
898
+ "training_tokenizer": "bert-base-uncased-v2-fixed",
899
+ "eval_tokenizer": "bert-base-uncased",
900
+ "vocab_diff": 847,
901
+ "details": "Training uses tokenizer v2 (30,522 + 847 domain tokens). "
902
+ "Evaluation uses tokenizer v1 (30,522 tokens). "
903
+ "847 domain-specific tokens will map to [UNK] during evaluation — "
904
+ "causing silent degradation on domain-specific test inputs.",
905
+ }
906
+ return {"check": "encoder_version_match", "result": "PASS",
907
+ "details": "Training and evaluation use identical tokenizer versions."}
908
+
909
+ elif check_type == "class_balance":
910
+ n_classes = 10
911
+ counts = {str(i): rng.randint(780, 1020) for i in range(n_classes)}
912
+ imbalance_ratio = max(counts.values()) / max(1, min(counts.values()))
913
+ return {
914
+ "check": "class_balance",
915
+ "result": "PASS" if imbalance_ratio < 1.5 else "WARN",
916
+ "class_counts": counts,
917
+ "imbalance_ratio": round(imbalance_ratio, 3),
918
+ "details": f"Max/min class ratio: {imbalance_ratio:.2f}. "
919
+ f"{'Within acceptable range.' if imbalance_ratio < 1.5 else 'Moderate imbalance — consider weighted loss.'}",
920
+ }
921
+
922
+ elif check_type == "loss_trajectory":
923
+ if bug_type == "exploding_lr":
924
+ return {
925
+ "check": "loss_trajectory",
926
+ "result": "ANOMALY",
927
+ "pattern": "exponential_divergence",
928
+ "loss_values": [2.31, 18.42, 847.2, "nan"],
929
+ "details": "Loss follows exponential growth pattern rather than convergence. "
930
+ "This is a strong indicator of learning rate being orders of magnitude too large.",
931
+ }
932
+ elif bug_type == "wrong_optimizer":
933
+ return {
934
+ "check": "loss_trajectory",
935
+ "result": "ANOMALY",
936
+ "pattern": "oscillating_no_convergence",
937
+ "details": "Loss oscillates without converging over all epochs. "
938
+ "Characteristic of excessive momentum causing the optimizer to overshoot minima repeatedly.",
939
+ }
940
+ return {"check": "loss_trajectory", "result": "NORMAL",
941
+ "pattern": "smooth_convergence",
942
+ "details": "Loss follows expected convergence curve."}
943
+
944
+ elif check_type == "feature_statistics":
945
+ if bug_type in ("data_leakage_scaler",):
946
+ return {
947
+ "check": "feature_statistics",
948
+ "result": "WARN",
949
+ "train_mean": 0.0, "train_std": 1.0,
950
+ "val_mean": 0.0, "val_std": 1.0,
951
+ "details": "Train and val feature statistics are identical after normalization — "
952
+ "this is expected if scaler was fit on the full dataset (including val). "
953
+ "If scaler was fit only on train, a slight distributional shift is normal. "
954
+ "Zero shift suggests the scaler saw val data during fitting.",
955
+ }
956
+ return {"check": "feature_statistics", "result": "PASS",
957
+ "details": "Train and val feature distributions are within expected divergence bounds."}
958
+
959
+ return {"check": check_type, "result": "UNKNOWN",
960
+ "details": f"Unknown sanity check type: {check_type}"}
client.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """MLOps Pipeline Debugger — Python client"""
2
+ from __future__ import annotations
3
+ from typing import Any, Dict, Optional
4
+ import httpx
5
+
6
+ from models import MLOpsAction, MLOpsObservation, MLOpsState
7
+
8
+
9
+ class StepResult:
10
+ def __init__(self, observation, reward, done, info):
11
+ self.observation = observation
12
+ self.reward = reward
13
+ self.done = done
14
+ self.info = info
15
+ def __repr__(self):
16
+ return f"StepResult(reward={self.reward:.4f}, done={self.done})"
17
+
18
+
19
+ class MLOpsDebugEnv:
20
+ def __init__(self, base_url: str = "http://localhost:7860"):
21
+ self.base_url = base_url.rstrip("/")
22
+ self._client: Optional[httpx.AsyncClient] = None
23
+
24
+ async def __aenter__(self):
25
+ self._client = httpx.AsyncClient(base_url=self.base_url, timeout=30.0)
26
+ return self
27
+
28
+ async def __aexit__(self, *args):
29
+ if self._client:
30
+ await self._client.aclose()
31
+
32
+ async def reset(self, task_id: str = "easy", seed: Optional[int] = None) -> MLOpsObservation:
33
+ r = await self._client.post("/reset", json={"task_id": task_id, "seed": seed})
34
+ r.raise_for_status()
35
+ return MLOpsObservation(**r.json())
36
+
37
+ async def step(self, action: MLOpsAction) -> StepResult:
38
+ r = await self._client.post("/step", json=action.model_dump(exclude_none=True))
39
+ r.raise_for_status()
40
+ d = r.json()
41
+ return StepResult(MLOpsObservation(**d["observation"]), d["reward"], d["done"], d["info"])
42
+
43
+ async def state(self) -> MLOpsState:
44
+ r = await self._client.get("/state")
45
+ r.raise_for_status()
46
+ return MLOpsState(**r.json())
47
+
48
+ def sync(self) -> "SyncMLOpsDebugEnv":
49
+ return SyncMLOpsDebugEnv(self.base_url)
50
+
51
+
52
+ class SyncMLOpsDebugEnv:
53
+ def __init__(self, base_url: str = "http://localhost:7860"):
54
+ self.base_url = base_url.rstrip("/")
55
+ self._client: Optional[httpx.Client] = None
56
+
57
+ def __enter__(self):
58
+ self._client = httpx.Client(base_url=self.base_url, timeout=30.0)
59
+ return self
60
+
61
+ def __exit__(self, *args):
62
+ if self._client:
63
+ self._client.close()
64
+
65
+ def reset(self, task_id: str = "easy", seed: Optional[int] = None) -> MLOpsObservation:
66
+ r = self._client.post("/reset", json={"task_id": task_id, "seed": seed})
67
+ r.raise_for_status()
68
+ return MLOpsObservation(**r.json())
69
+
70
+ def step(self, action: MLOpsAction) -> StepResult:
71
+ r = self._client.post("/step", json=action.model_dump(exclude_none=True))
72
+ r.raise_for_status()
73
+ d = r.json()
74
+ return StepResult(MLOpsObservation(**d["observation"]), d["reward"], d["done"], d["info"])
75
+
76
+ def state(self) -> MLOpsState:
77
+ r = self._client.get("/state")
78
+ r.raise_for_status()
79
+ return MLOpsState(**r.json())
inference.py ADDED
@@ -0,0 +1,600 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ inference.py — Optimized LLM Agent for MLOps Pipeline Debugger
3
+
4
+ Required env vars (in .env file):
5
+ GEMINI_API_KEY your Gemini API key
6
+ MODEL_NAME gemini-2.5-flash (default)
7
+ ENV_BASE_URL http://localhost:7860 (default)
8
+
9
+ STDOUT FORMAT (mandatory):
10
+ [START] task=<task_name> env=<benchmark> model=<model_name>
11
+ [STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
12
+ [END] success=<true|false> steps=<n> rewards=<r1,r2,...,rn>
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ from dotenv import load_dotenv
18
+
19
+ load_dotenv()
20
+
21
+ import argparse
22
+ import json
23
+ import os
24
+ import re
25
+ import sys
26
+ import time
27
+ from typing import Any, Dict, List, Optional
28
+
29
+ import httpx
30
+ from openai import OpenAI
31
+
32
# Environment-driven configuration for the baseline agent.
API_BASE_URL = os.getenv(
    "API_BASE_URL", "https://generativelanguage.googleapis.com/v1beta/openai/"
)
# NOTE(review): default here is gemini-2.5-flash but .env.example ships
# MODEL_NAME=gemini-2.0-flash — confirm which default is intended.
MODEL_NAME = os.getenv("MODEL_NAME", "gemini-2.5-flash")
# NOTE(review): name says HF_TOKEN but the Gemini key takes precedence.
HF_TOKEN = os.getenv("GEMINI_API_KEY", os.getenv("HF_TOKEN", ""))
ENV_BASE_URL = os.getenv("ENV_BASE_URL", "http://localhost:7860")
BENCHMARK = "mlops-debug-env"
TASKS = ["easy", "medium", "hard"]
SUCCESS_THRESHOLD = 0.5  # episode counts as a success when final score >= this

# OpenAI-compatible client pointed at the Gemini endpoint.
client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
43
+
44
+ # ── Complete bug reference for diagnosis guidance ─────────────────────────────
45
+
46
# Ground-truth bug catalogue, keyed by task difficulty then bug id. Each entry
# mirrors the environment grader's expected category/file/field and the
# gold-fix keywords the keyword-matching grader looks for.
BUG_REFERENCE = {
    "easy": {
        "exploding_lr": {
            "category": "config_error",
            "file": "config.yaml",
            "field": "optimizer.learning_rate",
            "gold_fix": "Reduce learning_rate from 50.0 to 1e-4 (or use a scheduler with warmup)",
            "symptoms": "loss explodes: 2.31 → 8.94 → 847.2 → nan by epoch 3",
        },
        "wrong_optimizer": {
            "category": "config_error",
            "file": "config.yaml",
            "field": "optimizer.momentum",
            "gold_fix": "Reduce momentum from 0.99 to 0.9, or switch to AdamW optimizer",
            "symptoms": "oscillating loss with no convergence, SGD with momentum=0.99",
        },
        "batch_size_overflow": {
            "category": "config_error",
            "file": "config.yaml",
            "field": "training.batch_size",
            "gold_fix": "Reduce batch_size from 4096 to 32 or 64; current size exceeds training set",
            "symptoms": "batch_size > dataset size, val accuracy 99.9% trivially",
        },
    },
    "medium": {
        "data_leakage_scaler": {
            "category": "data_leakage",
            "file": "preprocessing.py",
            "field": "StandardScaler.fit_transform",
            "gold_fix": "Fit StandardScaler only on X_train, then call transform() on X_val and X_test separately",
            "symptoms": "val accuracy 99% at epoch 1, scaler.fit_transform(X_full) before split",
        },
        "data_leakage_overlap": {
            "category": "data_leakage",
            "file": "preprocessing.py",
            "field": "train_test_split.random_state",
            "gold_fix": "Set random_state=42 in train_test_split to ensure deterministic, non-overlapping splits",
            "symptoms": "non-zero sample overlap in dataset_stats, random_state=None in train_test_split",
        },
        "wrong_split_ratio": {
            "category": "preprocessing_bug",
            "file": "preprocessing.py",
            "field": "train_test_split.test_size",
            "gold_fix": "Change test_size from 0.8 to 0.2 — current config trains on 20% and evaluates on 80%",
            "symptoms": "test_size=0.8 in preprocessing.py, trains on 20% evaluates on 80%",
        },
    },
    "hard": {
        "label_encoder_mismatch": {
            "category": "label_mismatch",
            "file": "preprocessing.py",
            "field": "LabelEncoder.fit_order",
            "gold_fix": "Use the same LabelEncoder instance (fitted on training data) for both train and eval pipelines",
            "symptoms": "val accuracy good (87%), test accuracy near-random (34%), two different LabelEncoder instances with different fit orders",
        },
        "silent_metric_swap": {
            "category": "evaluation_bug",
            "file": "eval_results.json",
            "field": "metrics.val_accuracy",
            "gold_fix": "Swap val_accuracy and test_accuracy assignments in the evaluation loop — metrics are mislabeled",
            "symptoms": "val_accuracy suspiciously low, test_accuracy suspiciously high (reversed)",
        },
        "tokenizer_version_drift": {
            "category": "evaluation_bug",
            "file": "preprocessing.py",
            "field": "tokenizer.version",
            "gold_fix": "Ensure training and evaluation both use tokenizer v2 — v1 has a different vocabulary mapping for 847 tokens",
            "symptoms": "training uses TOKENIZER_V2, eval uses TOKENIZER_V1, 847 tokens map to [UNK]",
        },
    },
}
117
+
118
# System prompt sent once per episode; fixes the investigation order and the
# JSON action schema the agent must emit each turn.
SYSTEM_PROMPT = """You are a senior ML engineer investigating a broken training run.

INVESTIGATION STRATEGY (follow this exact order):
1. read_logs — identify the symptom
2. read_eval_results — check val vs test metric gap
3. inspect_preprocessing — look for pipeline bugs
4. read_config — check hyperparameters
5. check_dataset_stats — look for split issues
6. run_sanity_check — confirm hypothesis
7. submit_diagnosis — ONLY after steps 1-5 minimum

FAILURE CATEGORIES:
- config_error : Wrong hyperparameter
- data_leakage : Train/val contamination
- evaluation_bug : Eval pipeline uses wrong artifacts or swapped metrics
- preprocessing_bug : Data transformation applied incorrectly
- label_mismatch : Label encoding inconsistency
- architecture_bug : Model architecture misconfiguration

ROOT CAUSE FIELD FORMAT: Use dot notation. Examples:
- "optimizer.learning_rate" / "training.batch_size" / "optimizer.momentum"
- "StandardScaler.fit_transform" / "train_test_split.random_state" / "train_test_split.test_size"
- "LabelEncoder.fit_order" / "tokenizer.version" / "metrics.val_accuracy"

RESPOND WITH ONE JSON ACTION OBJECT PER TURN. Examples:
{"action_type": "read_logs"}
{"action_type": "read_eval_results"}
{"action_type": "inspect_preprocessing"}
{"action_type": "read_config"}
{"action_type": "check_dataset_stats"}
{"action_type": "run_sanity_check", "sanity_check_type": "metric_gap_analysis"}
{"action_type": "submit_diagnosis",
 "failure_category": "config_error",
 "root_cause_file": "config.yaml",
 "root_cause_field": "training.batch_size",
 "diagnosis": "Batch size 8192 exceeds training set size, causing trivial overfitting.",
 "proposed_fix": "Reduce batch_size from 4096 to 32 or 64; current size exceeds training set"}

ONLY output the JSON object. No explanation. No markdown."""
157
+
158
# Final-turn prompt: {bug_ref} is filled with the JSON-dumped BUG_REFERENCE
# entry for the current difficulty before sending.
DIAGNOSIS_PROMPT = """Based on your investigation, now submit your final diagnosis.

Here is the complete bug reference for this task difficulty:

{bug_ref}

Analyze the artifacts you've read and identify which specific bug matches the symptoms.
Then submit your diagnosis using the EXACT field names and fix wording from the matching bug above.

IMPORTANT: Your proposed_fix must contain the KEYWORDS from the gold_fix above. The grader uses keyword matching.

Respond with ONLY the JSON submit_diagnosis action. No explanation. No markdown."""
170
+
171
+
172
+ # ── Logging helpers ──────────────────────────────────────────────────────────
173
+
174
+
175
def log_start(task: str, env: str, model: str) -> None:
    """Emit the mandatory [START] line of the stdout protocol."""
    line = "[START] task={} env={} model={}".format(task, env, model)
    print(line, flush=True)
177
+
178
+
179
def log_step(
    step: int, action: str, reward: float, done: bool, error: Optional[str]
) -> None:
    """Emit one [STEP] line; a missing error is rendered as the literal 'null'."""
    fields = [
        f"[STEP] step={step}",
        f"action={action}",
        f"reward={reward:.2f}",
        f"done={str(done).lower()}",
        f"error={error or 'null'}",
    ]
    print(" ".join(fields), flush=True)
188
+
189
+
190
def log_end(
    success: bool, steps: int, score: float = 0.0, rewards: List[float] = None
) -> None:
    """Emit the final [END] summary line with the comma-joined reward trace."""
    reward_list = rewards if rewards is not None else []
    joined = ",".join(f"{r:.2f}" for r in reward_list)
    flag = str(success).lower()
    print(
        f"[END] success={flag} steps={steps} score={score:.4f} rewards={joined}",
        flush=True,
    )
200
+
201
+
202
+ # ── Environment helpers ───────────────────────────────────────────────────────
203
+
204
+
205
def env_reset(task_id: str, seed: int = 42) -> Dict[str, Any]:
    """POST /reset on the environment server; returns the first observation dict."""
    resp = httpx.post(
        f"{ENV_BASE_URL}/reset", json={"task_id": task_id, "seed": seed}, timeout=30
    )
    resp.raise_for_status()
    return resp.json()
211
+
212
+
213
def env_step(action: Dict[str, Any]) -> Dict[str, Any]:
    """POST /step with one action dict; returns {observation, reward, done, info}."""
    resp = httpx.post(f"{ENV_BASE_URL}/step", json=action, timeout=30)
    resp.raise_for_status()
    return resp.json()
217
+
218
+
219
def build_user_msg(obs: Dict[str, Any]) -> str:
    """Render an observation dict into the user-turn prompt shown to the LLM.

    Includes step progress, which artifacts remain unread, the (truncated)
    last action result, any server messages, and an end-of-episode marker.
    """
    already_read = obs.get("artifacts_read", [])
    not_yet_read = [
        art["name"]
        for art in obs.get("available_artifacts", [])
        if art["name"] not in already_read
    ]
    last_result = obs.get("last_action_result", {})
    step_no = obs.get("step_count", 0)
    step_cap = obs.get("max_steps", 30)
    summary = obs.get("run_summary", {})

    out = [
        f"=== STEP {step_no}/{step_cap} ===",
        f"Run: {obs.get('run_id', '')} | Model: {summary.get('model', '')} | Status: {summary.get('status', '')}",
        f"Artifacts read: {already_read}",
        f"Artifacts NOT yet read: {not_yet_read}",
        "",
        "LAST ACTION RESULT:",
        # Truncate to keep the prompt within a manageable size.
        json.dumps(last_result, indent=2, default=str)[:3000],
    ]
    sys_msgs = obs.get("messages", [])
    if sys_msgs:
        out.append("")
        out.append("SYSTEM MESSAGES:")
        out.extend(sys_msgs)
    if obs.get("done"):
        out.append("\nEpisode done.")
    return "\n".join(out)
246
+
247
+
248
def parse_action(text: str) -> Optional[Dict[str, Any]]:
    """Parse an LLM reply into an action dict.

    Accepts raw JSON, JSON inside a ``` / ```json fence (with or without a
    closing fence), or a JSON object embedded in surrounding prose. Returns
    None when no JSON object can be extracted.
    """
    raw = text.strip()
    candidate = raw
    if candidate.startswith("```"):
        # Drop the opening fence line; only drop the last line if it really is
        # a closing fence (the old code dropped it unconditionally, corrupting
        # replies that lacked one).
        lines = candidate.split("\n")
        if len(lines) > 1 and lines[-1].strip().startswith("```"):
            lines = lines[:-1]
        candidate = "\n".join(lines[1:])
    try:
        return json.loads(candidate)
    except Exception:
        pass
    # Fallback: grab the outermost {...} span from either the fence-stripped
    # or the original text (stripping may have cut into the object).
    for source in (candidate, raw):
        m = re.search(r"\{[\s\S]+\}", source)
        if m:
            try:
                return json.loads(m.group())
            except Exception:
                continue
    return None
262
+
263
+
264
+ # ── Rate-limited LLM calls ───────────────────────────────────────────────────
265
+
266
_last_call_time = 0  # wall-clock timestamp of the most recent LLM call
_MIN_CALL_INTERVAL = 2.0  # minimum seconds enforced between successive LLM calls
# NOTE(review): mid-file imports — consider moving to the top-of-file import block.
from openenv_state import OPENENV_STATE, OpenEnvState
import json as _json  # NOTE(review): unused alias; `json` is already imported above

# For hard fallback guard
_HARD_FALLBACK_USED = False
273
+
274
+
275
def _update_openenv_state(
    run_id: str,
    task_id: str,
    seed: int,
    step_count: int,
    max_steps: int,
    end_score: float,
    rewards: List[float],
    artifacts_read: List[str],
) -> None:
    """Mirror the latest episode results into the shared OPENENV_STATE singleton.

    Also records the per-task score in OPENENV_STATE.scores so repeated runs
    keep one score per task_id. Timestamp is naive UTC ISO-8601 (same as the
    previous utcnow() behavior).
    """
    from datetime import datetime  # plain local import, replaces __import__ hack

    OPENENV_STATE.run_id = run_id
    OPENENV_STATE.task_id = task_id
    OPENENV_STATE.seed = seed
    OPENENV_STATE.step_count = step_count
    OPENENV_STATE.max_steps = max_steps
    OPENENV_STATE.end_score = end_score
    OPENENV_STATE.rewards = rewards
    OPENENV_STATE.artifacts_read = artifacts_read
    OPENENV_STATE.timestamp = datetime.utcnow().isoformat()
    OPENENV_STATE.scores[task_id] = end_score
296
+
297
+
298
# NOTE(review): duplicate of the identical module-level assignment above — redundant.
_HARD_FALLBACK_USED = False
299
+
300
+
301
def call_llm(messages: List[Dict], model_name: Optional[str] = None) -> str:
    """Call the chat-completions endpoint with client-side throttling + retries.

    Enforces at least _MIN_CALL_INTERVAL seconds between calls, and retries up
    to 10 times with exponential backoff (shorter waits for rate-limit errors,
    longer for other failures).

    Returns the stripped reply text ('' if the provider returned no content);
    raises RuntimeError after 10 failed attempts.
    """
    global _last_call_time
    model_to_use = model_name or MODEL_NAME
    for attempt in range(10):
        try:
            # Client-side throttle: respect the minimum inter-call interval.
            elapsed = time.time() - _last_call_time
            if elapsed < _MIN_CALL_INTERVAL:
                time.sleep(_MIN_CALL_INTERVAL - elapsed)

            resp = client.chat.completions.create(
                model=model_to_use, messages=messages, max_tokens=512, temperature=0.1
            )
            _last_call_time = time.time()
            content = resp.choices[0].message.content
            # Some providers return None content (e.g. safety blocks) —
            # treat that as an empty reply instead of crashing on .strip().
            return (content or "").strip()
        except Exception as e:
            err_msg = str(e)
            if "rate" in err_msg.lower() or "Request rate" in err_msg:
                wait = min(15 * (2**attempt), 120)
                print(
                    f" [RATE LIMIT] Waiting {wait}s (attempt {attempt + 1}/10)...",
                    flush=True,
                )
            else:
                wait = min(30 * (2**attempt), 300)
                print(
                    f" [RETRY] LLM error (attempt {attempt + 1}/10): {e}. Waiting {wait}s...",
                    flush=True,
                )
            time.sleep(wait)
    raise RuntimeError("LLM call failed after 10 retries")
331
+
332
+
333
+ # ── Fallback actions ──────────────────────────────────────────────────────────
334
+
335
# Scripted investigation order used when the LLM reply cannot be parsed.
FALLBACK_ACTIONS = [
    {"action_type": "read_logs"},
    {"action_type": "read_eval_results"},
    {"action_type": "inspect_preprocessing"},
    {"action_type": "read_config"},
    {"action_type": "check_dataset_stats"},
    {"action_type": "run_sanity_check", "sanity_check_type": "metric_gap_analysis"},
    {"action_type": "run_sanity_check", "sanity_check_type": "data_leakage"},
    {"action_type": "run_sanity_check", "sanity_check_type": "label_consistency"},
]


def get_fallback_action(step_num: int) -> Dict[str, Any]:
    """Pick the scripted fallback action for a 1-based step number.

    Steps beyond the end of the list keep returning the final sanity check.
    """
    last = len(FALLBACK_ACTIONS) - 1
    idx = step_num - 1
    if idx > last:
        idx = last
    return FALLBACK_ACTIONS[idx]
350
+
351
+
352
+ # ── Main agent loop ──────────────────────────────────────────────────────────
353
+
354
+
355
def run_task(task_id: str, seed: int = 42, alt_model: Optional[str] = None) -> float:
    """Run one full investigate-and-diagnose episode against the env server.

    Drives the LLM through the investigation loop, with fallback actions for
    unparseable replies, loop-breaking, a minimum-evidence gate before hard
    submissions, and a one-shot retry with an alternate model on low hard
    scores. Returns the final episode score in [0, 1].
    """
    global _HARD_FALLBACK_USED
    log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)

    obs = env_reset(task_id, seed)
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": f"TASK DESCRIPTION:\n{obs['task_description']}\n\n{build_user_msg(obs)}",
        },
    ]

    # Per-difficulty step bounds for the agent-side loop.
    MIN_STEPS = {"easy": 5, "medium": 7, "hard": 10}
    MAX_STEPS = {"easy": 20, "medium": 30, "hard": 40}
    min_steps = MIN_STEPS.get(task_id, 7)
    max_steps = MAX_STEPS.get(task_id, 30)

    # Artifacts the agent is expected to read before diagnosing.
    CORE_ARTIFACTS = {
        "train.log",
        "eval_results.json",
        "preprocessing.py",
        "config.yaml",
        "dataset_stats.json",
    }

    step_num = 0
    done = False
    final_score = 0.0
    rewards: List[float] = []
    action_history: List[str] = []
    sanity_check_history: List[str] = []
    in_diagnosis_phase = False

    def get_unread_artifacts() -> List[str]:
        # Core artifacts the current observation does not list as read.
        arts_read = set(obs.get("artifacts_read", []))
        return [a for a in CORE_ARTIFACTS if a not in arts_read]

    def get_next_unread_artifact() -> Optional[Dict[str, Any]]:
        # Map the first unread core artifact to the action that reads it.
        unread = get_unread_artifacts()
        if not unread:
            return None
        artifact_to_action = {
            "train.log": {"action_type": "read_logs"},
            "eval_results.json": {"action_type": "read_eval_results"},
            "preprocessing.py": {"action_type": "inspect_preprocessing"},
            "config.yaml": {"action_type": "read_config"},
            "dataset_stats.json": {"action_type": "check_dataset_stats"},
        }
        return artifact_to_action.get(unread[0])

    def force_new_sanity_check() -> Dict[str, Any]:
        # Pick a sanity check that hasn't been run yet, else repeat the first.
        all_checks = [
            "metric_gap_analysis",
            "data_leakage",
            "label_consistency",
            "encoder_version_match",
            "loss_trajectory",
            "class_balance",
            "gradient_norms",
            "feature_statistics",
        ]
        for sc in all_checks:
            if sc not in sanity_check_history:
                return {"action_type": "run_sanity_check", "sanity_check_type": sc}
        return {
            "action_type": "run_sanity_check",
            "sanity_check_type": "metric_gap_analysis",
        }

    def is_repetitive(action_type: str) -> bool:
        # True when the same action type was taken on the last two steps.
        if len(action_history) < 2:
            return False
        return action_history[-1] == action_type and action_history[-2] == action_type

    while not done:
        step_num += 1
        unread = get_unread_artifacts()
        all_read = len(unread) == 0

        # Force submission near max steps
        if step_num >= max_steps - 1:
            in_diagnosis_phase = True

        if in_diagnosis_phase:
            # Build diagnosis prompt with bug reference
            diag_prompt = DIAGNOSIS_PROMPT.format(
                bug_ref=json.dumps(BUG_REFERENCE.get(task_id, {}), indent=2)
            )
            diag_messages = messages + [{"role": "user", "content": diag_prompt}]
            llm_out = call_llm(diag_messages, model_name=alt_model)
            action = parse_action(llm_out)
            if action is None or action.get("action_type") != "submit_diagnosis":
                # Force a diagnosis with best guess
                action = {"action_type": "submit_diagnosis"}
        else:
            llm_out = call_llm(messages, model_name=alt_model)
            action = parse_action(llm_out)

            if action is None:
                # Use fallback
                if all_read:
                    action = force_new_sanity_check()
                else:
                    action = get_next_unread_artifact() or get_fallback_action(step_num)

        action_type = action.get("action_type", "unknown")

        # Detect and break loops
        if is_repetitive(action_type) and action_type != "submit_diagnosis":
            if all_read:
                action = force_new_sanity_check()
            else:
                next_artifact = get_next_unread_artifact()
                if next_artifact:
                    action = next_artifact
                else:
                    action = force_new_sanity_check()

        # Track sanity checks
        if action_type == "run_sanity_check":
            sc = action.get("sanity_check_type", "")
            sanity_check_history.append(sc)

        # Enforce hard rubric before allowing hard submit
        if action.get("action_type") == "submit_diagnosis" and task_id == "hard":
            artifacts_read = obs.get("artifacts_read", [])
            if (
                len(artifacts_read) < 3
                or len(sanity_check_history) < 3
                or step_num < min_steps
            ):
                action = get_fallback_action(step_num)
                # NOTE(review): [STEP] is logged before env_step in this branch,
                # so the logged reward is always 0 regardless of the real reward.
                log_step(
                    step=step_num,
                    action=action["action_type"],
                    reward=0,
                    done=False,
                    error=None,
                )
                result = env_step(action)
                new_obs = result["observation"]
                reward = result["reward"]
                done = result["done"]
                info = result.get("info", {})
                rewards.append(reward)
                # Continue with the next loop iteration
                if done:
                    final_score = info.get("score", reward)
                    # NOTE(review): this retry is guarded by _HARD_FALLBACK_USED,
                    # but the equivalent retry below is guarded by alt_model is
                    # None — two inconsistent guards for the same behavior.
                    if (
                        task_id == "hard"
                        and final_score < 0.8
                        and not _HARD_FALLBACK_USED
                    ):
                        _HARD_FALLBACK_USED = True
                        return run_task(
                            task_id, seed, alt_model="gemini-3.1-pro-preview"
                        )
                    break
                obs = new_obs
                # NOTE(review): llm_out still holds the rejected diagnosis text,
                # not the fallback action that was actually executed.
                llm_out = llm_out  # no-op, placeholder to clarify flow
                messages.append({"role": "assistant", "content": llm_out})
                messages.append({"role": "user", "content": build_user_msg(new_obs)})
                continue

        # Execute action
        result = env_step(action)
        new_obs = result["observation"]
        reward = result["reward"]
        done = result["done"]
        info = result.get("info", {})

        rewards.append(reward)
        action_str = action.get("action_type", "unknown")
        error_msg = (
            new_obs.get("last_action_result", {}).get("error")
            if isinstance(new_obs, dict)
            else None
        )

        log_step(
            step=step_num, action=action_str, reward=reward, done=done, error=error_msg
        )

        if done:
            final_score = info.get("score", reward)
            if task_id == "hard" and final_score < 0.8 and alt_model is None:
                alt_score = run_task(task_id, seed, alt_model="gemini-3.1-pro-preview")
                final_score = max(final_score, alt_score)
            break

        # Update observation
        obs = new_obs
        action_history.append(action_str)

        # Check if we should enter diagnosis phase
        if not in_diagnosis_phase:
            unread = get_unread_artifacts()
            all_read = len(unread) == 0
            enough_checks = len(sanity_check_history) >= 2
            if all_read and enough_checks and step_num >= min_steps:
                in_diagnosis_phase = True

        messages.append({"role": "assistant", "content": llm_out})
        messages.append({"role": "user", "content": build_user_msg(new_obs)})

        # Keep context window manageable
        if len(messages) > 40:
            messages = [messages[0], messages[1]] + messages[-26:]

    success = final_score >= SUCCESS_THRESHOLD
    log_end(success=success, steps=step_num, score=final_score, rewards=rewards)
    return final_score
568
+
569
+
570
def main():
    """CLI entry point: health-check the env server, then run the task suite."""
    parser = argparse.ArgumentParser(
        description="MLOps Pipeline Debugger — Baseline Agent"
    )
    parser.add_argument(
        "--task", choices=TASKS, help="Run a specific task (default: all)"
    )
    parser.add_argument(
        "--seed", type=int, default=42, help="Random seed for reproducibility"
    )
    args = parser.parse_args()

    # Abort early if the environment server isn't reachable.
    try:
        httpx.get(f"{ENV_BASE_URL}/health", timeout=10).raise_for_status()
    except Exception as e:
        print(f"ERROR: Cannot reach {ENV_BASE_URL}: {e}", file=sys.stderr)
        sys.exit(1)

    selected = [args.task] if args.task else TASKS
    scores = {}
    for task_name in selected:
        scores[task_name] = run_task(task_name, seed=args.seed)

    print(f"\n=== FINAL SCORES ===", flush=True)
    for task_name, task_score in scores.items():
        print(f" {task_name:8s}: {task_score:.4f}")
    print(f" {'AVERAGE':8s}: {sum(scores.values()) / len(scores):.4f}")
597
+
598
+
599
+ if __name__ == "__main__":
600
+ main()
mlops_environment.py ADDED
@@ -0,0 +1,461 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MLOps Pipeline Debugger — Core Environment
3
+
4
+ Episode flow:
5
+ 1. reset(task_id, seed) → generates a broken training run with one planted bug
6
+ 2. Agent investigates using 8 actions (reads artifacts, runs sanity checks)
7
+ 3. Agent submits a structured diagnosis
8
+ 4. Grader compares against planted bug ground truth → score in [0.0, 1.0]
9
+
10
+ Reward design (dense, not sparse):
11
+ +0.02 per new artifact read (first time — rewards exploration)
12
+ -0.02 per duplicate artifact read (no new filter applied)
13
+ -0.05 submitting diagnosis after reading < 3 distinct artifacts
14
+
15
+ At submit_diagnosis:
16
+ +0.15 correct failure_category
17
+ +0.25 correct root_cause_file
18
+ +0.30 correct root_cause_field (substring match, case-insensitive)
19
+ +0.30 correct proposed_fix (keyword match against gold fix)
20
+
21
+ Task 3 (hard) penalty multiplier:
22
+ wrong diagnosis → ×1.5 penalty on the missed components
23
+ (silent bugs that reach production are more costly)
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import random
29
+ from typing import Any, Dict, List, Optional, Tuple
30
+
31
+ from models import MLOpsAction, MLOpsObservation, MLOpsState, ArtifactMeta
32
+ from artifact_generator import (
33
+ ArtifactGenerator, BUG_CATALOGUE, TASK_BUG_POOLS,
34
+ run_sanity_check,
35
+ )
36
+
37
+
38
# Per-difficulty step budget for one episode.
TASK_MAX_STEPS = {"easy": 20, "medium": 30, "hard": 40}
39
+
40
# Human-readable task briefings shown to the agent at reset, keyed by task_id.
# These are runtime strings consumed by the agent prompt — do not reword.
TASK_DESCRIPTIONS = {
    "easy": (
        "TASK 1 — CONFIG ERROR DIAGNOSIS (Easy)\n\n"
        "A training run has failed or produced clearly wrong results. The issue is in "
        "the training configuration — a hyperparameter is set to an incorrect value that "
        "causes immediate, visible degradation in training metrics.\n\n"
        "Your job: investigate the training artifacts, identify which configuration "
        "parameter is wrong, and propose the correct fix.\n\n"
        "Strategy: Start by reading the training logs to observe symptom patterns, "
        "then check the config to find the misconfigured parameter. "
        "Run sanity checks (loss_trajectory, gradient_norms) to confirm your hypothesis "
        "before submitting.\n\n"
        "Actions available: read_config | read_logs | check_dataset_stats | "
        "inspect_preprocessing | read_eval_results | run_sanity_check | "
        "query_artifact | submit_diagnosis"
    ),
    "medium": (
        "TASK 2 — DATA LEAKAGE DETECTION (Medium)\n\n"
        "Training metrics look suspiciously good — validation accuracy is anomalously "
        "high from the first epoch, but test performance tells a different story. "
        "The issue is in the data preprocessing pipeline.\n\n"
        "Your job: identify the exact source of data leakage — whether it's a scaler "
        "fitted on the full dataset, overlapping train/val splits from a non-deterministic "
        "split, or an inverted split ratio — and propose the correct fix.\n\n"
        "Strategy: Anomalous val accuracy in the logs is your first signal. "
        "Inspect preprocessing code to find how splits are constructed. "
        "Run the data_leakage and feature_statistics sanity checks to confirm. "
        "The val/test metric gap in eval results is another key clue.\n\n"
        "Actions available: read_config | read_logs | check_dataset_stats | "
        "inspect_preprocessing | read_eval_results | run_sanity_check | "
        "query_artifact | submit_diagnosis"
    ),
    "hard": (
        "TASK 3 — SILENT EVALUATION BUG (Hard)\n\n"
        "Training completed normally. Validation metrics look reasonable. "
        "But test set performance is catastrophically below validation — "
        "and there are NO error logs, NO warnings, NO exceptions thrown.\n\n"
        "Your job: find the silent bug in the evaluation pipeline. It could be "
        "a label encoder mismatch between train and eval (different class orderings), "
        "a metric assignment swap (val/test results mislabeled), or a tokenizer "
        "version drift (training used v2, evaluation uses v1).\n\n"
        "Strategy: The val/test metric gap in eval_results is your only initial signal. "
        "Run metric_gap_analysis first to quantify the anomaly. Then systematically "
        "check label_consistency, encoder_version_match, and inspect the preprocessing "
        "code carefully — the bug produces no error output and will only be visible "
        "by comparing train vs eval pipeline definitions.\n\n"
        "WARNING: Missing this bug in a deployed model means silent wrong predictions "
        "in production. Penalty for wrong diagnosis is weighted 1.5×.\n\n"
        "Actions available: read_config | read_logs | check_dataset_stats | "
        "inspect_preprocessing | read_eval_results | run_sanity_check | "
        "query_artifact | submit_diagnosis"
    ),
}
93
+
94
# (description, size hint) per artifact file name; consumed when building
# ArtifactMeta entries at reset time.
ARTIFACT_DESCRIPTIONS = {
    "config.yaml": ("Training configuration — hyperparameters, model, optimizer, scheduler", "~45 lines"),
    "train.log": ("Epoch-by-epoch training metrics — loss, accuracy, gradient norms", "~30–60 lines"),
    "dataset_stats.json": ("Dataset split sizes, class distribution, feature statistics", "~35 fields"),
    "preprocessing.py": ("Data preprocessing pipeline — splits, normalization, encoding", "~40–70 lines"),
    "eval_results.json": ("Final evaluation metrics — val and test loss/accuracy", "~15 fields"),
    "model_card.json": ("Model architecture summary, training config, preprocessing versions", "~20 fields"),
}
102
+
103
+
104
+ class MLOpsEnvironment:
105
+ """OpenEnv-compatible MLOps Pipeline Debugging environment."""
106
+
107
    def __init__(self, task_id: str = "easy"):
        # task_id selects difficulty: one of "easy" | "medium" | "hard".
        assert task_id in TASK_MAX_STEPS, f"task_id must be one of {list(TASK_MAX_STEPS)}"
        self.task_id = task_id
        # Seed a deterministic default episode; reset() re-seeds per episode.
        self._reset_internal(seed=42)
111
+
112
    def _reset_internal(self, seed: int):
        """(Re)initialize all episode state: plant a bug, generate artifacts,
        cache artifact metadata, and zero the bookkeeping counters."""
        rng = random.Random(seed)

        # Pick bug from this task's pool
        pool = TASK_BUG_POOLS[self.task_id]
        self.bug_type = rng.choice(pool)
        self.bug = BUG_CATALOGUE[self.bug_type]

        # Generate all artifacts
        gen = ArtifactGenerator(self.bug_type, seed)
        self._artifacts: Dict[str, str] = gen.generate_all()
        self._model_cfg = gen.model_cfg
        self._run_id = gen.run_id
        self._rng = rng
        self._seed = seed

        # Cache artifact metadata at reset time (avoids consuming RNG per step)
        self._artifact_meta: List[ArtifactMeta] = [
            ArtifactMeta(
                name=name,
                description=ARTIFACT_DESCRIPTIONS[name][0],
                size_hint=ARTIFACT_DESCRIPTIONS[name][1],
                last_modified=f"2024-03-{rng.randint(1,28):02d}",
            )
            for name in self._artifacts
        ]

        # Episode state
        self._step_count = 0
        self._max_steps = TASK_MAX_STEPS[self.task_id]
        self._done = False
        self._artifacts_read: List[str] = []
        self._last_read_filters: Dict[str, str] = {}
        self._sanity_checks_run: List[str] = []
        self._duplicate_queries = 0
        self._current_score = 0.0
        self._messages: List[str] = []
149
+
150
+ # ── OpenEnv API ───────────────────────────────────────────────────────────
151
+
152
+ def reset(self, seed: Optional[int] = None) -> MLOpsObservation:
153
+ import time
154
+ actual_seed = seed if seed is not None else int(time.time() * 1000) % 100000
155
+ self._reset_internal(actual_seed)
156
+ return self._build_obs(
157
+ {"status": "reset", "message": "New training run loaded. Begin investigation."},
158
+ )
159
+
160
    def step(self, action: MLOpsAction) -> Tuple[MLOpsObservation, float, bool, Dict[str, Any]]:
        """Dispatch one agent action; returns (observation, reward, done, info).

        Artifact reads and sanity checks accrue small shaping rewards;
        submit_diagnosis ends the episode with a graded score.
        """
        if self._done:
            return self._build_obs({"status": "done", "message": "Episode over. Call reset()."}), 0.0, True, {}

        self._step_count += 1
        reward = 0.0
        info: Dict[str, Any] = {}
        result: Dict[str, Any] = {}

        # NOTE(review): with '>=' after incrementing, the action taken on the
        # final allowed step is discarded as a timeout (even submit_diagnosis),
        # giving the agent max_steps-1 usable steps — confirm intended.
        if self._step_count >= self._max_steps:
            self._done = True
            score = self._current_score
            result = {"status": "timeout", "message": f"Max steps ({self._max_steps}) reached.", "score": score}
            return self._build_obs(result), 0.0, True, {"score": score, "reason": "timeout"}

        atype = action.action_type

        # ── read_config ───────────────────────────────────────────────────
        if atype == "read_config":
            reward, result = self._handle_artifact_read("config.yaml", None)

        # ── read_logs ─────────────────────────────────────────────────────
        elif atype == "read_logs":
            reward, result = self._handle_artifact_read("train.log", action.log_filter)

        # ── check_dataset_stats ───────────────────────────────────────────
        elif atype == "check_dataset_stats":
            reward, result = self._handle_artifact_read("dataset_stats.json", None)

        # ── inspect_preprocessing ─────────────────────────────────────────
        elif atype == "inspect_preprocessing":
            reward, result = self._handle_artifact_read("preprocessing.py", None)

        # ── read_eval_results ─────────────────────────────────────────────
        elif atype == "read_eval_results":
            reward, result = self._handle_artifact_read("eval_results.json", None)

        # ── run_sanity_check ──────────────────────────────────────────────
        elif atype == "run_sanity_check":
            check = action.sanity_check_type
            if not check:
                result = {"status": "error", "message": "sanity_check_type is required."}
            else:
                check_result = run_sanity_check(check, self.bug_type, self._artifacts, self._rng)
                if check not in self._sanity_checks_run:
                    self._sanity_checks_run.append(check)
                    reward += 0.01  # small reward for running new checks
                result = {"status": "ok", "sanity_check": check_result}

        # ── query_artifact ────────────────────────────────────────────────
        elif atype == "query_artifact":
            art = action.artifact_name
            field = action.field_path
            if not art or not field:
                result = {"status": "error", "message": "artifact_name and field_path are required."}
            elif art not in self._artifacts:
                result = {"status": "error", "message": f"Artifact '{art}' not found."}
            else:
                val = self._resolve_field(art, field)
                result = {"status": "ok", "artifact": art, "field": field, "value": val}

        # ── submit_diagnosis ──────────────────────────────────────────────
        elif atype == "submit_diagnosis":
            reward, info, result = self._handle_submit(action)
            self._done = True

        obs = self._build_obs(result)
        return obs, reward, self._done, info
+ return obs, reward, self._done, info
228
+
229
+ # ── Internal handlers ──────────────────────────────────────────────────────
230
+
231
+ def _handle_artifact_read(self, artifact: str, log_filter: Optional[str]) -> Tuple[float, Dict]:
232
+ is_duplicate = (
233
+ artifact in self._artifacts_read
234
+ and self._last_read_filters.get(artifact, "") == (log_filter or "")
235
+ )
236
+
237
+ content = self._artifacts[artifact]
238
+
239
+ # Apply log filter
240
+ if artifact == "train.log" and log_filter:
241
+ lines = content.split("\n")
242
+ if log_filter.startswith("epoch:"):
243
+ try:
244
+ parts = log_filter.split(":")[1].split("-")
245
+ start, end = int(parts[0]), int(parts[1]) if len(parts) > 1 else int(parts[0])
246
+ filtered = [l for l in lines if any(f"EPOCH {ep:03d}" in l
247
+ for ep in range(start, end+1)) or "[INFO ]" in l or "[ERROR" in l]
248
+ content = "\n".join(filtered) if filtered else "No log lines match this epoch range."
249
+ except Exception:
250
+ content = "\n".join(lines)
251
+ else:
252
+ kw = log_filter.lower()
253
+ filtered = [l for l in lines if kw in l.lower()]
254
+ content = "\n".join(filtered) if filtered else f"No log lines contain '{log_filter}'."
255
+
256
+ reward = 0.0
257
+ if artifact not in self._artifacts_read:
258
+ self._artifacts_read.append(artifact)
259
+ reward = 0.02 # first read reward
260
+ elif is_duplicate:
261
+ self._duplicate_queries += 1
262
+ reward = -0.02 # duplicate penalty
263
+ self._messages.append(f"⚠️ Duplicate read of {artifact} with same filter. Try a different filter or a new artifact.")
264
+
265
+ self._last_read_filters[artifact] = log_filter or ""
266
+
267
+ return reward, {
268
+ "status": "ok",
269
+ "artifact": artifact,
270
+ "content": content,
271
+ "note": "Use log_filter='keyword' or 'epoch:N-M' for targeted log queries.",
272
+ }
273
+
274
def _handle_submit(self, action: MLOpsAction) -> Tuple[float, Dict, Dict]:
    """Grade a submitted diagnosis and return (score, info, result).

    Score components (max 1.0): +0.15 failure_category, +0.25 root_cause_file,
    +0.30 root_cause_field (majority-keyword match), +0.30 proposed_fix
    (fractional keyword overlap against the gold fix). A -0.05 penalty applies
    when fewer than 3 artifacts were read, and the hard task amplifies missed
    components by 1.5x. The final score is clamped to [0, 1].
    """
    if len(self._artifacts_read) < 3:
        # Penalty for submitting without adequate investigation
        base_penalty = -0.05
        self._messages.append("⚠️ Submitted diagnosis after reading fewer than 3 artifacts.")
    else:
        base_penalty = 0.0

    score = base_penalty
    breakdown: Dict[str, Any] = {}

    # 1. failure_category (+0.15) — exact match against the planted bug's category
    if action.failure_category == self.bug.category:
        score += 0.15
        breakdown["failure_category"] = {"awarded": 0.15, "correct": True}
    else:
        breakdown["failure_category"] = {
            "awarded": 0.0, "correct": False,
            "expected": self.bug.category, "got": action.failure_category,
        }

    # 2. root_cause_file (+0.25) — case-insensitive exact filename match
    if action.root_cause_file and action.root_cause_file.lower() == self.bug.file.lower():
        score += 0.25
        breakdown["root_cause_file"] = {"awarded": 0.25, "correct": True}
    else:
        breakdown["root_cause_file"] = {
            "awarded": 0.0, "correct": False,
            "expected": self.bug.file, "got": action.root_cause_file,
        }

    # 3. root_cause_field (+0.30) — require majority of keywords to match.
    # Dots in the gold field are treated as word separators; 1-char tokens dropped.
    field_keywords = [kw.lower() for kw in self.bug.field.replace(".", " ").split() if len(kw) > 1]
    submitted_field = (action.root_cause_field or "").lower()
    field_matches = sum(1 for kw in field_keywords if kw in submitted_field)
    field_threshold = max(1, len(field_keywords) // 2 + 1)  # majority
    field_correct = len(field_keywords) > 0 and field_matches >= field_threshold
    if field_correct:
        score += 0.30
        breakdown["root_cause_field"] = {"awarded": 0.30, "correct": True}
    else:
        breakdown["root_cause_field"] = {
            "awarded": 0.0, "correct": False,
            "expected": self.bug.field, "got": action.root_cause_field,
            "matched_keywords": field_matches, "required": field_threshold,
        }

    # 4. proposed_fix (+0.30) — keyword match against gold fix, stop-words removed,
    # awarded proportionally to overlap and capped at 0.30.
    import re as _re
    _stop = {"to", "the", "a", "an", "of", "in", "on", "from", "use", "with", "and", "or", "for", "is", "at", "by"}
    # Strip punctuation from keywords so "(fitted" becomes "fitted"
    fix_keywords = {
        _re.sub(r'[^a-z0-9_.]', '', w)
        for w in self.bug.gold_fix.lower().split()
    } - _stop
    fix_keywords.discard("")  # remove empty strings
    submitted_fix = (action.proposed_fix or "").lower()
    fix_overlap = sum(1 for kw in fix_keywords if kw in submitted_fix)
    fix_score = min(0.30, 0.30 * (fix_overlap / max(1, len(fix_keywords))))
    score += fix_score
    breakdown["proposed_fix"] = {
        "awarded": round(fix_score, 4),
        # "correct" here means a 2/3-or-better overlap, not an exact match
        "correct": fix_score >= 0.20,
        "keyword_overlap": fix_overlap,
        "total_keywords": len(fix_keywords),
    }

    # Hard task penalty multiplier — silent bugs are more costly
    if self.task_id == "hard" and score < 0.70:
        missed = 0.70 - min(score, 0.70)
        score -= missed * 0.5  # 1.5× penalty on missed components
        breakdown["hard_task_penalty_applied"] = True

    score = round(max(0.0, min(1.0, score)), 4)
    self._current_score = score

    # info: full grading detail including ground truth (for harness/logging);
    # result: agent-facing summary embedded in the next observation.
    info = {
        "score": score,
        "breakdown": breakdown,
        "ground_truth": {
            "bug_type": self.bug_type,
            "category": self.bug.category,
            "file": self.bug.file,
            "field": self.bug.field,
            "gold_fix": self.bug.gold_fix,
        },
        "investigation": {
            "artifacts_read": self._artifacts_read,
            "sanity_checks_run": self._sanity_checks_run,
            "duplicate_queries": self._duplicate_queries,
            "steps_taken": self._step_count,
        },
    }
    result = {
        "status": "submitted",
        "score": score,
        "breakdown": breakdown,
        "message": f"Diagnosis submitted. Score: {score:.4f}/{1.0:.4f}",
    }
    return score, info, result
374
+
375
+ def _resolve_field(self, artifact: str, field_path: str) -> Any:
376
+ """Resolve a dot-notation field path from a JSON artifact."""
377
+ import json as _json
378
+ content = self._artifacts[artifact]
379
+ if artifact.endswith(".json"):
380
+ try:
381
+ data = _json.loads(content)
382
+ parts = field_path.split(".")
383
+ val = data
384
+ for p in parts:
385
+ if isinstance(val, dict):
386
+ val = val.get(p, f"Field '{p}' not found")
387
+ else:
388
+ return f"Cannot traverse into non-dict at '{p}'"
389
+ return val
390
+ except Exception as e:
391
+ return f"Parse error: {e}"
392
+ elif artifact.endswith(".yaml"):
393
+ # Simple key search for YAML
394
+ for line in content.split("\n"):
395
+ target_key = field_path.split(".")[-1]
396
+ if f"{target_key}:" in line:
397
+ return line.strip()
398
+ return f"Field '{field_path}' not found in config"
399
+ else:
400
+ # For .py files, return lines containing the field name
401
+ target = field_path.split(".")[-1]
402
+ matches = [l.strip() for l in content.split("\n") if target in l]
403
+ return matches[:5] if matches else f"'{target}' not found in {artifact}"
404
+
405
def _build_obs(self, last_result: Dict[str, Any]) -> MLOpsObservation:
    """Assemble the observation returned to the agent after reset/step."""
    run_status = "FAILED" if self.task_id == "easy" else "COMPLETED_WITH_ANOMALIES"
    summary = {
        "model": self._model_cfg["name"],
        "dataset": self._model_cfg["dataset"],
        "task": self._model_cfg["type"],
        "status": run_status,
        "note": "Investigate artifacts to determine root cause.",
    }
    return MLOpsObservation(
        task_id=self.task_id,
        task_description=TASK_DESCRIPTIONS[self.task_id],
        run_id=self._run_id,
        run_summary=summary,
        available_artifacts=list(self._artifact_meta),
        artifacts_read=list(self._artifacts_read),
        last_action_result=last_result,
        step_count=self._step_count,
        max_steps=self._max_steps,
        done=self._done,
        messages=list(self._messages),
    )
425
+
426
@property
def state(self) -> MLOpsState:
    """Full privileged state snapshot (includes the planted-bug ground truth).

    Intended for the RL harness and debugging only — never expose to the agent.
    """
    snapshot = dict(
        task_id=self.task_id,
        seed=self._seed,
        step_count=self._step_count,
        max_steps=self._max_steps,
        episode_done=self._done,
        bug_type=self.bug_type,
        bug_category=self.bug.category,
        bug_file=self.bug.file,
        bug_field=self.bug.field,
        gold_fix=self.bug.gold_fix,
        artifacts=self._artifacts,
        artifacts_read=list(self._artifacts_read),
        sanity_checks_run=list(self._sanity_checks_run),
        duplicate_queries=self._duplicate_queries,
        current_score=self._current_score,
    )
    return MLOpsState(**snapshot)
445
+
446
+
447
+ # ─── Standalone grader ────────────────────────────────────────────────────────
448
+
449
def grade_task(task_id: str, seed: int, diagnosis: Dict[str, Any]) -> float:
    """Deterministic grader callable by the OpenEnv validation framework.

    Bypasses the artifact-read penalty since the grader only evaluates
    diagnosis quality, not investigation thoroughness.

    Args:
        task_id: one of the configured task ids ("easy"/"medium"/"hard").
        seed: episode seed; grading is deterministic for a fixed seed.
        diagnosis: keyword fields for a submit_diagnosis MLOpsAction.

    Returns:
        The terminal score in [0.0, 1.0].
    """
    env = MLOpsEnvironment(task_id=task_id)
    env.reset(seed=seed)
    # Mark every artifact as read so the "< 3 artifacts" penalty never fires.
    env._artifacts_read = list(env._artifacts.keys())
    submit = MLOpsAction(action_type="submit_diagnosis", **diagnosis)
    _, _, _, info = env.step(submit)
    return info.get("score", 0.0)
models.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MLOps Pipeline Debugger — Pydantic Models
3
+
4
+ The agent acts as an ML engineer investigating a broken training run.
5
+ It has access to training artifacts (logs, configs, dataset stats, preprocessing code)
6
+ and must diagnose the root cause through systematic investigation.
7
+
8
+ Action Space:
9
+ read_config → Get training configuration (hyperparams, model arch, optimizer)
10
+ read_logs → Get training logs (filterable by keyword/epoch range)
11
+ check_dataset_stats → Get dataset split sizes, class distribution, feature statistics
12
+ inspect_preprocessing → Read preprocessing pipeline code
13
+ read_eval_results → Get validation and test set evaluation metrics
14
+ run_sanity_check → Compute a specific diagnostic check (label overlap, class balance, etc.)
15
+ query_artifact → Fetch a specific field from any artifact
16
+ submit_diagnosis → Final answer — triggers grading
17
+
18
+ Observation Space:
19
+ task_id, task_description
20
+ available_artifacts — list of artifacts the agent can inspect
21
+ last_action_result — result of the most recent action
22
+ artifacts_read — which artifacts have been read so far (exploration tracking)
23
+ step_count, max_steps
24
+ done
25
+ """
26
+
27
+ from __future__ import annotations
28
+ from typing import Any, Dict, List, Literal, Optional
29
+ from pydantic import BaseModel, Field
30
+
31
+
32
+ # ─── Action ──────────────────────────────────────────────────────────────────
33
+
34
class MLOpsAction(BaseModel):
    """
    One action the agent can take per step.

    action_type determines which fields are used:
        read_config           → (no extra fields)
        read_logs             → log_filter (optional keyword or "epoch:N-M")
        check_dataset_stats   → (no extra fields)
        inspect_preprocessing → (no extra fields)
        read_eval_results     → (no extra fields)
        run_sanity_check      → sanity_check_type (required)
        query_artifact        → artifact_name + field_path (required)
        submit_diagnosis      → all diagnosis fields (required)

    Literal-typed fields are validated by pydantic before the environment's
    step() runs, so out-of-vocabulary values are rejected at the API boundary.
    """
    action_type: Literal[
        "read_config",
        "read_logs",
        "check_dataset_stats",
        "inspect_preprocessing",
        "read_eval_results",
        "run_sanity_check",
        "query_artifact",
        "submit_diagnosis",
    ] = Field(..., description="Which action to perform")

    # read_logs
    log_filter: Optional[str] = Field(
        None,
        description="Filter logs by keyword (e.g. 'nan', 'warning', 'error') or epoch range 'epoch:1-5'"
    )

    # run_sanity_check
    sanity_check_type: Optional[Literal[
        "label_consistency",       # Are train/eval label mappings identical?
        "data_leakage",            # Is there train/val sample overlap?
        "gradient_norms",          # Are gradient norms within normal range?
        "class_balance",           # Are classes balanced across splits?
        "feature_statistics",      # Do train/val feature distributions match?
        "encoder_version_match",   # Do all pipeline stages use the same encoder version?
        "loss_trajectory",         # Is the loss curve shape anomalous?
        "metric_gap_analysis",     # Is val vs test metric gap suspiciously large?
    ]] = Field(None, description="Type of sanity check to run")

    # query_artifact
    artifact_name: Optional[Literal[
        "config.yaml",
        "train.log",
        "dataset_stats.json",
        "preprocessing.py",
        "eval_results.json",
        "model_card.json",
    ]] = Field(None, description="Artifact to query a specific field from")
    field_path: Optional[str] = Field(
        None,
        description="Dot-notation field path, e.g. 'optimizer.learning_rate' or 'metrics.val_accuracy'"
    )

    # submit_diagnosis — all four fields are graded independently; see the
    # environment's _handle_submit for the exact scoring weights.
    failure_category: Optional[Literal[
        "config_error",        # Wrong hyperparameter value
        "data_leakage",        # Train/val contamination
        "evaluation_bug",      # Eval pipeline uses wrong artifacts
        "preprocessing_bug",   # Data transformation applied incorrectly
        "label_mismatch",      # Label encoding inconsistency
        "architecture_bug",    # Model architecture misconfiguration
    ]] = Field(None, description="Category of the failure")
    root_cause_file: Optional[str] = Field(
        None, description="Which artifact file contains the root cause"
    )
    root_cause_field: Optional[str] = Field(
        None, description="Specific parameter, function, or variable that is wrong"
    )
    diagnosis: Optional[str] = Field(
        None, description="Natural language explanation of what went wrong and why"
    )
    proposed_fix: Optional[str] = Field(
        None, description="Concrete change that would fix the issue"
    )
111
+ )
112
+
113
+
114
+ # ─── Observation ─────────────────────────────────────────────────────────────
115
+
116
class ArtifactMeta(BaseModel):
    """Lightweight descriptor for an artifact listed in the observation
    (the full content is only returned by the read_* actions)."""
    name: str           # artifact filename, e.g. "config.yaml"
    description: str    # one-line summary of what the artifact contains
    size_hint: str      # e.g. "47 lines", "12 fields"
    last_modified: str  # human-readable timestamp string
121
+
122
+
123
class MLOpsObservation(BaseModel):
    """Everything the agent sees after each step / reset.

    Deliberately excludes ground truth (bug type, gold fix) — that lives only
    in MLOpsState for the harness.
    """
    task_id: str
    task_description: str

    # Run summary — always visible
    run_id: str
    run_summary: Dict[str, Any] = Field(
        description="High-level run info: model, dataset, final loss, training status"
    )

    available_artifacts: List[ArtifactMeta]
    artifacts_read: List[str] = Field(
        default_factory=list,
        description="Names of artifacts the agent has already read"
    )

    # Result dict of the most recent action (content, errors, grading summary).
    last_action_result: Dict[str, Any] = Field(default_factory=dict)

    step_count: int = 0
    max_steps: int = 30
    done: bool = False
    # Advisory strings accumulated by the environment (e.g. duplicate-read warnings).
    messages: List[str] = Field(default_factory=list)
146
+
147
+
148
+ # ─── State ───────────────────────────────────────────────────────────────────
149
+
150
class MLOpsState(BaseModel):
    """Full internal state — for RL harness and debugging.

    Contains the planted-bug ground truth; must never be exposed to the agent.
    """
    task_id: str
    seed: int
    step_count: int
    max_steps: int
    episode_done: bool

    # Planted bug ground truth
    bug_type: str      # bug pool identifier, e.g. "exploding_lr"
    bug_category: str  # graded failure_category value
    bug_file: str      # artifact containing the root cause
    bug_field: str     # specific field/parameter that is wrong
    gold_fix: str      # reference fix text used for keyword grading

    # All generated artifacts (full text)
    artifacts: Dict[str, str]

    # Agent's investigation history
    artifacts_read: List[str]
    sanity_checks_run: List[str]
    duplicate_queries: int

    current_score: float
openenv.yaml ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: mlops-debug-env
2
+ version: "1.0.0"
3
+ description: >
4
+ MLOps Pipeline Debugger: an AI agent acts as a senior ML engineer
5
+ investigating a broken training run. The environment procedurally generates
6
+ realistic training artifacts (logs, configs, preprocessing code, eval results)
7
+ with one planted fault. The agent must systematically investigate and submit
8
+ a structured diagnosis. Three tasks: config error (easy) → data leakage (medium)
9
+ → silent evaluation bug (hard). All graders are fully deterministic.
10
+ author: Mohit Goyal
11
+ license: MIT
12
+ tags: [openenv, rl, mlops, debugging, machine-learning, agents]
13
+ tasks:
14
+ - id: easy
15
+ name: Config Error Diagnosis
16
+ difficulty: easy
17
+ max_steps: 20
18
+ bug_pool: [exploding_lr, wrong_optimizer, batch_size_overflow]
19
+ reward_range: [0.0, 1.0]
20
+ - id: medium
21
+ name: Data Leakage Detection
22
+ difficulty: medium
23
+ max_steps: 30
24
+ bug_pool: [data_leakage_scaler, data_leakage_overlap, wrong_split_ratio]
25
+ reward_range: [0.0, 1.0]
26
+ - id: hard
27
+ name: Silent Evaluation Bug
28
+ difficulty: hard
29
+ max_steps: 40
30
+ bug_pool: [label_encoder_mismatch, silent_metric_swap, tokenizer_version_drift]
31
+ reward_range: [0.0, 1.0]
32
+ asymmetric_penalty: true
33
+ action_space:
34
+ type: discrete_structured
35
+ actions: [read_config, read_logs, check_dataset_stats, inspect_preprocessing,
36
+ read_eval_results, run_sanity_check, query_artifact, submit_diagnosis]
37
+ observation_space:
38
+ type: structured_text
39
+ fields: [task_id, run_summary, available_artifacts, artifacts_read,
40
+ last_action_result, step_count, max_steps, done, messages]
41
+ reward:
42
+ type: dense_and_terminal
43
+ per_step: "+0.02 new artifact read, -0.02 duplicate read, +0.01 new sanity check"
44
+ terminal: "0.15 category + 0.25 file + 0.30 field + 0.30 fix. Hard task 1.5x penalty."
45
+ api:
46
+ reset: POST /reset
47
+ step: POST /step
48
+ state: GET /state
49
+ health: GET /health
50
+ websocket: /ws
51
+ runtime:
52
+ port: 7860
53
+ workers: 1
54
+ framework: fastapi
55
+ python: "3.11"
openenv_state.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from datetime import datetime
4
+ from typing import Dict, List
5
+
6
+ from pydantic import BaseModel
7
+
8
+
9
+ class OpenEnvState(BaseModel):
10
+ run_id: str
11
+ task_id: str
12
+ seed: int
13
+ step_count: int
14
+ max_steps: int
15
+ scores: Dict[str, float]
16
+ end_score: float
17
+ rewards: List[float]
18
+ artifacts_read: List[str]
19
+ timestamp: str
20
+
21
+
22
+ # Global current state (mutable during run)
23
+ OPENENV_STATE: OpenEnvState = OpenEnvState(
24
+ run_id="",
25
+ task_id="",
26
+ seed=0,
27
+ step_count=0,
28
+ max_steps=30,
29
+ scores={"easy": 0.0, "medium": 0.0, "hard": 0.0},
30
+ end_score=0.0,
31
+ rewards=[],
32
+ artifacts_read=[],
33
+ timestamp=datetime.utcnow().isoformat(),
34
+ )
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.115.0
2
+ uvicorn[standard]==0.30.6
3
+ pydantic==2.9.2
4
+ httpx==0.27.2
5
+ openai==1.51.0
6
+ websockets==13.1
7
+ numpy==1.26.4
8
+ python-multipart==0.0.12
validate-submission.sh ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ #
3
+ # validate-submission.sh — OpenEnv Submission Validator
4
+ #
5
+ # This script validates that your submission meets all competition requirements.
6
+ # It checks:
7
+ # 1. Docker build passes
8
+ # 2. openenv validate passes
9
+ # 3. Server responds to /health and /reset
10
+ # 4. Baseline inference script reproduces scores
11
+ # 5. 3+ tasks with graders produce scores in 0.0-1.0 range
12
+
13
+ set -uo pipefail
14
+
15
+ DOCKER_BUILD_TIMEOUT=600
16
+
17
+ if [ -t 1 ]; then
18
+ RED='\033[0;31m'
19
+ GREEN='\033[0;32m'
20
+ YELLOW='\033[1;33m'
21
+ BOLD='\033[1m'
22
+ NC='\033[0m'
23
+ else
24
+ RED=''
25
+ GREEN=''
26
+ YELLOW=''
27
+ BOLD=''
28
+ NC=''
29
+ fi
30
+
31
+ log() { echo -e "${GREEN}[OK]${NC} $*"; }
32
+ warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
33
+ err() { echo -e "${RED}[ERROR]${NC} $*"; }
34
+ fail() { echo -e "${RED}[FAIL]${NC} $*"; exit 1; }
35
+
36
+ check() {
37
+ local cmd="$1"
38
+ local name="$2"
39
+ echo -n "Checking $name... "
40
+ if eval "$cmd" &>/dev/null; then
41
+ log "$name"
42
+ return 0
43
+ else
44
+ fail "$name"
45
+ return 1
46
+ fi
47
+ }
48
+
49
# Check prerequisites
check "command -v docker" "Docker installed"
check "command -v python" "Python installed"

#######################################
# Remove the test container. Registered as an EXIT trap so a failed check
# (fail() exits mid-script) no longer leaks a running container.
#######################################
cleanup() {
  docker stop mlops-test 2>/dev/null || true
  docker rm mlops-test 2>/dev/null || true
}
trap cleanup EXIT

# Build Docker
log "Building Docker image..."
cd "$(dirname "$0")" || fail "cannot cd to script directory"
if docker build -t mlops-debug-env .; then
  log "Docker build passed"
else
  fail "Docker build failed"
fi

# Start container in background (check it actually launched)
log "Starting server..."
docker run -d -p 7860:7860 --name mlops-test mlops-debug-env || fail "docker run failed"
sleep 5

# Check health endpoint; -f makes curl fail on HTTP >= 400, not just transport errors
check "curl -fs http://localhost:7860/health" "Server /health responds"

# Test reset endpoint
log "Testing /reset..."
RESET_RESP=$(curl -fs -X POST http://localhost:7860/reset -H "Content-Type: application/json" -d '{"task_id":"easy","seed":42}')
if echo "$RESET_RESP" | grep -q "task_id"; then
  log "/reset works"
else
  fail "/reset failed"
fi

# Test inference script (non-fatal: warn only on format mismatch)
log "Running baseline inference..."
export API_BASE_URL="https://api.openai.com/v1"
# TODO(review): MODEL_NAME is a Gemini model but API_BASE_URL points at
# OpenAI — confirm which provider the baseline inference.py expects.
export MODEL_NAME="gemini-2.5-flash"
export HF_TOKEN="${HF_TOKEN:-test}"
export ENV_BASE_URL="http://localhost:7860"

if python inference.py --task easy --seed 42 2>&1 | grep -q "score="; then
  log "Inference script format correct"
else
  warn "Inference script may have format issues"
fi

# Cleanup (idempotent — also runs again via the EXIT trap)
log "Cleaning up..."
cleanup

echo ""
echo "========================================="
log "All validation checks passed!"
echo "========================================="
+ echo "========================================="