trretretret commited on
Commit
6a71058
·
1 Parent(s): 696c12c

Add uv.lock, pyproject.toml, server directory

Browse files
pyproject.toml CHANGED
@@ -17,8 +17,12 @@ dependencies = [
17
  "numpy>=1.26.0",
18
  "python-multipart>=0.0.12",
19
  "dotenv>=1.0.0",
 
20
  ]
21
 
 
 
 
22
  [project.optional-dependencies]
23
  dev = [
24
  "pytest>=8.0.0",
 
17
  "numpy>=1.26.0",
18
  "python-multipart>=0.0.12",
19
  "dotenv>=1.0.0",
20
+ "openenv-core>=0.2.0",
21
  ]
22
 
23
+ [project.scripts]
+ # NOTE(review): [project.scripts] values must be "module:callable" entry points
+ # (PEP 621 / packaging spec); embedding CLI flags here produces a broken
+ # console script on install. Run the server with
+ #   uvicorn app:app --host 0.0.0.0 --port 7860
+ # or add a main() wrapper function and point this entry at it.
+ server = "uvicorn:app --host 0.0.0.0 --port 7860"
25
+
26
  [project.optional-dependencies]
27
  dev = [
28
  "pytest>=8.0.0",
server/.env.example ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ GEMINI_API_KEY=your_gemini_api_key_here
2
+ MODEL_NAME=gemini-2.5-flash
3
+ ENV_BASE_URL=https://angelgupta-mlops-openenv.hf.space
4
+ API_BASE_URL=https://generativelanguage.googleapis.com/v1beta/openai/
5
+ HF_TOKEN=your_hf_token_here
server/app.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import json
3
+ from typing import Any, Dict, Optional
4
+ from fastapi import FastAPI, HTTPException, WebSocket, WebSocketDisconnect, Request
5
+ from openenv_state import OPENENV_STATE, OpenEnvState
6
+ from fastapi.middleware.cors import CORSMiddleware
7
+ from pydantic import BaseModel
8
+
9
+ from models import MLOpsAction, MLOpsObservation, MLOpsState
10
+ from mlops_environment import MLOpsEnvironment
11
+
12
app = FastAPI(
    title="MLOps Pipeline Debugger",
    description="OpenEnv environment: AI agent diagnoses broken ML training runs.",
    version="1.0.0",
)

# Wide-open CORS: the environment is meant to be driven by browser demos and
# remote agents alike.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Single environment instance shared by the plain-HTTP endpoints;
# (re)created on every POST /reset.
_http_env: Optional[MLOpsEnvironment] = None
20
+
21
+
22
class ResetRequest(BaseModel):
    """Payload accepted by POST /reset.

    Both ``task_id`` (canonical) and ``task`` (legacy) are accepted so that
    older clients keep working.
    """

    task_id: Optional[str] = "easy"
    seed: Optional[int] = None
    task: Optional[str] = None
26
+
27
+
28
class StepResponse(BaseModel):
    """Envelope returned by POST /step: one (observation, reward, done, info) tuple."""

    observation: MLOpsObservation
    reward: float
    done: bool
    info: Dict[str, Any]
33
+
34
+
35
@app.post("/reset", response_model=MLOpsObservation)
async def reset(request: Request):
    """Start a fresh episode, replacing any previously created HTTP environment."""
    try:
        body = await request.json()
    except Exception:
        # Empty or non-JSON bodies simply fall back to defaults.
        body = {}

    # Accept "task_id" (canonical) or "task" (legacy); default to the easy task.
    selected_task = body.get("task_id") or body.get("task") or "easy"
    episode_seed = body.get("seed")

    global _http_env
    _http_env = MLOpsEnvironment(task_id=selected_task)
    return _http_env.reset(seed=episode_seed)
46
+
47
+
48
@app.get("/")
async def root():
    """Describe the service and enumerate its endpoints."""
    endpoints = {
        "GET /": "This message",
        "GET /health": "Health check",
        "GET /tasks": "List available tasks",
        "GET /openenv/state": "OpenEnv state",
        "POST /reset": "Start a new episode",
        "POST /step": "Take an action",
        "GET /state": "Get current state",
    }
    return {
        "message": "MLOps Pipeline Debugger API",
        "version": "1.0.0",
        "docs": "This is an OpenEnv-compatible RL environment",
        "endpoints": endpoints,
    }
64
+
65
+
66
@app.get("/health")
async def health():
    """Liveness probe used by the hosting platform."""
    return {
        "status": "ok",
        "environment": "mlops_debug_env",
        "version": "1.0.0",
    }
69
+
70
+
71
@app.get("/openenv/state", response_model=OpenEnvState)
def openenv_state():
    """Expose the static OpenEnv capability descriptor."""
    return OPENENV_STATE
74
+
75
+
76
@app.get("/tasks")
async def list_tasks():
    """Enumerate the built-in task presets and their step budgets."""
    # (task_id, display name, max_steps); difficulty equals the task_id.
    catalogue = [
        ("easy", "Config Error Diagnosis", 20),
        ("medium", "Data Leakage Detection", 30),
        ("hard", "Silent Evaluation Bug", 40),
    ]
    return {
        "tasks": [
            {
                "task_id": task_id,
                "name": name,
                "difficulty": task_id,
                "max_steps": max_steps,
            }
            for task_id, name, max_steps in catalogue
        ]
    }
100
+
101
+
102
@app.post("/step", response_model=StepResponse)
async def step(request: Request):
    """Apply one action to the current episode and return the transition."""
    if _http_env is None:
        raise HTTPException(400, "Call /reset first.")

    # Read the raw body; tolerate empty / non-JSON payloads.
    try:
        body = await request.json()
    except Exception:
        body = {}

    # The action may arrive flattened, nested under "action", or as a bare
    # "message" string (chat-style clients).
    parsed: Optional[MLOpsAction] = None
    try:
        if "action_type" in body:
            parsed = MLOpsAction(**body)
        elif "action" in body:
            parsed = MLOpsAction(**body["action"])
        elif "message" in body:
            parsed = MLOpsAction(action_type=body["message"])
    except Exception as exc:
        raise HTTPException(422, f"Invalid action: {str(exc)}")

    if parsed is None or parsed.action_type is None:
        raise HTTPException(422, "Field required: action_type")

    try:
        obs, reward, done, info = _http_env.step(parsed)
        return StepResponse(observation=obs, reward=reward, done=done, info=info)
    except Exception as exc:
        raise HTTPException(500, f"Step error: {str(exc)}")
133
+
134
+
135
@app.get("/state", response_model=MLOpsState)
async def state():
    """Return the full internal state of the active HTTP episode."""
    if _http_env is None:
        raise HTTPException(400, "Call /reset first.")
    return _http_env.state
140
+
141
+
142
@app.websocket("/ws")
async def ws_endpoint(websocket: WebSocket):
    """JSON-over-WebSocket interface supporting reset / step / state methods.

    Each connection owns its own environment instance, independent of the
    shared HTTP `_http_env`. Messages are JSON objects with a "method" key;
    replies echo the method name alongside the result payload.
    """
    await websocket.accept()
    env: Optional[MLOpsEnvironment] = None
    try:
        while True:
            msg = json.loads(await websocket.receive_text())
            method = msg.get("method")
            if method == "reset":
                env = MLOpsEnvironment(task_id=msg.get("task_id", "easy"))
                obs = env.reset(seed=msg.get("seed"))
                await websocket.send_text(
                    json.dumps({"method": "reset", "observation": obs.model_dump()})
                )
            elif method == "step":
                if env is None:
                    await websocket.send_text(json.dumps({"error": "Call reset first"}))
                    continue
                # Robustness fix: a malformed action (or an environment error)
                # previously raised out of this handler and silently killed the
                # connection. Report the failure and keep the session alive.
                try:
                    action = MLOpsAction(**msg.get("action", {}))
                    obs, reward, done, info = env.step(action)
                except Exception as exc:
                    await websocket.send_text(
                        json.dumps({"error": f"Invalid step: {exc}"})
                    )
                    continue
                await websocket.send_text(
                    json.dumps(
                        {
                            "method": "step",
                            "observation": obs.model_dump(),
                            "reward": reward,
                            "done": done,
                            "info": info,
                        }
                    )
                )
            elif method == "state":
                if env is None:
                    await websocket.send_text(json.dumps({"error": "Call reset first"}))
                    continue
                await websocket.send_text(
                    json.dumps({"method": "state", "state": env.state.model_dump()})
                )
            else:
                # Robustness fix: unknown methods used to be dropped silently,
                # leaving the client waiting forever for a reply.
                await websocket.send_text(
                    json.dumps({"error": f"Unknown method: {method}"})
                )
    except WebSocketDisconnect:
        pass
server/artifact_generator.py ADDED
@@ -0,0 +1,960 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Artifact Generator for MLOps Pipeline Debugger
3
+
4
+ Generates a full set of realistic ML training artifacts for a given bug scenario.
5
+ Each artifact is internally consistent — config matches logs, dataset stats match
6
+ preprocessing code — except for the one planted fault.
7
+
8
+ Bug types supported:
9
+ Task 1 (easy):
10
+ - exploding_lr : learning_rate too large → loss diverges to NaN
11
+ - wrong_optimizer : SGD with momentum=0.99 on non-convex problem
12
+ - batch_size_overflow: batch_size > dataset size → trivial overfitting signal
13
+
14
+ Task 2 (medium):
15
+ - data_leakage_scaler : StandardScaler fit on full dataset before split
16
+ - data_leakage_overlap : train/val split with random_state=None → overlap
17
+ - wrong_split_ratio : test data accidentally included in training
18
+
19
+ Task 3 (hard):
20
+ - label_encoder_mismatch : train/eval use different LabelEncoder.fit() orderings
21
+ - silent_metric_swap : val and test metric names swapped in eval code
22
+ - tokenizer_version_drift: training uses tokenizer v1, eval uses v2 (different vocab)
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import json
28
+ import random
29
+ import textwrap
30
+ from dataclasses import dataclass, field
31
+ from typing import Dict, Tuple
32
+
33
+ import numpy as np
34
+
35
+
36
+ # ─── Bug Specifications ───────────────────────────────────────────────────────
37
+
38
@dataclass
class BugSpec:
    """Ground-truth description of one planted fault.

    Mirrors what a diagnosing agent must produce: the failure category, the
    artifact file containing the root cause, the exact field within that file,
    and the canonical ("gold") fix text.
    """

    bug_type: str
    category: str          # maps to failure_category in Action
    file: str              # root_cause_file
    field: str             # root_cause_field
    gold_fix: str
    task_difficulty: str   # easy / medium / hard
46
+
47
+
48
# Ground-truth catalogue keyed by bug_type, grouped easy → medium → hard.
# BugSpec positional order: bug_type, category, file, field, gold_fix,
# task_difficulty.
BUG_CATALOGUE: Dict[str, BugSpec] = {
    spec.bug_type: spec
    for spec in (
        # ── EASY ─────────────────────────────────────────────────────────
        BugSpec(
            "exploding_lr",
            "config_error",
            "config.yaml",
            "optimizer.learning_rate",
            "Reduce learning_rate from 50.0 to 1e-4 (or use a scheduler with warmup)",
            "easy",
        ),
        BugSpec(
            "wrong_optimizer",
            "config_error",
            "config.yaml",
            "optimizer.momentum",
            "Reduce momentum from 0.99 to 0.9, or switch to AdamW optimizer",
            "easy",
        ),
        BugSpec(
            "batch_size_overflow",
            "config_error",
            "config.yaml",
            "training.batch_size",
            "Reduce batch_size from 4096 to 32 or 64; current size exceeds training set",
            "easy",
        ),
        # ── MEDIUM ───────────────────────────────────────────────────────
        BugSpec(
            "data_leakage_scaler",
            "data_leakage",
            "preprocessing.py",
            "StandardScaler.fit_transform",
            "Fit StandardScaler only on X_train, then call transform() on X_val and X_test separately",
            "medium",
        ),
        BugSpec(
            "data_leakage_overlap",
            "data_leakage",
            "preprocessing.py",
            "train_test_split.random_state",
            "Set random_state=42 in train_test_split to ensure deterministic, non-overlapping splits",
            "medium",
        ),
        BugSpec(
            "wrong_split_ratio",
            "preprocessing_bug",
            "preprocessing.py",
            "train_test_split.test_size",
            "Change test_size from 0.8 to 0.2 — current config trains on 20% and evaluates on 80%",
            "medium",
        ),
        # ── HARD ─────────────────────────────────────────────────────────
        BugSpec(
            "label_encoder_mismatch",
            "label_mismatch",
            "preprocessing.py",
            "LabelEncoder.fit_order",
            "Use the same LabelEncoder instance (fitted on training data) for both train and eval pipelines",
            "hard",
        ),
        BugSpec(
            "silent_metric_swap",
            "evaluation_bug",
            "eval_results.json",
            "metrics.val_accuracy",
            "Swap val_accuracy and test_accuracy assignments in the evaluation loop — metrics are mislabeled",
            "hard",
        ),
        BugSpec(
            "tokenizer_version_drift",
            "evaluation_bug",
            "preprocessing.py",
            "tokenizer.version",
            "Ensure training and evaluation both use tokenizer v2 — v1 has a different vocabulary mapping for 847 tokens",
            "hard",
        ),
    )
}
127
+
128
# Which bugs each difficulty preset may sample from; every entry here must
# exist as a key in BUG_CATALOGUE.
TASK_BUG_POOLS = {
    "easy": [
        "exploding_lr",
        "wrong_optimizer",
        "batch_size_overflow",
    ],
    "medium": [
        "data_leakage_scaler",
        "data_leakage_overlap",
        "wrong_split_ratio",
    ],
    "hard": [
        "label_encoder_mismatch",
        "silent_metric_swap",
        "tokenizer_version_drift",
    ],
}
133
+
134
+
135
+ # ─── Model / Dataset Configs (variety pool) ───────────────────────────────────
136
+
137
# Pool of plausible model/dataset pairings; one is chosen deterministically
# per generated run to add surface variety to the artifacts.
MODEL_CONFIGS = [
    {
        "name": "ResNet-50",
        "type": "image_classification",
        "params": "25.6M",
        "dataset": "ImageNet-subset-10k",
        "num_classes": 10,
        "input": "224x224 RGB",
    },
    {
        "name": "BERT-base-uncased",
        "type": "text_classification",
        "params": "110M",
        "dataset": "SST-2",
        "num_classes": 2,
        "input": "tokenized sequences, max_len=128",
    },
    {
        "name": "EfficientNet-B3",
        "type": "image_classification",
        "params": "12.2M",
        "dataset": "CIFAR-100",
        "num_classes": 100,
        "input": "300x300 RGB",
    },
    {
        "name": "DistilBERT",
        "type": "sentiment_analysis",
        "params": "66M",
        "dataset": "IMDB-reviews",
        "num_classes": 3,
        "input": "tokenized sequences, max_len=256",
    },
    {
        "name": "MobileNetV3-Large",
        "type": "image_classification",
        "params": "5.4M",
        "dataset": "Oxford-102-Flowers",
        "num_classes": 102,
        "input": "224x224 RGB",
    },
]
149
+
150
# Candidate optimizer / LR-scheduler names used to randomize healthy configs.
OPTIMIZERS = [
    "AdamW",
    "SGD",
    "RMSprop",
    "Adam",
]
SCHEDULERS = [
    "cosine_annealing",
    "step_lr",
    "reduce_on_plateau",
    "linear_warmup",
]
152
+
153
+
154
+ # ─── Artifact Generators ──────────────────────────────────────────────────────
155
+
156
+ class ArtifactGenerator:
157
+ """
158
+ Generates all 6 training artifacts for a given (bug_type, seed) pair.
159
+ All artifacts are internally consistent except for the planted fault.
160
+ """
161
+
162
+ def __init__(self, bug_type: str, seed: int):
163
+ self.bug = BUG_CATALOGUE[bug_type]
164
+ self.seed = seed
165
+ self.rng = random.Random(seed)
166
+ self.np_rng = np.random.RandomState(seed)
167
+
168
+ # Pick a model config deterministically
169
+ self.model_cfg = self.rng.choice(MODEL_CONFIGS)
170
+ self.optimizer_name = self.rng.choice(OPTIMIZERS)
171
+ self.scheduler_name = self.rng.choice(SCHEDULERS)
172
+ self.run_id = f"run_{seed:04d}_{bug_type[:6]}"
173
+
174
+ # Normal hyperparams
175
+ self.lr = self.rng.choice([1e-5, 3e-5, 1e-4, 3e-4])
176
+ self.batch_size = self.rng.choice([16, 32, 64])
177
+ self.epochs = self.rng.randint(8, 20)
178
+ self.weight_decay = self.rng.choice([0.01, 0.001, 1e-4])
179
+ self.momentum = 0.9
180
+ self.train_samples = self.rng.randint(8000, 15000)
181
+ self.val_samples = int(self.train_samples * 0.2)
182
+ self.test_samples = int(self.train_samples * 0.15)
183
+
184
+ def generate_all(self) -> Dict[str, str]:
185
+ return {
186
+ "config.yaml": self._gen_config(),
187
+ "train.log": self._gen_train_log(),
188
+ "dataset_stats.json": self._gen_dataset_stats(),
189
+ "preprocessing.py": self._gen_preprocessing(),
190
+ "eval_results.json": self._gen_eval_results(),
191
+ "model_card.json": self._gen_model_card(),
192
+ }
193
+
194
+ # ── config.yaml ──────────────────────────────────────────────────────────
195
+
196
+ def _gen_config(self) -> str:
197
+ lr = self.lr
198
+ batch_size = self.batch_size
199
+ momentum = self.momentum
200
+
201
+ if self.bug.bug_type == "exploding_lr":
202
+ lr = self.rng.choice([50.0, 10.0, 25.0])
203
+ elif self.bug.bug_type == "wrong_optimizer":
204
+ momentum = 0.99
205
+ self.optimizer_name = "SGD"
206
+ elif self.bug.bug_type == "batch_size_overflow":
207
+ batch_size = self.rng.choice([2048, 4096, 8192])
208
+
209
+ return textwrap.dedent(f"""\
210
+ # Training Configuration
211
+ # Run ID: {self.run_id}
212
+ # Generated: 2024-03-{self.rng.randint(1,28):02d}T{self.rng.randint(0,23):02d}:{self.rng.randint(0,59):02d}:00Z
213
+
214
+ model:
215
+ architecture: {self.model_cfg['name']}
216
+ num_classes: {self.model_cfg['num_classes']}
217
+ pretrained: true
218
+ pretrained_source: "timm/torchvision"
219
+ dropout: {self.rng.choice([0.1, 0.2, 0.3])}
220
+ freeze_backbone_epochs: {self.rng.randint(0, 3)}
221
+
222
+ training:
223
+ epochs: {self.epochs}
224
+ batch_size: {batch_size}
225
+ num_workers: {self.rng.choice([4, 8])}
226
+ pin_memory: true
227
+ mixed_precision: {str(self.rng.choice([True, False])).lower()}
228
+ gradient_clip_norm: {self.rng.choice([1.0, 5.0, "null"])}
229
+ early_stopping_patience: {self.rng.randint(3, 7)}
230
+ seed: {self.seed}
231
+
232
+ optimizer:
233
+ name: {self.optimizer_name}
234
+ learning_rate: {lr}
235
+ weight_decay: {self.weight_decay}
236
+ momentum: {momentum}
237
+ betas: [0.9, 0.999]
238
+
239
+ scheduler:
240
+ name: {self.scheduler_name}
241
+ warmup_epochs: {self.rng.randint(0, 3)}
242
+ min_lr: 1.0e-7
243
+ t_max: {self.epochs}
244
+
245
+ data:
246
+ dataset: {self.model_cfg['dataset']}
247
+ input_size: "{self.model_cfg['input']}"
248
+ train_split: 0.8
249
+ val_split: 0.1
250
+ test_split: 0.1
251
+ augmentation:
252
+ random_crop: true
253
+ horizontal_flip: {str(self.rng.choice([True, False])).lower()}
254
+ color_jitter: {self.rng.choice([0.2, 0.4])}
255
+ normalize_mean: [0.485, 0.456, 0.406]
256
+ normalize_std: [0.229, 0.224, 0.225]
257
+
258
+ logging:
259
+ log_interval: 50
260
+ save_best_only: true
261
+ checkpoint_dir: "./checkpoints/{self.run_id}"
262
+ wandb_project: "mlops-debug-bench"
263
+ """)
264
+
265
+ # ── train.log ────────────────────────────────────────────────────────────
266
+
267
+ def _gen_train_log(self) -> str:
268
+ lines = []
269
+ lines.append(f"[INFO 2024-03-{self.rng.randint(1,28):02d} {self.rng.randint(6,10):02d}:00:00] Starting training run: {self.run_id}")
270
+ lines.append(f"[INFO ] Model: {self.model_cfg['name']} | Params: {self.model_cfg['params']}")
271
+ lines.append(f"[INFO ] Dataset: {self.model_cfg['dataset']} | Train: {self.train_samples:,} | Val: {self.val_samples:,}")
272
+ lines.append(f"[INFO ] Device: cuda:0 | Mixed precision: fp16")
273
+ lines.append(f"[INFO ] Optimizer: {self.optimizer_name} | LR: {self.lr} | Batch: {self.batch_size}")
274
+ lines.append("[INFO ] ─" * 30)
275
+
276
+ bug = self.bug.bug_type
277
+
278
+ if bug == "exploding_lr":
279
+ # Loss explodes rapidly
280
+ loss = 2.302
281
+ for ep in range(1, min(self.epochs + 1, 6)):
282
+ acc = max(0.0, 0.12 - ep * 0.02)
283
+ val_loss = loss * self.rng.uniform(1.1, 1.5)
284
+ val_acc = max(0.0, acc - 0.05)
285
+ lines.append(f"[EPOCH {ep:03d}] train_loss={loss:.4f} train_acc={acc:.4f} "
286
+ f"val_loss={val_loss:.4f} val_acc={val_acc:.4f} "
287
+ f"lr={self.lr:.2e} grad_norm={loss * 18.3:.2f} "
288
+ f"time={self.rng.randint(45,90)}s")
289
+ if ep == 1: lines.append(f"[WARN ] Gradient norm unusually high: {loss * 18.3:.2f} (threshold: 10.0)")
290
+ loss = loss * self.rng.uniform(4.5, 9.0)
291
+ if loss > 1e6:
292
+ lines.append(f"[EPOCH {ep+1:03d}] train_loss=nan train_acc=0.1000 val_loss=nan val_acc=0.1000 "
293
+ f"lr={self.lr:.2e} grad_norm=nan time={self.rng.randint(45,90)}s")
294
+ lines.append(f"[ERROR ] Loss is NaN at epoch {ep+1}, step {self.rng.randint(100,300)}. Training halted.")
295
+ lines.append(f"[ERROR ] Last finite loss: {loss / self.rng.uniform(4,9):.2f}. Gradient explosion detected.")
296
+ break
297
+
298
+ elif bug == "wrong_optimizer":
299
+ # Loss oscillates wildly, never converges
300
+ loss = 2.302
301
+ for ep in range(1, self.epochs + 1):
302
+ delta = self.rng.uniform(-0.8, 1.2)
303
+ loss = max(1.8, loss + delta)
304
+ acc = self.rng.uniform(0.10, 0.25)
305
+ val_loss = loss + self.rng.uniform(-0.3, 0.8)
306
+ val_acc = self.rng.uniform(0.09, 0.22)
307
+ lines.append(f"[EPOCH {ep:03d}] train_loss={loss:.4f} train_acc={acc:.4f} "
308
+ f"val_loss={val_loss:.4f} val_acc={val_acc:.4f} "
309
+ f"lr={self.lr:.2e} grad_norm={self.rng.uniform(8.2, 45.1):.2f} "
310
+ f"time={self.rng.randint(45,90)}s")
311
+ if ep % 3 == 0:
312
+ lines.append(f"[WARN ] Loss oscillation detected over last 3 epochs: {loss+0.4:.3f} → {loss-0.5:.3f} → {loss:.3f}")
313
+
314
+ elif bug == "batch_size_overflow":
315
+ # Val accuracy hits 100% immediately — model memorizes tiny effective dataset
316
+ for ep in range(1, self.epochs + 1):
317
+ train_loss = max(0.001, 2.302 * (0.05 ** ep))
318
+ train_acc = min(1.0, 0.3 + ep * 0.09)
319
+ val_acc = 0.999 if ep >= 2 else 0.85
320
+ val_loss = 0.001 if ep >= 2 else 0.45
321
+ lines.append(f"[EPOCH {ep:03d}] train_loss={train_loss:.4f} train_acc={train_acc:.4f} "
322
+ f"val_loss={val_loss:.4f} val_acc={val_acc:.4f} "
323
+ f"lr={self.lr:.2e} grad_norm={self.rng.uniform(0.1,0.9):.3f} "
324
+ f"time={self.rng.randint(3,8)}s")
325
+ lines.append(f"[WARN ] Effective steps per epoch: {max(1, self.train_samples // 4096)}. Dataset may be smaller than batch size.")
326
+
327
+ elif bug in ("data_leakage_scaler", "data_leakage_overlap", "wrong_split_ratio"):
328
+ # Val accuracy suspiciously high from epoch 1
329
+ for ep in range(1, self.epochs + 1):
330
+ train_loss = max(0.01, 0.45 - ep * 0.02)
331
+ train_acc = min(0.98, 0.72 + ep * 0.015)
332
+ val_acc = min(0.999, 0.984 + self.rng.uniform(-0.002, 0.002)) if ep >= 1 else 0.71
333
+ val_loss = max(0.001, 0.04 - ep * 0.001)
334
+ lines.append(f"[EPOCH {ep:03d}] train_loss={train_loss:.4f} train_acc={train_acc:.4f} "
335
+ f"val_loss={val_loss:.4f} val_acc={val_acc:.4f} "
336
+ f"lr={self.lr:.2e} grad_norm={self.rng.uniform(0.1,1.2):.3f} "
337
+ f"time={self.rng.randint(45,90)}s")
338
+ lines.append(f"[INFO ] Best model saved at epoch 2: val_acc=0.9841")
339
+ lines.append(f"[WARN ] Val accuracy reached 98.4% at epoch 1 — verify no data leakage.")
340
+
341
+ elif bug in ("label_encoder_mismatch", "silent_metric_swap", "tokenizer_version_drift"):
342
+ # Training looks completely normal — the bug is silent
343
+ best_val = 0.0
344
+ for ep in range(1, self.epochs + 1):
345
+ train_loss = max(0.08, 1.8 * (0.72 ** ep) + self.rng.uniform(-0.02, 0.02))
346
+ train_acc = min(0.96, 0.42 + ep * 0.032 + self.rng.uniform(-0.01, 0.01))
347
+ val_loss = train_loss * self.rng.uniform(1.05, 1.15)
348
+ val_acc = train_acc - self.rng.uniform(0.02, 0.06)
349
+ best_val = max(best_val, val_acc)
350
+ lines.append(f"[EPOCH {ep:03d}] train_loss={train_loss:.4f} train_acc={train_acc:.4f} "
351
+ f"val_loss={val_loss:.4f} val_acc={val_acc:.4f} "
352
+ f"lr={self.lr:.2e} grad_norm={self.rng.uniform(0.3, 2.1):.3f} "
353
+ f"time={self.rng.randint(60,120)}s")
354
+ lines.append(f"[INFO ] Training complete. Best val_acc={best_val:.4f} at epoch {self.rng.randint(self.epochs-3, self.epochs)}")
355
+ lines.append(f"[INFO ] Checkpoint saved: ./checkpoints/{self.run_id}/best_model.pt")
356
+
357
+ lines.append("[INFO ] ─" * 30)
358
+ lines.append(f"[INFO ] Run {self.run_id} finished.")
359
+ return "\n".join(lines)
360
+
361
+ # ── dataset_stats.json ───────────────────────────────────────────────────
362
+
363
+ def _gen_dataset_stats(self) -> str:
364
+ n_classes = self.model_cfg["num_classes"]
365
+ train_n = self.train_samples
366
+ val_n = self.val_samples
367
+ test_n = self.test_samples
368
+
369
+ overlap_count = 0
370
+ if self.bug.bug_type == "data_leakage_overlap":
371
+ overlap_count = self.rng.randint(int(val_n * 0.15), int(val_n * 0.30))
372
+ elif self.bug.bug_type == "wrong_split_ratio":
373
+ # Train and test flipped
374
+ train_n, test_n = test_n, train_n
375
+
376
+ # Class distribution (roughly uniform with jitter)
377
+ def class_dist(total, n_cls):
378
+ base = total // n_cls
379
+ counts = {str(i): base + self.rng.randint(-int(base*0.15), int(base*0.15))
380
+ for i in range(min(n_cls, 10))}
381
+ if n_cls > 10:
382
+ counts["..."] = f"{n_cls - 10} more classes"
383
+ return counts
384
+
385
+ stats = {
386
+ "dataset": self.model_cfg["dataset"],
387
+ "num_classes": n_classes,
388
+ "splits": {
389
+ "train": {
390
+ "n_samples": train_n,
391
+ "class_distribution": class_dist(train_n, n_classes),
392
+ },
393
+ "val": {
394
+ "n_samples": val_n,
395
+ "class_distribution": class_dist(val_n, n_classes),
396
+ "overlap_with_train": overlap_count,
397
+ },
398
+ "test": {
399
+ "n_samples": test_n,
400
+ "class_distribution": class_dist(test_n, n_classes),
401
+ },
402
+ },
403
+ "feature_statistics": {
404
+ "mean": round(self.np_rng.uniform(0.45, 0.55), 4),
405
+ "std": round(self.np_rng.uniform(0.22, 0.28), 4),
406
+ "min": 0.0,
407
+ "max": 1.0,
408
+ "null_count": 0,
409
+ },
410
+ "preprocessing_applied": [
411
+ "resize",
412
+ "normalize",
413
+ "label_encode",
414
+ "train_val_test_split",
415
+ ],
416
+ "random_seed_used": self.seed if self.bug.bug_type != "data_leakage_overlap" else None,
417
+ }
418
+ return json.dumps(stats, indent=2)
419
+
420
+ # ── preprocessing.py ─────────────────────────────────────────────────────
421
+
422
+ def _gen_preprocessing(self) -> str:
423
+ bug = self.bug.bug_type
424
+
425
+ if bug == "data_leakage_scaler":
426
+ return textwrap.dedent(f"""\
427
+ \"\"\"
428
+ Data preprocessing pipeline for {self.model_cfg['dataset']}
429
+ Run ID: {self.run_id}
430
+ \"\"\"
431
+ import numpy as np
432
+ import pandas as pd
433
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
434
+ from sklearn.model_selection import train_test_split
435
+
436
+
437
+ def load_raw_data(data_dir: str):
438
+ \"\"\"Load features and labels from disk.\"\"\"
439
+ X = np.load(f"{{data_dir}}/features.npy")
440
+ y = np.load(f"{{data_dir}}/labels.npy")
441
+ return X, y
442
+
443
+
444
+ def preprocess(data_dir: str, seed: int = {self.seed}):
445
+ X, y = load_raw_data(data_dir)
446
+
447
+ # Encode labels
448
+ le = LabelEncoder()
449
+ y_encoded = le.fit_transform(y)
450
+
451
+ # ── BUG: Scaler fit on full dataset BEFORE split ──────────
452
+ scaler = StandardScaler()
453
+ X_normalized = scaler.fit_transform(X) # sees val/test data during fit!
454
+ # ─────────────────────────────────────────────────────────
455
+
456
+ X_train, X_temp, y_train, y_temp = train_test_split(
457
+ X_normalized, y_encoded, test_size=0.2, random_state=seed
458
+ )
459
+ X_val, X_test, y_val, y_test = train_test_split(
460
+ X_temp, y_temp, test_size=0.5, random_state=seed
461
+ )
462
+
463
+ return (X_train, y_train), (X_val, y_val), (X_test, y_test), scaler, le
464
+
465
+
466
+ def get_transforms(split: str):
467
+ \"\"\"Get augmentation transforms for a given split.\"\"\"
468
+ if split == "train":
469
+ return [
470
+ ("random_horizontal_flip", {{"p": 0.5}}),
471
+ ("random_crop", {{"size": 224, "padding": 4}}),
472
+ ("color_jitter", {{"brightness": 0.2, "contrast": 0.2}}),
473
+ ("normalize", {{"mean": [0.485, 0.456, 0.406],
474
+ "std": [0.229, 0.224, 0.225]}}),
475
+ ]
476
+ return [
477
+ ("center_crop", {{"size": 224}}),
478
+ ("normalize", {{"mean": [0.485, 0.456, 0.406],
479
+ "std": [0.229, 0.224, 0.225]}}),
480
+ ]
481
+ """)
482
+
483
+ elif bug == "data_leakage_overlap":
484
+ return textwrap.dedent(f"""\
485
+ \"\"\"
486
+ Data preprocessing pipeline for {self.model_cfg['dataset']}
487
+ Run ID: {self.run_id}
488
+ \"\"\"
489
+ import numpy as np
490
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
491
+ from sklearn.model_selection import train_test_split
492
+
493
+
494
+ def load_raw_data(data_dir: str):
495
+ X = np.load(f"{{data_dir}}/features.npy")
496
+ y = np.load(f"{{data_dir}}/labels.npy")
497
+ return X, y
498
+
499
+
500
+ def preprocess(data_dir: str):
501
+ X, y = load_raw_data(data_dir)
502
+
503
+ le = LabelEncoder()
504
+ y_encoded = le.fit_transform(y)
505
+
506
+ # First split: train vs temp
507
+ # ── BUG: random_state=None → non-reproducible, overlapping splits ──
508
+ X_train, X_temp, y_train, y_temp = train_test_split(
509
+ X, y_encoded, test_size=0.2, random_state=None # ← should be fixed seed
510
+ )
511
+ # Second split: val vs test (ALSO non-deterministic)
512
+ X_val, X_test, y_val, y_test = train_test_split(
513
+ X_temp, y_temp, test_size=0.5, random_state=None # ← should be fixed seed
514
+ )
515
+ # ─────────────────────────────────────────────────────────
516
+
517
+ scaler = StandardScaler()
518
+ X_train = scaler.fit_transform(X_train)
519
+ X_val = scaler.transform(X_val)
520
+ X_test = scaler.transform(X_test)
521
+
522
+ return (X_train, y_train), (X_val, y_val), (X_test, y_test), scaler, le
523
+ """)
524
+
525
+ elif bug == "wrong_split_ratio":
526
+ return textwrap.dedent(f"""\
527
+ \"\"\"
528
+ Data preprocessing pipeline for {self.model_cfg['dataset']}
529
+ Run ID: {self.run_id}
530
+ \"\"\"
531
+ import numpy as np
532
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
533
+ from sklearn.model_selection import train_test_split
534
+
535
+
536
+ def preprocess(data_dir: str, seed: int = {self.seed}):
537
+ X = np.load(f"{{data_dir}}/features.npy")
538
+ y = np.load(f"{{data_dir}}/labels.npy")
539
+
540
+ le = LabelEncoder()
541
+ y_encoded = le.fit_transform(y)
542
+
543
+ # ── BUG: test_size=0.8 — trains on 20%, evaluates on 80% ──
544
+ X_train, X_test, y_train, y_test = train_test_split(
545
+ X, y_encoded, test_size=0.8, random_state=seed # ← should be 0.2
546
+ )
547
+ X_val, X_test, y_val, y_test = train_test_split(
548
+ X_test, y_test, test_size=0.5, random_state=seed
549
+ )
550
+ # ──────────────────────────────────────────────────────────
551
+
552
+ scaler = StandardScaler()
553
+ X_train = scaler.fit_transform(X_train)
554
+ X_val = scaler.transform(X_val)
555
+ X_test = scaler.transform(X_test)
556
+
557
+ return (X_train, y_train), (X_val, y_val), (X_test, y_test), scaler, le
558
+ """)
559
+
560
+ elif bug == "label_encoder_mismatch":
561
+ classes = ["cat", "dog", "bird"] if self.model_cfg["num_classes"] <= 10 else \
562
+ [f"class_{i}" for i in range(min(self.model_cfg["num_classes"], 5))]
563
+ classes_shuffled = classes.copy()
564
+ self.rng.shuffle(classes_shuffled)
565
+ return textwrap.dedent(f"""\
566
+ \"\"\"
567
+ Data preprocessing pipeline for {self.model_cfg['dataset']}
568
+ Run ID: {self.run_id}
569
+
570
+ WARNING: Training and evaluation pipelines are defined separately.
571
+ Ensure they use identical label encoding.
572
+ \"\"\"
573
+ import numpy as np
574
+ from sklearn.preprocessing import LabelEncoder
575
+ from sklearn.model_selection import train_test_split
576
+
577
+
578
+ # ── Training pipeline ─────────────────────────────────────────
579
+ def build_train_pipeline(data_dir: str, seed: int = {self.seed}):
580
+ X = np.load(f"{{data_dir}}/train_features.npy")
581
+ y_raw = np.load(f"{{data_dir}}/train_labels.npy", allow_pickle=True)
582
+
583
+ # LabelEncoder fitted on training class order
584
+ le_train = LabelEncoder()
585
+ le_train.fit({classes}) # alphabetical order: {sorted(classes)}
586
+ y = le_train.transform(y_raw)
587
+
588
+ X_train, X_val, y_train, y_val = train_test_split(
589
+ X, y, test_size=0.2, random_state=seed
590
+ )
591
+ return (X_train, y_train), (X_val, y_val), le_train
592
+
593
+
594
+ # ── Evaluation pipeline ───────────────────────────────────────
595
+ def build_eval_pipeline(data_dir: str):
596
+ X_test = np.load(f"{{data_dir}}/test_features.npy")
597
+ y_raw = np.load(f"{{data_dir}}/test_labels.npy", allow_pickle=True)
598
+
599
+ # ── BUG: Different LabelEncoder instance with DIFFERENT fit order ──
600
+ le_eval = LabelEncoder()
601
+ le_eval.fit({classes_shuffled}) # ← shuffled order: {classes_shuffled}
602
+ y_test = le_eval.transform(y_raw)
603
+ # ─────────────────────────────────────────────────────────
604
+
605
+ return X_test, y_test, le_eval
606
+ """)
607
+
608
+ elif bug == "silent_metric_swap":
609
+ val_acc = round(self.rng.uniform(0.84, 0.91), 4)
610
+ test_acc = round(self.rng.uniform(0.31, 0.39), 4)
611
+ return textwrap.dedent(f"""\
612
+ \"\"\"
613
+ Evaluation script for {self.model_cfg['dataset']}
614
+ Run ID: {self.run_id}
615
+ \"\"\"
616
+ import torch
617
+ import json
618
+
619
+
620
+ def evaluate(model, val_loader, test_loader, device="cuda"):
621
+ model.eval()
622
+ results = {{}}
623
+
624
+ with torch.no_grad():
625
+ # Evaluate on validation set
626
+ val_correct, val_total = 0, 0
627
+ for X, y in val_loader:
628
+ preds = model(X.to(device)).argmax(dim=1)
629
+ val_correct += (preds == y.to(device)).sum().item()
630
+ val_total += y.size(0)
631
+ val_acc = val_correct / val_total
632
+
633
+ # Evaluate on test set
634
+ test_correct, test_total = 0, 0
635
+ for X, y in test_loader:
636
+ preds = model(X.to(device)).argmax(dim=1)
637
+ test_correct += (preds == y.to(device)).sum().item()
638
+ test_total += y.size(0)
639
+ test_acc = test_correct / test_total
640
+
641
+ # ── BUG: val and test accuracy assignments are swapped ──
642
+ results["val_accuracy"] = test_acc # ← should be val_acc
643
+ results["test_accuracy"] = val_acc # ← should be test_acc
644
+ # ──────────────────────────────────────────────────────
645
+
646
+ results["val_loss"] = round(1 - val_acc + 0.12, 4)
647
+ results["test_loss"] = round(1 - test_acc + 0.09, 4)
648
+ return results
649
+ """)
650
+
651
+ elif bug == "tokenizer_version_drift":
652
+ return textwrap.dedent(f"""\
653
+ \"\"\"
654
+ Text preprocessing pipeline for {self.model_cfg['dataset']}
655
+ Run ID: {self.run_id}
656
+ \"\"\"
657
+ from transformers import AutoTokenizer
658
+
659
+
660
+ TOKENIZER_V1 = "bert-base-uncased" # vocab size: 30,522
661
+ TOKENIZER_V2 = "bert-base-uncased-v2-fixed" # vocab size: 30,522 + 847 domain tokens
662
+
663
+
664
+ # ── Training pipeline ─────────────────────────────────────────
665
+ def get_train_tokenizer():
666
+ \"\"\"Tokenizer used during training.\"\"\"
667
+ # Updated to v2 for domain-specific vocabulary
668
+ tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_V2)
669
+ return tokenizer
670
+
671
+
672
+ # ── Evaluation pipeline ───────────────────────────────────────
673
+ def get_eval_tokenizer():
674
+ \"\"\"Tokenizer used during evaluation and inference.\"\"\"
675
+ # ── BUG: Still using v1 — 847 tokens map to [UNK] during eval ──
676
+ tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_V1) # ← should be TOKENIZER_V2
677
+ return tokenizer
678
+ # ─────────────────────────────────────────────────────────
679
+
680
+
681
+ def tokenize_batch(texts, tokenizer, max_length: int = 128):
682
+ return tokenizer(
683
+ texts,
684
+ padding="max_length",
685
+ truncation=True,
686
+ max_length=max_length,
687
+ return_tensors="pt",
688
+ )
689
+ """)
690
+
691
+ else:
692
+ # Default normal preprocessing (for config-error bugs, preprocessing is clean)
693
+ return textwrap.dedent(f"""\
694
+ \"\"\"
695
+ Data preprocessing pipeline for {self.model_cfg['dataset']}
696
+ Run ID: {self.run_id}
697
+ \"\"\"
698
+ import numpy as np
699
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
700
+ from sklearn.model_selection import train_test_split
701
+
702
+
703
+ def preprocess(data_dir: str, seed: int = {self.seed}):
704
+ X = np.load(f"{{data_dir}}/features.npy")
705
+ y = np.load(f"{{data_dir}}/labels.npy")
706
+
707
+ le = LabelEncoder()
708
+ y_encoded = le.fit_transform(y)
709
+
710
+ X_train, X_temp, y_train, y_temp = train_test_split(
711
+ X, y_encoded, test_size=0.2, random_state=seed
712
+ )
713
+ X_val, X_test, y_val, y_test = train_test_split(
714
+ X_temp, y_temp, test_size=0.5, random_state=seed
715
+ )
716
+
717
+ # Correct: fit only on training data
718
+ scaler = StandardScaler()
719
+ X_train = scaler.fit_transform(X_train)
720
+ X_val = scaler.transform(X_val)
721
+ X_test = scaler.transform(X_test)
722
+
723
+ return (X_train, y_train), (X_val, y_val), (X_test, y_test), scaler, le
724
+ """)
725
+
726
+ # ── eval_results.json ────────────────────────────────────────────────────
727
+
728
def _gen_eval_results(self) -> str:
    """Render the eval_results.json payload, with metrics shaped by the injected bug.

    Every branch draws from ``self.rng`` in a fixed order so the output is
    reproducible for a given seed.
    """
    bug = self.bug.bug_type

    if bug in ("exploding_lr", "wrong_optimizer"):
        # Near-random accuracy; exploding LR additionally blows up the loss.
        v_acc = round(self.rng.uniform(0.09, 0.13), 4)
        t_acc = round(self.rng.uniform(0.09, 0.13), 4)
        if bug == "exploding_lr":
            v_loss = 999999.9
        else:
            v_loss = round(self.rng.uniform(2.1, 2.4), 4)
        t_loss = v_loss
    elif bug == "batch_size_overflow":
        # Trivially perfect validation with a massive train/test gap.
        v_acc = 0.9990
        t_acc = round(self.rng.uniform(0.11, 0.15), 4)
        v_loss = 0.0003
        t_loss = round(self.rng.uniform(1.8, 2.3), 4)
    elif bug in ("data_leakage_scaler", "data_leakage_overlap", "wrong_split_ratio"):
        # Leakage inflates validation; the clean test split is much worse.
        v_acc = round(self.rng.uniform(0.982, 0.998), 4)
        t_acc = round(self.rng.uniform(0.61, 0.73), 4)
        v_loss = round(self.rng.uniform(0.004, 0.015), 4)
        t_loss = round(self.rng.uniform(0.42, 0.68), 4)
    elif bug == "label_encoder_mismatch":
        v_acc = round(self.rng.uniform(0.84, 0.91), 4)
        t_acc = round(self.rng.uniform(0.30, 0.38), 4)  # near-random for 3 classes
        v_loss = round(1 - v_acc + self.rng.uniform(0.05, 0.15), 4)
        t_loss = round(1 - t_acc + self.rng.uniform(0.05, 0.15), 4)
    elif bug == "silent_metric_swap":
        true_val = round(self.rng.uniform(0.84, 0.91), 4)
        true_test = round(self.rng.uniform(0.31, 0.39), 4)
        # The reported numbers are deliberately crossed over.
        v_acc, t_acc = true_test, true_val
        v_loss = round(1 - true_test + 0.09, 4)
        t_loss = round(1 - true_val + 0.12, 4)
    elif bug == "tokenizer_version_drift":
        v_acc = round(self.rng.uniform(0.83, 0.88), 4)
        t_acc = round(self.rng.uniform(0.28, 0.36), 4)
        v_loss = round(1 - v_acc + self.rng.uniform(0.05, 0.12), 4)
        t_loss = round(1 - t_acc + self.rng.uniform(0.05, 0.12), 4)
    else:
        # Healthy run: small, realistic val/test gap.
        v_acc = round(self.rng.uniform(0.78, 0.91), 4)
        t_acc = round(v_acc - self.rng.uniform(0.02, 0.05), 4)
        v_loss = round(1 - v_acc + 0.1, 4)
        t_loss = round(1 - t_acc + 0.1, 4)

    # Keep this draw BEFORE the timestamp draws to preserve rng sequence.
    final_epoch = self.rng.randint(2, 5) if bug == "exploding_lr" else self.epochs
    day = self.rng.randint(1, 28)
    hour = self.rng.randint(10, 22)
    minute = self.rng.randint(0, 59)

    payload = {
        "run_id": self.run_id,
        "final_epoch": final_epoch,
        "metrics": {
            "val_loss": v_loss,
            "val_accuracy": v_acc,
            "test_loss": t_loss,
            "test_accuracy": t_acc,
        },
        "best_checkpoint": f"./checkpoints/{self.run_id}/best_model.pt",
        "evaluation_timestamp": f"2024-03-{day:02d}T{hour:02d}:{minute:02d}:00Z",
        "hardware": {"gpu": "A100-40GB", "cuda": "12.1"},
    }
    return json.dumps(payload, indent=2)
783
+
784
+ # ── model_card.json ──────────────────────────────────────────────────────
785
+
786
def _gen_model_card(self) -> str:
    """Build the model_card.json payload for the current run."""
    bug = self.bug.bug_type
    # The tokenizer-drift bug ships a stale v1 tokenizer reference in the card.
    tokenizer_ver = "v1" if bug == "tokenizer_version_drift" else "v2"
    uses_bert = "bert" in self.model_cfg["name"].lower()

    card = {
        "model_id": f"{self.run_id}",
        "architecture": self.model_cfg["name"],
        "task": self.model_cfg["type"],
        "num_parameters": self.model_cfg["params"],
        "dataset": self.model_cfg["dataset"],
        "num_classes": self.model_cfg["num_classes"],
        "framework": "PyTorch 2.2.0",
        "training_config": {
            "optimizer": self.optimizer_name,
            "scheduler": self.scheduler_name,
            "epochs": self.epochs,
        },
        "preprocessing": {
            "label_encoder": "sklearn.LabelEncoder",
            # Tokenizer version only applies to BERT-family architectures.
            "tokenizer": tokenizer_ver if uses_bert else "N/A",
            "normalizer": "StandardScaler (fit on training split)",
        },
        "authors": ["ml-platform-team"],
        "license": "Apache-2.0",
    }
    return json.dumps(card, indent=2)
812
+
813
+
814
+ # ─── Sanity Check Engine ──────────────────────────────────────────────────────
815
+
816
def run_sanity_check(check_type: str, bug_type: str, artifacts: Dict[str, str],
                     rng: random.Random) -> Dict:
    """
    Runs a named diagnostic check and returns computed results.
    Results are grounded in the generated artifacts — not random.

    Args:
        check_type: Which diagnostic to run (e.g. "data_leakage").
        bug_type:   Key identifying the injected bug; must exist in BUG_CATALOGUE.
        artifacts:  Generated artifact texts by filename (not read by the checks
                    themselves; kept for interface stability).
        rng:        Seeded RNG so synthetic measurements are reproducible.

    Returns:
        A dict with at least "check", "result" and "details" keys; an unknown
        check_type yields result "UNKNOWN".

    Raises:
        KeyError: If bug_type is not a known BUG_CATALOGUE entry.
    """
    # Validate bug_type eagerly (raises KeyError for unknown bugs). The
    # looked-up entry is not needed below, so it is deliberately not bound
    # (previously it was assigned to an unused local).
    BUG_CATALOGUE[bug_type]

    if check_type == "label_consistency":
        if bug_type == "label_encoder_mismatch":
            return {
                "check": "label_consistency",
                "result": "FAIL",
                "details": "Training LabelEncoder class order: ['bird', 'cat', 'dog'] (index 0=bird, 1=cat, 2=dog). "
                           "Evaluation LabelEncoder class order: ['cat', 'dog', 'bird'] (index 0=cat, 1=dog, 2=bird). "
                           "Mismatch detected — 2 of 3 class indices differ between pipelines.",
                "affected_classes": 2,
                "recommendation": "Use a single LabelEncoder instance across both pipelines.",
            }
        return {"check": "label_consistency", "result": "PASS",
                "details": "Train and eval label mappings are identical. No mismatch detected."}

    elif check_type == "data_leakage":
        if bug_type in ("data_leakage_overlap", "data_leakage_scaler"):
            # Overlap counts only apply to the split-overlap variant.
            overlap = rng.randint(180, 450) if bug_type == "data_leakage_overlap" else 0
            scaler_leak = bug_type == "data_leakage_scaler"
            return {
                "check": "data_leakage",
                "result": "FAIL",
                "sample_overlap": overlap,
                "scaler_fitted_on_full_dataset": scaler_leak,
                "details": (
                    f"Found {overlap} samples present in both train and val splits. "
                    if overlap > 0 else ""
                ) + (
                    "StandardScaler.fit_transform() called on full dataset before split — "
                    "validation statistics contaminated by training distribution."
                    if scaler_leak else ""
                ),
            }
        return {"check": "data_leakage", "result": "PASS",
                "sample_overlap": 0, "scaler_fitted_on_full_dataset": False,
                "details": "No data leakage detected between train and val splits."}

    elif check_type == "gradient_norms":
        if bug_type == "exploding_lr":
            return {
                "check": "gradient_norms",
                "result": "ANOMALY",
                "epoch_1_norm": round(rng.uniform(840.0, 2100.0), 2),
                "expected_range": "0.1 – 10.0",
                "details": "Gradient norms exceeded safe threshold by 100–200×. "
                           "Indicates learning rate is too large — gradients are not being controlled.",
            }
        return {"check": "gradient_norms", "result": "NORMAL",
                "mean_norm": round(rng.uniform(0.3, 2.1), 3),
                "max_norm": round(rng.uniform(2.1, 4.5), 3),
                "details": "Gradient norms are within expected range throughout training."}

    elif check_type == "metric_gap_analysis":
        if bug_type in ("label_encoder_mismatch", "silent_metric_swap", "tokenizer_version_drift"):
            val_acc = round(rng.uniform(0.84, 0.91), 4)
            test_acc = round(rng.uniform(0.28, 0.38), 4)
            return {
                "check": "metric_gap_analysis",
                "result": "ANOMALY",
                "val_accuracy": val_acc,
                "test_accuracy": test_acc,
                "gap": round(val_acc - test_acc, 4),
                "expected_max_gap": 0.08,
                "details": f"Val/test accuracy gap is {val_acc - test_acc:.3f} — far exceeds expected max of 0.08. "
                           f"This magnitude of gap (>{val_acc - test_acc:.0%}) strongly suggests an evaluation pipeline bug "
                           f"rather than overfitting — the model generalises well to the val set but fails on test data.",
            }
        return {"check": "metric_gap_analysis", "result": "NORMAL",
                "details": "Val/test metric gap is within normal bounds."}

    elif check_type == "encoder_version_match":
        if bug_type == "tokenizer_version_drift":
            return {
                "check": "encoder_version_match",
                "result": "MISMATCH",
                "training_tokenizer": "bert-base-uncased-v2-fixed",
                "eval_tokenizer": "bert-base-uncased",
                "vocab_diff": 847,
                "details": "Training uses tokenizer v2 (30,522 + 847 domain tokens). "
                           "Evaluation uses tokenizer v1 (30,522 tokens). "
                           "847 domain-specific tokens will map to [UNK] during evaluation — "
                           "causing silent degradation on domain-specific test inputs.",
            }
        return {"check": "encoder_version_match", "result": "PASS",
                "details": "Training and evaluation use identical tokenizer versions."}

    elif check_type == "class_balance":
        n_classes = 10
        counts = {str(i): rng.randint(780, 1020) for i in range(n_classes)}
        # max(1, min) guards against a zero minimum, though randint(780,...) cannot produce one.
        imbalance_ratio = max(counts.values()) / max(1, min(counts.values()))
        return {
            "check": "class_balance",
            "result": "PASS" if imbalance_ratio < 1.5 else "WARN",
            "class_counts": counts,
            "imbalance_ratio": round(imbalance_ratio, 3),
            "details": f"Max/min class ratio: {imbalance_ratio:.2f}. "
                       f"{'Within acceptable range.' if imbalance_ratio < 1.5 else 'Moderate imbalance — consider weighted loss.'}",
        }

    elif check_type == "loss_trajectory":
        if bug_type == "exploding_lr":
            return {
                "check": "loss_trajectory",
                "result": "ANOMALY",
                "pattern": "exponential_divergence",
                "loss_values": [2.31, 18.42, 847.2, "nan"],
                "details": "Loss follows exponential growth pattern rather than convergence. "
                           "This is a strong indicator of learning rate being orders of magnitude too large.",
            }
        elif bug_type == "wrong_optimizer":
            return {
                "check": "loss_trajectory",
                "result": "ANOMALY",
                "pattern": "oscillating_no_convergence",
                "details": "Loss oscillates without converging over all epochs. "
                           "Characteristic of excessive momentum causing the optimizer to overshoot minima repeatedly.",
            }
        return {"check": "loss_trajectory", "result": "NORMAL",
                "pattern": "smooth_convergence",
                "details": "Loss follows expected convergence curve."}

    elif check_type == "feature_statistics":
        if bug_type in ("data_leakage_scaler",):
            return {
                "check": "feature_statistics",
                "result": "WARN",
                "train_mean": 0.0, "train_std": 1.0,
                "val_mean": 0.0, "val_std": 1.0,
                "details": "Train and val feature statistics are identical after normalization — "
                           "this is expected if scaler was fit on the full dataset (including val). "
                           "If scaler was fit only on train, a slight distributional shift is normal. "
                           "Zero shift suggests the scaler saw val data during fitting.",
            }
        return {"check": "feature_statistics", "result": "PASS",
                "details": "Train and val feature distributions are within expected divergence bounds."}

    return {"check": check_type, "result": "UNKNOWN",
            "details": f"Unknown sanity check type: {check_type}"}
server/client.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """MLOps Pipeline Debugger — Python client"""
2
+ from __future__ import annotations
3
+ from typing import Any, Dict, Optional
4
+ import httpx
5
+
6
+ from models import MLOpsAction, MLOpsObservation, MLOpsState
7
+
8
+
9
class StepResult:
    """One environment transition: observation, scalar reward, done flag, info dict."""

    def __init__(self, observation, reward, done, info):
        self.observation, self.reward = observation, reward
        self.done, self.info = done, info

    def __repr__(self):
        return "StepResult(reward={:.4f}, done={})".format(self.reward, self.done)
17
+
18
+
19
class MLOpsDebugEnv:
    """Async HTTP client for the MLOps Pipeline Debugger environment server.

    Use as an async context manager; the underlying httpx.AsyncClient is
    created on __aenter__ and closed on __aexit__.
    """

    def __init__(self, base_url: str = "http://localhost:7860"):
        self.base_url = base_url.rstrip("/")
        self._client: Optional[httpx.AsyncClient] = None

    async def __aenter__(self):
        self._client = httpx.AsyncClient(base_url=self.base_url, timeout=30.0)
        return self

    async def __aexit__(self, *args):
        if self._client:
            await self._client.aclose()

    async def reset(self, task_id: str = "easy", seed: Optional[int] = None) -> MLOpsObservation:
        """POST /reset and return the initial observation."""
        payload = {"task_id": task_id, "seed": seed}
        resp = await self._client.post("/reset", json=payload)
        resp.raise_for_status()
        return MLOpsObservation(**resp.json())

    async def step(self, action: MLOpsAction) -> StepResult:
        """POST /step with the given action and return the transition."""
        body = action.model_dump(exclude_none=True)
        resp = await self._client.post("/step", json=body)
        resp.raise_for_status()
        data = resp.json()
        return StepResult(
            MLOpsObservation(**data["observation"]),
            data["reward"],
            data["done"],
            data["info"],
        )

    async def state(self) -> MLOpsState:
        """GET /state and return the full environment state."""
        resp = await self._client.get("/state")
        resp.raise_for_status()
        return MLOpsState(**resp.json())

    def sync(self) -> "SyncMLOpsDebugEnv":
        """Return a synchronous client pointed at the same server."""
        return SyncMLOpsDebugEnv(self.base_url)
50
+
51
+
52
class SyncMLOpsDebugEnv:
    """Blocking HTTP client for the MLOps Pipeline Debugger environment server.

    Use as a context manager; the underlying httpx.Client is created on
    __enter__ and closed on __exit__.
    """

    def __init__(self, base_url: str = "http://localhost:7860"):
        self.base_url = base_url.rstrip("/")
        self._client: Optional[httpx.Client] = None

    def __enter__(self):
        self._client = httpx.Client(base_url=self.base_url, timeout=30.0)
        return self

    def __exit__(self, *args):
        if self._client:
            self._client.close()

    def reset(self, task_id: str = "easy", seed: Optional[int] = None) -> MLOpsObservation:
        """POST /reset and return the initial observation."""
        resp = self._client.post("/reset", json={"task_id": task_id, "seed": seed})
        resp.raise_for_status()
        return MLOpsObservation(**resp.json())

    def step(self, action: MLOpsAction) -> StepResult:
        """POST /step with the given action and return the transition."""
        resp = self._client.post("/step", json=action.model_dump(exclude_none=True))
        resp.raise_for_status()
        data = resp.json()
        return StepResult(
            MLOpsObservation(**data["observation"]),
            data["reward"],
            data["done"],
            data["info"],
        )

    def state(self) -> MLOpsState:
        """GET /state and return the full environment state."""
        resp = self._client.get("/state")
        resp.raise_for_status()
        return MLOpsState(**resp.json())
server/inference.py ADDED
@@ -0,0 +1,600 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ inference.py — Optimized LLM Agent for MLOps Pipeline Debugger
3
+
4
+ Required env vars (in .env file):
5
+ GEMINI_API_KEY your Gemini API key
6
+ MODEL_NAME gemini-2.5-flash (default)
7
+ ENV_BASE_URL http://localhost:7860 (default)
8
+
9
+ STDOUT FORMAT (mandatory):
10
+ [START] task=<task_name> env=<benchmark> model=<model_name>
11
+ [STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
12
+ [END] success=<true|false> steps=<n> rewards=<r1,r2,...,rn>
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ from dotenv import load_dotenv
18
+
19
+ load_dotenv()
20
+
21
+ import argparse
22
+ import json
23
+ import os
24
+ import re
25
+ import sys
26
+ import time
27
+ from typing import Any, Dict, List, Optional
28
+
29
+ import httpx
30
+ from openai import OpenAI
31
+
32
# LLM endpoint (OpenAI-compatible). Defaults to Google's Gemini OpenAI shim.
API_BASE_URL = os.getenv(
    "API_BASE_URL", "https://generativelanguage.googleapis.com/v1beta/openai/"
)
MODEL_NAME = os.getenv("MODEL_NAME", "gemini-2.5-flash")
# NOTE(review): despite the name, this holds GEMINI_API_KEY when set and only
# falls back to HF_TOKEN — confirm which credential the deployment actually uses.
HF_TOKEN = os.getenv("GEMINI_API_KEY", os.getenv("HF_TOKEN", ""))
# Base URL of the running MLOps debugger environment server.
ENV_BASE_URL = os.getenv("ENV_BASE_URL", "https://angelgupta-mlops-openenv.hf.space")
BENCHMARK = "mlops-debug-env"
TASKS = ["easy", "medium", "hard"]
# Presumably the episode-score cutoff for reporting success — confirm in run_task.
SUCCESS_THRESHOLD = 0.5

# Module-level client created at import time; an empty api_key will only fail
# at the first request, not here.
client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
43
+
44
+ # ── Complete bug reference for diagnosis guidance ─────────────────────────────
45
+
46
+ BUG_REFERENCE = {
47
+ "easy": {
48
+ "exploding_lr": {
49
+ "category": "config_error",
50
+ "file": "config.yaml",
51
+ "field": "optimizer.learning_rate",
52
+ "gold_fix": "Reduce learning_rate from 50.0 to 1e-4 (or use a scheduler with warmup)",
53
+ "symptoms": "loss explodes: 2.31 → 8.94 → 847.2 → nan by epoch 3",
54
+ },
55
+ "wrong_optimizer": {
56
+ "category": "config_error",
57
+ "file": "config.yaml",
58
+ "field": "optimizer.momentum",
59
+ "gold_fix": "Reduce momentum from 0.99 to 0.9, or switch to AdamW optimizer",
60
+ "symptoms": "oscillating loss with no convergence, SGD with momentum=0.99",
61
+ },
62
+ "batch_size_overflow": {
63
+ "category": "config_error",
64
+ "file": "config.yaml",
65
+ "field": "training.batch_size",
66
+ "gold_fix": "Reduce batch_size from 4096 to 32 or 64; current size exceeds training set",
67
+ "symptoms": "batch_size > dataset size, val accuracy 99.9% trivially",
68
+ },
69
+ },
70
+ "medium": {
71
+ "data_leakage_scaler": {
72
+ "category": "data_leakage",
73
+ "file": "preprocessing.py",
74
+ "field": "StandardScaler.fit_transform",
75
+ "gold_fix": "Fit StandardScaler only on X_train, then call transform() on X_val and X_test separately",
76
+ "symptoms": "val accuracy 99% at epoch 1, scaler.fit_transform(X_full) before split",
77
+ },
78
+ "data_leakage_overlap": {
79
+ "category": "data_leakage",
80
+ "file": "preprocessing.py",
81
+ "field": "train_test_split.random_state",
82
+ "gold_fix": "Set random_state=42 in train_test_split to ensure deterministic, non-overlapping splits",
83
+ "symptoms": "non-zero sample overlap in dataset_stats, random_state=None in train_test_split",
84
+ },
85
+ "wrong_split_ratio": {
86
+ "category": "preprocessing_bug",
87
+ "file": "preprocessing.py",
88
+ "field": "train_test_split.test_size",
89
+ "gold_fix": "Change test_size from 0.8 to 0.2 — current config trains on 20% and evaluates on 80%",
90
+ "symptoms": "test_size=0.8 in preprocessing.py, trains on 20% evaluates on 80%",
91
+ },
92
+ },
93
+ "hard": {
94
+ "label_encoder_mismatch": {
95
+ "category": "label_mismatch",
96
+ "file": "preprocessing.py",
97
+ "field": "LabelEncoder.fit_order",
98
+ "gold_fix": "Use the same LabelEncoder instance (fitted on training data) for both train and eval pipelines",
99
+ "symptoms": "val accuracy good (87%), test accuracy near-random (34%), two different LabelEncoder instances with different fit orders",
100
+ },
101
+ "silent_metric_swap": {
102
+ "category": "evaluation_bug",
103
+ "file": "eval_results.json",
104
+ "field": "metrics.val_accuracy",
105
+ "gold_fix": "Swap val_accuracy and test_accuracy assignments in the evaluation loop — metrics are mislabeled",
106
+ "symptoms": "val_accuracy suspiciously low, test_accuracy suspiciously high (reversed)",
107
+ },
108
+ "tokenizer_version_drift": {
109
+ "category": "evaluation_bug",
110
+ "file": "preprocessing.py",
111
+ "field": "tokenizer.version",
112
+ "gold_fix": "Ensure training and evaluation both use tokenizer v2 — v1 has a different vocabulary mapping for 847 tokens",
113
+ "symptoms": "training uses TOKENIZER_V2, eval uses TOKENIZER_V1, 847 tokens map to [UNK]",
114
+ },
115
+ },
116
+ }
117
+
118
# System prompt for the investigating agent. The submit_diagnosis example below
# previously said "Batch size 8192" while its own proposed_fix (and
# BUG_REFERENCE) say 4096 — the inconsistent few-shot example is fixed here.
SYSTEM_PROMPT = """You are a senior ML engineer investigating a broken training run.

INVESTIGATION STRATEGY (follow this exact order):
1. read_logs — identify the symptom
2. read_eval_results — check val vs test metric gap
3. inspect_preprocessing — look for pipeline bugs
4. read_config — check hyperparameters
5. check_dataset_stats — look for split issues
6. run_sanity_check — confirm hypothesis
7. submit_diagnosis — ONLY after steps 1-5 minimum

FAILURE CATEGORIES:
- config_error : Wrong hyperparameter
- data_leakage : Train/val contamination
- evaluation_bug : Eval pipeline uses wrong artifacts or swapped metrics
- preprocessing_bug : Data transformation applied incorrectly
- label_mismatch : Label encoding inconsistency
- architecture_bug : Model architecture misconfiguration

ROOT CAUSE FIELD FORMAT: Use dot notation. Examples:
- "optimizer.learning_rate" / "training.batch_size" / "optimizer.momentum"
- "StandardScaler.fit_transform" / "train_test_split.random_state" / "train_test_split.test_size"
- "LabelEncoder.fit_order" / "tokenizer.version" / "metrics.val_accuracy"

RESPOND WITH ONE JSON ACTION OBJECT PER TURN. Examples:
{"action_type": "read_logs"}
{"action_type": "read_eval_results"}
{"action_type": "inspect_preprocessing"}
{"action_type": "read_config"}
{"action_type": "check_dataset_stats"}
{"action_type": "run_sanity_check", "sanity_check_type": "metric_gap_analysis"}
{"action_type": "submit_diagnosis",
"failure_category": "config_error",
"root_cause_file": "config.yaml",
"root_cause_field": "training.batch_size",
"diagnosis": "Batch size 4096 exceeds training set size, causing trivial overfitting.",
"proposed_fix": "Reduce batch_size from 4096 to 32 or 64; current size exceeds training set"}

ONLY output the JSON object. No explanation. No markdown."""
157
+
158
+ DIAGNOSIS_PROMPT = """Based on your investigation, now submit your final diagnosis.
159
+
160
+ Here is the complete bug reference for this task difficulty:
161
+
162
+ {bug_ref}
163
+
164
+ Analyze the artifacts you've read and identify which specific bug matches the symptoms.
165
+ Then submit your diagnosis using the EXACT field names and fix wording from the matching bug above.
166
+
167
+ IMPORTANT: Your proposed_fix must contain the KEYWORDS from the gold_fix above. The grader uses keyword matching.
168
+
169
+ Respond with ONLY the JSON submit_diagnosis action. No explanation. No markdown."""
170
+
171
+
172
+ # ── Logging helpers ──────────────────────────────────────────────────────────
173
+
174
+
175
def log_start(task: str, env: str, model: str) -> None:
    """Emit the mandatory [START] marker line to stdout."""
    line = f"[START] task={task} env={env} model={model}"
    print(line, flush=True)
177
+
178
+
179
def log_step(
    step: int, action: str, reward: float, done: bool, error: Optional[str]
) -> None:
    """Emit the mandatory [STEP] marker line for one agent step."""
    fields = [
        f"[STEP] step={step}",
        f"action={action}",
        f"reward={reward:.2f}",
        f"done={str(done).lower()}",
        # A falsy error (None or "") is reported as the literal "null".
        f"error={error if error else 'null'}",
    ]
    print(" ".join(fields), flush=True)
188
+
189
+
190
def log_end(
    success: bool, steps: int, score: float = 0.0, rewards: Optional[List[float]] = None
) -> None:
    """Emit the mandatory [END] marker line.

    Args:
        success: Whether the episode counted as successful.
        steps:   Total steps taken in the episode.
        score:   Final episode score.
        rewards: Per-step rewards; None is treated as an empty list.
                 (Fix: the annotation previously claimed ``List[float]`` while
                 the default was None — now correctly Optional.)
    """
    rewards_str = ",".join(f"{r:.2f}" for r in (rewards or []))
    print(
        f"[END] success={str(success).lower()} steps={steps} score={score:.4f} rewards={rewards_str}",
        flush=True,
    )
200
+
201
+
202
+ # ── Environment helpers ───────────────────────────────────────────────────────
203
+
204
+
205
def env_reset(task_id: str, seed: int = 42) -> Dict[str, Any]:
    """POST /reset to the environment server; return the initial observation dict.

    Raises httpx.HTTPStatusError on a non-2xx response.
    """
    r = httpx.post(
        f"{ENV_BASE_URL}/reset", json={"task_id": task_id, "seed": seed}, timeout=30
    )
    r.raise_for_status()
    return r.json()
211
+
212
+
213
def env_step(action: Dict[str, Any]) -> Dict[str, Any]:
    """POST /step with a JSON action; return the {observation, reward, done, info} dict.

    Raises httpx.HTTPStatusError on a non-2xx response.
    """
    r = httpx.post(f"{ENV_BASE_URL}/step", json=action, timeout=30)
    r.raise_for_status()
    return r.json()
217
+
218
+
219
def build_user_msg(obs: Dict[str, Any]) -> str:
    """Format an observation dict into the user-turn text shown to the LLM."""
    seen = obs.get("artifacts_read", [])
    unseen = [
        art["name"]
        for art in obs.get("available_artifacts", [])
        if art["name"] not in seen
    ]
    summary = obs.get("run_summary", {})

    out = [
        f"=== STEP {obs.get('step_count', 0)}/{obs.get('max_steps', 30)} ===",
        f"Run: {obs.get('run_id', '')} | Model: {summary.get('model', '')} | Status: {summary.get('status', '')}",
        f"Artifacts read: {seen}",
        f"Artifacts NOT yet read: {unseen}",
        "",
        "LAST ACTION RESULT:",
        # Truncate to keep the prompt bounded; default=str covers non-JSON values.
        json.dumps(obs.get("last_action_result", {}), indent=2, default=str)[:3000],
    ]
    system_msgs = obs.get("messages", [])
    if system_msgs:
        out.append("")
        out.append("SYSTEM MESSAGES:")
        out.extend(system_msgs)
    if obs.get("done"):
        out.append("\nEpisode done.")
    return "\n".join(out)
246
+
247
+
248
def parse_action(text: str) -> Optional[Dict[str, Any]]:
    """Extract a JSON action object from an LLM reply; return None if unparseable."""
    cleaned = text.strip()
    if cleaned.startswith("```"):
        # Drop the opening and closing fence lines of a markdown code block.
        cleaned = "\n".join(cleaned.split("\n")[1:-1])
    try:
        return json.loads(cleaned)
    except Exception:
        pass
    # Fall back to the widest {...} span anywhere in the text (greedy match).
    match = re.search(r"\{[\s\S]+\}", cleaned)
    if match is not None:
        try:
            return json.loads(match.group())
        except Exception:
            pass
    return None
262
+
263
+
264
+ # ── Rate-limited LLM calls ───────────────────────────────────────────────────
265
+
266
# Client-side throttling state for call_llm: timestamp of the last request and
# the minimum spacing (seconds) enforced between requests.
_last_call_time = 0
_MIN_CALL_INTERVAL = 2.0
# NOTE(review): mid-file imports — consider moving these to the top of the module.
from openenv_state import OPENENV_STATE, OpenEnvState
import json as _json  # NOTE(review): _json appears unused in the visible code — verify.

# For hard fallback guard
_HARD_FALLBACK_USED = False
273
+
274
+
275
def _update_openenv_state(
    run_id: str,
    task_id: str,
    seed: int,
    step_count: int,
    max_steps: int,
    end_score: float,
    rewards: List[float],
    artifacts_read: List[str],
) -> None:
    """Mirror the latest episode results into the shared OPENENV_STATE object.

    Records all episode fields plus a UTC timestamp, and stores the per-task
    score in OPENENV_STATE.scores.
    """
    # Proper import instead of the previous __import__("datetime") hack;
    # kept function-local so the module's top-level imports are unchanged.
    from datetime import datetime

    OPENENV_STATE.run_id = run_id
    OPENENV_STATE.task_id = task_id
    OPENENV_STATE.seed = seed
    OPENENV_STATE.step_count = step_count
    OPENENV_STATE.max_steps = max_steps
    OPENENV_STATE.end_score = end_score
    OPENENV_STATE.rewards = rewards
    OPENENV_STATE.artifacts_read = artifacts_read
    OPENENV_STATE.timestamp = datetime.utcnow().isoformat()
    OPENENV_STATE.scores[task_id] = end_score
296
+
297
+
298
# NOTE: duplicate "_HARD_FALLBACK_USED = False" removed — the flag is already
# initialized once in the rate-limiting section above.
299
+
300
+
301
def call_llm(messages: List[Dict], model_name: Optional[str] = None) -> str:
    """Call the chat-completions API with throttling and exponential-backoff retries.

    Args:
        messages:   Chat messages in OpenAI format.
        model_name: Optional override; defaults to the configured MODEL_NAME.

    Returns:
        The stripped text content of the first completion choice.

    Raises:
        RuntimeError: After 10 consecutive failed attempts.
    """
    global _last_call_time
    model_to_use = model_name or MODEL_NAME
    for attempt in range(10):
        try:
            # Client-side throttle: keep at least _MIN_CALL_INTERVAL seconds
            # between consecutive requests.
            elapsed = time.time() - _last_call_time
            if elapsed < _MIN_CALL_INTERVAL:
                time.sleep(_MIN_CALL_INTERVAL - elapsed)

            resp = client.chat.completions.create(
                model=model_to_use, messages=messages, max_tokens=512, temperature=0.1
            )
            _last_call_time = time.time()
            return resp.choices[0].message.content.strip()
        except Exception as e:
            # Fix: the previous condition also tested `"Request rate" in msg`,
            # which is subsumed by the case-insensitive "rate" test.
            if "rate" in str(e).lower():
                wait = min(15 * (2 ** attempt), 120)
                print(
                    f" [RATE LIMIT] Waiting {wait}s (attempt {attempt + 1}/10)...",
                    flush=True,
                )
            else:
                wait = min(30 * (2 ** attempt), 300)
                print(
                    f" [RETRY] LLM error (attempt {attempt + 1}/10): {e}. Waiting {wait}s...",
                    flush=True,
                )
            time.sleep(wait)
    raise RuntimeError("LLM call failed after 10 retries")
331
+
332
+
333
# ── Fallback actions ──────────────────────────────────────────────────────────

# Deterministic investigation script used when the LLM output cannot be
# parsed: read each core artifact once, then run sanity checks.
FALLBACK_ACTIONS = [
    {"action_type": "read_logs"},
    {"action_type": "read_eval_results"},
    {"action_type": "inspect_preprocessing"},
    {"action_type": "read_config"},
    {"action_type": "check_dataset_stats"},
    {"action_type": "run_sanity_check", "sanity_check_type": "metric_gap_analysis"},
    {"action_type": "run_sanity_check", "sanity_check_type": "data_leakage"},
    {"action_type": "run_sanity_check", "sanity_check_type": "label_consistency"},
]


def get_fallback_action(step_num: int) -> Dict[str, Any]:
    """Return the scripted fallback action for 1-based step ``step_num``.

    Steps beyond the script clamp to the last entry.  Fix: ``step_num <= 0``
    previously produced a negative index and silently returned the LAST
    action; it now clamps to the first.
    """
    idx = min(max(step_num - 1, 0), len(FALLBACK_ACTIONS) - 1)
    return FALLBACK_ACTIONS[idx]
350
+
351
+
352
# ── Main agent loop ──────────────────────────────────────────────────────────


def run_task(task_id: str, seed: int = 42, alt_model: Optional[str] = None) -> float:
    """Run the agent loop for one task and return the final score.

    Flow: reset the environment → the LLM proposes actions step by step →
    fallback actions and loop-breaking keep the investigation moving → once
    every core artifact is read and enough sanity checks have run, switch to
    the diagnosis phase → force a diagnosis submission and return the score.

    Args:
        task_id: "easy" | "medium" | "hard".
        seed: forwarded to env_reset for reproducibility.
        alt_model: when set, all LLM calls use this model instead of
            MODEL_NAME (used by the hard-task retry paths below).

    Returns:
        Final episode score as reported by the environment.
    """
    global _HARD_FALLBACK_USED
    log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)

    obs = env_reset(task_id, seed)
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": f"TASK DESCRIPTION:\n{obs['task_description']}\n\n{build_user_msg(obs)}",
        },
    ]

    # NOTE(review): MAX_STEPS duplicates the server-side TASK_MAX_STEPS in
    # mlops_environment.py — keep the two tables in sync.
    MIN_STEPS = {"easy": 5, "medium": 7, "hard": 10}
    MAX_STEPS = {"easy": 20, "medium": 30, "hard": 40}
    min_steps = MIN_STEPS.get(task_id, 7)
    max_steps = MAX_STEPS.get(task_id, 30)

    # Artifacts the agent is expected to read before diagnosing.
    CORE_ARTIFACTS = {
        "train.log",
        "eval_results.json",
        "preprocessing.py",
        "config.yaml",
        "dataset_stats.json",
    }

    step_num = 0
    done = False
    final_score = 0.0
    rewards: List[float] = []
    action_history: List[str] = []       # action_type per executed step (loop detection)
    sanity_check_history: List[str] = []  # sanity_check_type per run_sanity_check
    in_diagnosis_phase = False

    def get_unread_artifacts() -> List[str]:
        # Reads the *current* obs from the enclosing scope each call.
        arts_read = set(obs.get("artifacts_read", []))
        return [a for a in CORE_ARTIFACTS if a not in arts_read]

    def get_next_unread_artifact() -> Optional[Dict[str, Any]]:
        # Map the first unread core artifact to the action that reads it.
        unread = get_unread_artifacts()
        if not unread:
            return None
        artifact_to_action = {
            "train.log": {"action_type": "read_logs"},
            "eval_results.json": {"action_type": "read_eval_results"},
            "preprocessing.py": {"action_type": "inspect_preprocessing"},
            "config.yaml": {"action_type": "read_config"},
            "dataset_stats.json": {"action_type": "check_dataset_stats"},
        }
        return artifact_to_action.get(unread[0])

    def force_new_sanity_check() -> Dict[str, Any]:
        # First sanity check not yet run; repeats metric_gap_analysis once
        # every check has been exhausted.
        all_checks = [
            "metric_gap_analysis",
            "data_leakage",
            "label_consistency",
            "encoder_version_match",
            "loss_trajectory",
            "class_balance",
            "gradient_norms",
            "feature_statistics",
        ]
        for sc in all_checks:
            if sc not in sanity_check_history:
                return {"action_type": "run_sanity_check", "sanity_check_type": sc}
        return {
            "action_type": "run_sanity_check",
            "sanity_check_type": "metric_gap_analysis",
        }

    def is_repetitive(action_type: str) -> bool:
        # True when the same action_type was executed on the last two steps.
        if len(action_history) < 2:
            return False
        return action_history[-1] == action_type and action_history[-2] == action_type

    while not done:
        step_num += 1
        unread = get_unread_artifacts()
        all_read = len(unread) == 0

        # Force submission near max steps
        if step_num >= max_steps - 1:
            in_diagnosis_phase = True

        if in_diagnosis_phase:
            # Build diagnosis prompt with bug reference
            diag_prompt = DIAGNOSIS_PROMPT.format(
                bug_ref=json.dumps(BUG_REFERENCE.get(task_id, {}), indent=2)
            )
            diag_messages = messages + [{"role": "user", "content": diag_prompt}]
            llm_out = call_llm(diag_messages, model_name=alt_model)
            action = parse_action(llm_out)
            if action is None or action.get("action_type") != "submit_diagnosis":
                # Force a diagnosis with best guess
                action = {"action_type": "submit_diagnosis"}
        else:
            llm_out = call_llm(messages, model_name=alt_model)
            action = parse_action(llm_out)

        if action is None:
            # Use fallback
            if all_read:
                action = force_new_sanity_check()
            else:
                action = get_next_unread_artifact() or get_fallback_action(step_num)

        action_type = action.get("action_type", "unknown")

        # Detect and break loops
        if is_repetitive(action_type) and action_type != "submit_diagnosis":
            if all_read:
                action = force_new_sanity_check()
            else:
                next_artifact = get_next_unread_artifact()
                if next_artifact:
                    action = next_artifact
                else:
                    action = force_new_sanity_check()

        # Track sanity checks
        if action_type == "run_sanity_check":
            sc = action.get("sanity_check_type", "")
            sanity_check_history.append(sc)

        # Enforce hard rubric before allowing hard submit: a hard-task submit
        # is replaced with a fallback action until enough artifacts/checks/steps
        # have accumulated.
        if action.get("action_type") == "submit_diagnosis" and task_id == "hard":
            artifacts_read = obs.get("artifacts_read", [])
            if (
                len(artifacts_read) < 3
                or len(sanity_check_history) < 3
                or step_num < min_steps
            ):
                action = get_fallback_action(step_num)
                log_step(
                    step=step_num,
                    action=action["action_type"],
                    reward=0,
                    done=False,
                    error=None,
                )
                result = env_step(action)
                new_obs = result["observation"]
                reward = result["reward"]
                done = result["done"]
                info = result.get("info", {})
                rewards.append(reward)
                # Continue with the next loop iteration
                if done:
                    final_score = info.get("score", reward)
                    # NOTE(review): this retry path gates on the module-global
                    # _HARD_FALLBACK_USED and returns the retry's score
                    # directly (no max() with this run's score), whereas the
                    # main done-path below gates on `alt_model is None` and
                    # keeps the better of the two — confirm both are intended.
                    if (
                        task_id == "hard"
                        and final_score < 0.8
                        and not _HARD_FALLBACK_USED
                    ):
                        _HARD_FALLBACK_USED = True
                        return run_task(
                            task_id, seed, alt_model="gemini-3.1-pro-preview"
                        )
                    break
                obs = new_obs
                # NOTE(review): llm_out here is the LLM output whose submit was
                # rejected; it is appended as-is. action_history is NOT updated
                # on this path, so loop detection is blind to these steps.
                llm_out = llm_out  # no-op, placeholder to clarify flow
                messages.append({"role": "assistant", "content": llm_out})
                messages.append({"role": "user", "content": build_user_msg(new_obs)})
                continue

        # Execute action
        result = env_step(action)
        new_obs = result["observation"]
        reward = result["reward"]
        done = result["done"]
        info = result.get("info", {})

        rewards.append(reward)
        action_str = action.get("action_type", "unknown")
        error_msg = (
            new_obs.get("last_action_result", {}).get("error")
            if isinstance(new_obs, dict)
            else None
        )

        log_step(
            step=step_num, action=action_str, reward=reward, done=done, error=error_msg
        )

        if done:
            final_score = info.get("score", reward)
            # Hard-task retry with the stronger model; keep the better score.
            if task_id == "hard" and final_score < 0.8 and alt_model is None:
                alt_score = run_task(task_id, seed, alt_model="gemini-3.1-pro-preview")
                final_score = max(final_score, alt_score)
            break

        # Update observation
        obs = new_obs
        action_history.append(action_str)

        # Check if we should enter diagnosis phase
        if not in_diagnosis_phase:
            unread = get_unread_artifacts()
            all_read = len(unread) == 0
            enough_checks = len(sanity_check_history) >= 2
            if all_read and enough_checks and step_num >= min_steps:
                in_diagnosis_phase = True

        messages.append({"role": "assistant", "content": llm_out})
        messages.append({"role": "user", "content": build_user_msg(new_obs)})

        # Keep context window manageable: retain the system prompt and the
        # initial task message, plus the most recent 26 messages.
        if len(messages) > 40:
            messages = [messages[0], messages[1]] + messages[-26:]

    success = final_score >= SUCCESS_THRESHOLD
    log_end(success=success, steps=step_num, score=final_score, rewards=rewards)
    return final_score
568
+
569
+
570
def main():
    """CLI entry point: health-check the environment server, then run task(s).

    Exits with status 1 when the environment server is unreachable.
    """
    parser = argparse.ArgumentParser(
        description="MLOps Pipeline Debugger — Baseline Agent"
    )
    parser.add_argument(
        "--task", choices=TASKS, help="Run a specific task (default: all)"
    )
    parser.add_argument(
        "--seed", type=int, default=42, help="Random seed for reproducibility"
    )
    args = parser.parse_args()

    # Fail fast if the environment server is down.
    try:
        httpx.get(f"{ENV_BASE_URL}/health", timeout=10).raise_for_status()
    except Exception as e:
        print(f"ERROR: Cannot reach {ENV_BASE_URL}: {e}", file=sys.stderr)
        sys.exit(1)

    tasks = [args.task] if args.task else TASKS
    scores = {}
    for t in tasks:
        scores[t] = run_task(t, seed=args.seed)

    # Fix: dropped the needless f-prefix on the constant banner string (F541).
    print("\n=== FINAL SCORES ===", flush=True)
    for t, s in scores.items():
        print(f" {t:8s}: {s:.4f}")
    print(f" {'AVERAGE':8s}: {sum(scores.values()) / len(scores):.4f}")


if __name__ == "__main__":
    main()
server/mlops_environment.py ADDED
@@ -0,0 +1,461 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MLOps Pipeline Debugger — Core Environment
3
+
4
+ Episode flow:
5
+ 1. reset(task_id, seed) → generates a broken training run with one planted bug
6
+ 2. Agent investigates using 8 actions (reads artifacts, runs sanity checks)
7
+ 3. Agent submits a structured diagnosis
8
+ 4. Grader compares against planted bug ground truth → score in [0.0, 1.0]
9
+
10
+ Reward design (dense, not sparse):
11
+ +0.02 per new artifact read (first time — rewards exploration)
12
+ -0.02 per duplicate artifact read (no new filter applied)
13
+ -0.05 submitting diagnosis after reading < 3 distinct artifacts
14
+
15
+ At submit_diagnosis:
16
+ +0.15 correct failure_category
17
+ +0.25 correct root_cause_file
18
+ +0.30 correct root_cause_field (substring match, case-insensitive)
19
+ +0.30 correct proposed_fix (keyword match against gold fix)
20
+
21
+ Task 3 (hard) penalty multiplier:
22
+ wrong diagnosis → ×1.5 penalty on the missed components
23
+ (silent bugs that reach production are more costly)
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import random
29
+ from typing import Any, Dict, List, Optional, Tuple
30
+
31
+ from models import MLOpsAction, MLOpsObservation, MLOpsState, ArtifactMeta
32
+ from artifact_generator import (
33
+ ArtifactGenerator, BUG_CATALOGUE, TASK_BUG_POOLS,
34
+ run_sanity_check,
35
+ )
36
+
37
+
38
# Per-task investigation budget (steps) before the episode is forced to time out.
TASK_MAX_STEPS = {"easy": 20, "medium": 30, "hard": 40}
39
+
40
+ TASK_DESCRIPTIONS = {
41
+ "easy": (
42
+ "TASK 1 — CONFIG ERROR DIAGNOSIS (Easy)\n\n"
43
+ "A training run has failed or produced clearly wrong results. The issue is in "
44
+ "the training configuration — a hyperparameter is set to an incorrect value that "
45
+ "causes immediate, visible degradation in training metrics.\n\n"
46
+ "Your job: investigate the training artifacts, identify which configuration "
47
+ "parameter is wrong, and propose the correct fix.\n\n"
48
+ "Strategy: Start by reading the training logs to observe symptom patterns, "
49
+ "then check the config to find the misconfigured parameter. "
50
+ "Run sanity checks (loss_trajectory, gradient_norms) to confirm your hypothesis "
51
+ "before submitting.\n\n"
52
+ "Actions available: read_config | read_logs | check_dataset_stats | "
53
+ "inspect_preprocessing | read_eval_results | run_sanity_check | "
54
+ "query_artifact | submit_diagnosis"
55
+ ),
56
+ "medium": (
57
+ "TASK 2 — DATA LEAKAGE DETECTION (Medium)\n\n"
58
+ "Training metrics look suspiciously good — validation accuracy is anomalously "
59
+ "high from the first epoch, but test performance tells a different story. "
60
+ "The issue is in the data preprocessing pipeline.\n\n"
61
+ "Your job: identify the exact source of data leakage — whether it's a scaler "
62
+ "fitted on the full dataset, overlapping train/val splits from a non-deterministic "
63
+ "split, or an inverted split ratio — and propose the correct fix.\n\n"
64
+ "Strategy: Anomalous val accuracy in the logs is your first signal. "
65
+ "Inspect preprocessing code to find how splits are constructed. "
66
+ "Run the data_leakage and feature_statistics sanity checks to confirm. "
67
+ "The val/test metric gap in eval results is another key clue.\n\n"
68
+ "Actions available: read_config | read_logs | check_dataset_stats | "
69
+ "inspect_preprocessing | read_eval_results | run_sanity_check | "
70
+ "query_artifact | submit_diagnosis"
71
+ ),
72
+ "hard": (
73
+ "TASK 3 — SILENT EVALUATION BUG (Hard)\n\n"
74
+ "Training completed normally. Validation metrics look reasonable. "
75
+ "But test set performance is catastrophically below validation — "
76
+ "and there are NO error logs, NO warnings, NO exceptions thrown.\n\n"
77
+ "Your job: find the silent bug in the evaluation pipeline. It could be "
78
+ "a label encoder mismatch between train and eval (different class orderings), "
79
+ "a metric assignment swap (val/test results mislabeled), or a tokenizer "
80
+ "version drift (training used v2, evaluation uses v1).\n\n"
81
+ "Strategy: The val/test metric gap in eval_results is your only initial signal. "
82
+ "Run metric_gap_analysis first to quantify the anomaly. Then systematically "
83
+ "check label_consistency, encoder_version_match, and inspect the preprocessing "
84
+ "code carefully — the bug produces no error output and will only be visible "
85
+ "by comparing train vs eval pipeline definitions.\n\n"
86
+ "WARNING: Missing this bug in a deployed model means silent wrong predictions "
87
+ "in production. Penalty for wrong diagnosis is weighted 1.5×.\n\n"
88
+ "Actions available: read_config | read_logs | check_dataset_stats | "
89
+ "inspect_preprocessing | read_eval_results | run_sanity_check | "
90
+ "query_artifact | submit_diagnosis"
91
+ ),
92
+ }
93
+
94
# name → (human-readable description, approximate size hint); surfaced to the
# agent as available_artifacts metadata in every observation.
ARTIFACT_DESCRIPTIONS = {
    "config.yaml": ("Training configuration — hyperparameters, model, optimizer, scheduler", "~45 lines"),
    "train.log": ("Epoch-by-epoch training metrics — loss, accuracy, gradient norms", "~30–60 lines"),
    "dataset_stats.json": ("Dataset split sizes, class distribution, feature statistics", "~35 fields"),
    "preprocessing.py": ("Data preprocessing pipeline — splits, normalization, encoding", "~40–70 lines"),
    "eval_results.json": ("Final evaluation metrics — val and test loss/accuracy", "~15 fields"),
    "model_card.json": ("Model architecture summary, training config, preprocessing versions", "~20 fields"),
}
102
+
103
+
104
+ class MLOpsEnvironment:
105
+ """OpenEnv-compatible MLOps Pipeline Debugging environment."""
106
+
107
    def __init__(self, task_id: str = "easy"):
        """Create an environment for ``task_id`` ("easy" | "medium" | "hard").

        Immediately generates an initial episode with seed 42; call
        ``reset()`` to start a fresh episode with a different seed.
        """
        assert task_id in TASK_MAX_STEPS, f"task_id must be one of {list(TASK_MAX_STEPS)}"
        self.task_id = task_id
        self._reset_internal(seed=42)
111
+
112
    def _reset_internal(self, seed: int):
        """(Re)generate a full episode: pick a bug, synthesize artifacts, zero state.

        Deterministic for a given (task_id, seed) pair — the same bug and the
        same artifact contents are produced every time.
        """
        rng = random.Random(seed)

        # Pick bug from this task's pool
        pool = TASK_BUG_POOLS[self.task_id]
        self.bug_type = rng.choice(pool)
        self.bug = BUG_CATALOGUE[self.bug_type]

        # Generate all artifacts
        gen = ArtifactGenerator(self.bug_type, seed)
        self._artifacts: Dict[str, str] = gen.generate_all()
        self._model_cfg = gen.model_cfg
        self._run_id = gen.run_id
        self._rng = rng
        self._seed = seed

        # Cache artifact metadata at reset time (avoids consuming RNG per step)
        self._artifact_meta: List[ArtifactMeta] = [
            ArtifactMeta(
                name=name,
                description=ARTIFACT_DESCRIPTIONS[name][0],
                size_hint=ARTIFACT_DESCRIPTIONS[name][1],
                last_modified=f"2024-03-{rng.randint(1,28):02d}",
            )
            for name in self._artifacts
        ]

        # Episode state
        self._step_count = 0                          # actions taken so far
        self._max_steps = TASK_MAX_STEPS[self.task_id]
        self._done = False
        self._artifacts_read: List[str] = []          # first-read order
        self._last_read_filters: Dict[str, str] = {}  # artifact → last filter used
        self._sanity_checks_run: List[str] = []       # distinct checks run
        self._duplicate_queries = 0
        self._current_score = 0.0
        self._messages: List[str] = []                # warnings surfaced to the agent
149
+
150
+ # ── OpenEnv API ───────────────────────────────────────────────────────────
151
+
152
+ def reset(self, seed: Optional[int] = None) -> MLOpsObservation:
153
+ import time
154
+ actual_seed = seed if seed is not None else int(time.time() * 1000) % 100000
155
+ self._reset_internal(actual_seed)
156
+ return self._build_obs(
157
+ {"status": "reset", "message": "New training run loaded. Begin investigation."},
158
+ )
159
+
160
    def step(self, action: MLOpsAction) -> Tuple[MLOpsObservation, float, bool, Dict[str, Any]]:
        """Apply one agent action; return (observation, reward, done, info).

        Dispatches on ``action.action_type``.  An unrecognized action type
        falls through every branch: the step is still counted and returns an
        empty result with zero reward (no error is raised).  ``info`` is
        non-empty only on submit (score breakdown) and timeout.
        """
        if self._done:
            return self._build_obs({"status": "done", "message": "Episode over. Call reset()."}), 0.0, True, {}

        self._step_count += 1
        reward = 0.0
        info: Dict[str, Any] = {}
        result: Dict[str, Any] = {}

        # Timeout check runs BEFORE dispatch: the step that reaches the budget
        # is not executed, and the episode ends with the score accrued so far.
        if self._step_count >= self._max_steps:
            self._done = True
            score = self._current_score
            result = {"status": "timeout", "message": f"Max steps ({self._max_steps}) reached.", "score": score}
            return self._build_obs(result), 0.0, True, {"score": score, "reason": "timeout"}

        atype = action.action_type

        # ── read_config ───────────────────────────────────────────────────
        if atype == "read_config":
            reward, result = self._handle_artifact_read("config.yaml", None)

        # ── read_logs ─────────────────────────────────────────────────────
        elif atype == "read_logs":
            reward, result = self._handle_artifact_read("train.log", action.log_filter)

        # ── check_dataset_stats ───────────────────────────────────────────
        elif atype == "check_dataset_stats":
            reward, result = self._handle_artifact_read("dataset_stats.json", None)

        # ── inspect_preprocessing ─────────────────────────────────────────
        elif atype == "inspect_preprocessing":
            reward, result = self._handle_artifact_read("preprocessing.py", None)

        # ── read_eval_results ─────────────────────────────────────────────
        elif atype == "read_eval_results":
            reward, result = self._handle_artifact_read("eval_results.json", None)

        # ── run_sanity_check ──────────────────────────────────────────────
        elif atype == "run_sanity_check":
            check = action.sanity_check_type
            if not check:
                result = {"status": "error", "message": "sanity_check_type is required."}
            else:
                check_result = run_sanity_check(check, self.bug_type, self._artifacts, self._rng)
                if check not in self._sanity_checks_run:
                    self._sanity_checks_run.append(check)
                    reward += 0.01  # small reward for running new checks
                result = {"status": "ok", "sanity_check": check_result}

        # ── query_artifact ────────────────────────────────────────────────
        elif atype == "query_artifact":
            art = action.artifact_name
            field = action.field_path
            if not art or not field:
                result = {"status": "error", "message": "artifact_name and field_path are required."}
            elif art not in self._artifacts:
                result = {"status": "error", "message": f"Artifact '{art}' not found."}
            else:
                val = self._resolve_field(art, field)
                result = {"status": "ok", "artifact": art, "field": field, "value": val}

        # ── submit_diagnosis ──────────────────────────────────────────────
        elif atype == "submit_diagnosis":
            # Terminal: grades the diagnosis and ends the episode.
            reward, info, result = self._handle_submit(action)
            self._done = True

        obs = self._build_obs(result)
        return obs, reward, self._done, info
228
+
229
    # ── Internal handlers ──────────────────────────────────────────────────────

    def _handle_artifact_read(self, artifact: str, log_filter: Optional[str]) -> Tuple[float, Dict]:
        """Return (reward, result) for reading ``artifact``.

        Rewards: +0.02 on the first read of an artifact; -0.02 (plus a warning
        message) on an exact duplicate read (same artifact AND same filter).
        A re-read with a NEW filter earns 0 and incurs no penalty.
        ``log_filter`` only applies to train.log: either a keyword (matched
        case-insensitively per line) or an "epoch:N-M" range.
        """
        # Duplicate = already read with the exact same filter string.
        is_duplicate = (
            artifact in self._artifacts_read
            and self._last_read_filters.get(artifact, "") == (log_filter or "")
        )

        content = self._artifacts[artifact]

        # Apply log filter
        if artifact == "train.log" and log_filter:
            lines = content.split("\n")
            if log_filter.startswith("epoch:"):
                try:
                    parts = log_filter.split(":")[1].split("-")
                    # "epoch:N" → start == end == N; "epoch:N-M" → [N, M].
                    start, end = int(parts[0]), int(parts[1]) if len(parts) > 1 else int(parts[0])
                    # INFO/ERROR lines are always kept alongside the epoch range.
                    filtered = [l for l in lines if any(f"EPOCH {ep:03d}" in l
                                for ep in range(start, end+1)) or "[INFO ]" in l or "[ERROR" in l]
                    content = "\n".join(filtered) if filtered else "No log lines match this epoch range."
                except Exception:
                    # Malformed range → fall back to the full log.
                    content = "\n".join(lines)
            else:
                kw = log_filter.lower()
                filtered = [l for l in lines if kw in l.lower()]
                content = "\n".join(filtered) if filtered else f"No log lines contain '{log_filter}'."

        reward = 0.0
        if artifact not in self._artifacts_read:
            self._artifacts_read.append(artifact)
            reward = 0.02  # first read reward
        elif is_duplicate:
            self._duplicate_queries += 1
            reward = -0.02  # duplicate penalty
            self._messages.append(f"⚠️ Duplicate read of {artifact} with same filter. Try a different filter or a new artifact.")

        self._last_read_filters[artifact] = log_filter or ""

        return reward, {
            "status": "ok",
            "artifact": artifact,
            "content": content,
            "note": "Use log_filter='keyword' or 'epoch:N-M' for targeted log queries.",
        }
273
+
274
    def _handle_submit(self, action: MLOpsAction) -> Tuple[float, Dict, Dict]:
        """Grade a diagnosis against the planted bug; return (score, info, result).

        Scoring (see module docstring): -0.05 if fewer than 3 artifacts were
        read, +0.15 category, +0.25 file, +0.30 field (majority-keyword match),
        up to +0.30 fix (fractional keyword overlap).  Hard tasks below 0.70
        take an extra 1.5× penalty on the missed portion.  Final score is
        clamped to [0, 1] and rounded to 4 decimals.
        """
        if len(self._artifacts_read) < 3:
            # Penalty for submitting without adequate investigation
            base_penalty = -0.05
            self._messages.append("⚠️ Submitted diagnosis after reading fewer than 3 artifacts.")
        else:
            base_penalty = 0.0

        score = base_penalty
        breakdown: Dict[str, Any] = {}

        # 1. failure_category (+0.15) — exact match only
        if action.failure_category == self.bug.category:
            score += 0.15
            breakdown["failure_category"] = {"awarded": 0.15, "correct": True}
        else:
            breakdown["failure_category"] = {
                "awarded": 0.0, "correct": False,
                "expected": self.bug.category, "got": action.failure_category,
            }

        # 2. root_cause_file (+0.25) — case-insensitive exact match
        if action.root_cause_file and action.root_cause_file.lower() == self.bug.file.lower():
            score += 0.25
            breakdown["root_cause_file"] = {"awarded": 0.25, "correct": True}
        else:
            breakdown["root_cause_file"] = {
                "awarded": 0.0, "correct": False,
                "expected": self.bug.file, "got": action.root_cause_file,
            }

        # 3. root_cause_field (+0.30) — require majority of keywords to match
        # Gold field is tokenized on dots/whitespace; 1-char tokens dropped.
        field_keywords = [kw.lower() for kw in self.bug.field.replace(".", " ").split() if len(kw) > 1]
        submitted_field = (action.root_cause_field or "").lower()
        field_matches = sum(1 for kw in field_keywords if kw in submitted_field)
        field_threshold = max(1, len(field_keywords) // 2 + 1)  # majority
        field_correct = len(field_keywords) > 0 and field_matches >= field_threshold
        if field_correct:
            score += 0.30
            breakdown["root_cause_field"] = {"awarded": 0.30, "correct": True}
        else:
            breakdown["root_cause_field"] = {
                "awarded": 0.0, "correct": False,
                "expected": self.bug.field, "got": action.root_cause_field,
                "matched_keywords": field_matches, "required": field_threshold,
            }

        # 4. proposed_fix (+0.30) — keyword match against gold fix
        import re as _re
        _stop = {"to", "the", "a", "an", "of", "in", "on", "from", "use", "with", "and", "or", "for", "is", "at", "by"}
        # Strip punctuation from keywords so "(fitted" becomes "fitted"
        fix_keywords = {
            _re.sub(r'[^a-z0-9_.]', '', w)
            for w in self.bug.gold_fix.lower().split()
        } - _stop
        fix_keywords.discard("")  # remove empty strings
        submitted_fix = (action.proposed_fix or "").lower()
        fix_overlap = sum(1 for kw in fix_keywords if kw in submitted_fix)
        # Fractional credit, capped at the full 0.30.
        fix_score = min(0.30, 0.30 * (fix_overlap / max(1, len(fix_keywords))))
        score += fix_score
        breakdown["proposed_fix"] = {
            "awarded": round(fix_score, 4),
            "correct": fix_score >= 0.20,
            "keyword_overlap": fix_overlap,
            "total_keywords": len(fix_keywords),
        }

        # Hard task penalty multiplier — silent bugs are more costly
        if self.task_id == "hard" and score < 0.70:
            missed = 0.70 - min(score, 0.70)
            score -= missed * 0.5  # 1.5× penalty on missed components
            breakdown["hard_task_penalty_applied"] = True

        score = round(max(0.0, min(1.0, score)), 4)
        self._current_score = score

        # info: full grading detail including ground truth (server-side only).
        info = {
            "score": score,
            "breakdown": breakdown,
            "ground_truth": {
                "bug_type": self.bug_type,
                "category": self.bug.category,
                "file": self.bug.file,
                "field": self.bug.field,
                "gold_fix": self.bug.gold_fix,
            },
            "investigation": {
                "artifacts_read": self._artifacts_read,
                "sanity_checks_run": self._sanity_checks_run,
                "duplicate_queries": self._duplicate_queries,
                "steps_taken": self._step_count,
            },
        }
        # result: what the agent sees in last_action_result.
        result = {
            "status": "submitted",
            "score": score,
            "breakdown": breakdown,
            "message": f"Diagnosis submitted. Score: {score:.4f}/{1.0:.4f}",
        }
        return score, info, result
374
+
375
+ def _resolve_field(self, artifact: str, field_path: str) -> Any:
376
+ """Resolve a dot-notation field path from a JSON artifact."""
377
+ import json as _json
378
+ content = self._artifacts[artifact]
379
+ if artifact.endswith(".json"):
380
+ try:
381
+ data = _json.loads(content)
382
+ parts = field_path.split(".")
383
+ val = data
384
+ for p in parts:
385
+ if isinstance(val, dict):
386
+ val = val.get(p, f"Field '{p}' not found")
387
+ else:
388
+ return f"Cannot traverse into non-dict at '{p}'"
389
+ return val
390
+ except Exception as e:
391
+ return f"Parse error: {e}"
392
+ elif artifact.endswith(".yaml"):
393
+ # Simple key search for YAML
394
+ for line in content.split("\n"):
395
+ target_key = field_path.split(".")[-1]
396
+ if f"{target_key}:" in line:
397
+ return line.strip()
398
+ return f"Field '{field_path}' not found in config"
399
+ else:
400
+ # For .py files, return lines containing the field name
401
+ target = field_path.split(".")[-1]
402
+ matches = [l.strip() for l in content.split("\n") if target in l]
403
+ return matches[:5] if matches else f"'{target}' not found in {artifact}"
404
+
405
    def _build_obs(self, last_result: Dict[str, Any]) -> MLOpsObservation:
        """Assemble the observation returned to the agent after every action.

        ``last_result`` is the outcome dict of the action just processed.
        Lists are copied (``list(...)``) so the agent cannot mutate internal
        episode state through the observation.
        """
        return MLOpsObservation(
            task_id=self.task_id,
            task_description=TASK_DESCRIPTIONS[self.task_id],
            run_id=self._run_id,
            run_summary={
                "model": self._model_cfg["name"],
                "dataset": self._model_cfg["dataset"],
                "task": self._model_cfg["type"],
                # Easy task presents as an outright failure; medium/hard look
                # "completed" so the anomaly must be discovered via metrics.
                "status": "FAILED" if self.task_id == "easy" else "COMPLETED_WITH_ANOMALIES",
                "note": "Investigate artifacts to determine root cause.",
            },
            available_artifacts=list(self._artifact_meta),
            artifacts_read=list(self._artifacts_read),
            last_action_result=last_result,
            step_count=self._step_count,
            max_steps=self._max_steps,
            done=self._done,
            messages=list(self._messages),
        )
425
+
426
    @property
    def state(self) -> MLOpsState:
        """Full privileged episode state, including planted-bug ground truth.

        For server/grader use only — exposing this to the agent would leak
        the answer (bug type, file, field, and gold fix).
        """
        return MLOpsState(
            task_id=self.task_id,
            seed=self._seed,
            step_count=self._step_count,
            max_steps=self._max_steps,
            episode_done=self._done,
            bug_type=self.bug_type,
            bug_category=self.bug.category,
            bug_file=self.bug.file,
            bug_field=self.bug.field,
            gold_fix=self.bug.gold_fix,
            artifacts=self._artifacts,
            artifacts_read=list(self._artifacts_read),
            sanity_checks_run=list(self._sanity_checks_run),
            duplicate_queries=self._duplicate_queries,
            current_score=self._current_score,
        )
445
+
446
+
447
# ─── Standalone grader ────────────────────────────────────────────────────────


def grade_task(task_id: str, seed: int, diagnosis: Dict[str, Any]) -> float:
    """Deterministic grader callable by OpenEnv validation framework.

    Bypasses the artifact-read penalty since the grader only evaluates
    diagnosis quality, not investigation thoroughness.

    ``diagnosis`` must contain MLOpsAction submit fields (failure_category,
    root_cause_file, root_cause_field, proposed_fix).
    """
    env = MLOpsEnvironment(task_id=task_id)
    env.reset(seed=seed)
    # Pre-populate artifact reads to avoid the < 3 artifacts penalty
    # (reaches into a private field by design — keep in sync with the env).
    env._artifacts_read = list(env._artifacts.keys())
    action = MLOpsAction(action_type="submit_diagnosis", **diagnosis)
    _, reward, _, info = env.step(action)
    # info["score"] is authoritative; on submit it equals the returned reward.
    return info.get("score", 0.0)
server/models.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MLOps Pipeline Debugger — Pydantic Models
3
+
4
+ The agent acts as an ML engineer investigating a broken training run.
5
+ It has access to training artifacts (logs, configs, dataset stats, preprocessing code)
6
+ and must diagnose the root cause through systematic investigation.
7
+
8
+ Action Space:
9
+ read_config → Get training configuration (hyperparams, model arch, optimizer)
10
+ read_logs → Get training logs (filterable by keyword/epoch range)
11
+ check_dataset_stats → Get dataset split sizes, class distribution, feature statistics
12
+ inspect_preprocessing → Read preprocessing pipeline code
13
+ read_eval_results → Get validation and test set evaluation metrics
14
+ run_sanity_check → Compute a specific diagnostic check (label overlap, class balance, etc.)
15
+ query_artifact → Fetch a specific field from any artifact
16
+ submit_diagnosis → Final answer — triggers grading
17
+
18
+ Observation Space:
19
+ task_id, task_description
20
+ available_artifacts — list of artifacts the agent can inspect
21
+ last_action_result — result of the most recent action
22
+ artifacts_read — which artifacts have been read so far (exploration tracking)
23
+ step_count, max_steps
24
+ done
25
+ """
26
+
27
+ from __future__ import annotations
28
+ from typing import Any, Dict, List, Literal, Optional
29
+ from pydantic import BaseModel, Field
30
+
31
+
32
+ # ─── Action ──────────────────────────────────────────────────────────────────
33
+
34
class MLOpsAction(BaseModel):
    """
    One action the agent can take per step.

    ``action_type`` selects the branch; the remaining fields are only
    consulted for the action types listed next to them:
        read_config           -> (no extra fields)
        read_logs             -> log_filter (optional keyword or "epoch:N-M")
        check_dataset_stats   -> (no extra fields)
        inspect_preprocessing -> (no extra fields)
        read_eval_results     -> (no extra fields)
        run_sanity_check      -> sanity_check_type (required)
        query_artifact        -> artifact_name + field_path (required)
        submit_diagnosis      -> all diagnosis fields (required)
    """

    action_type: Literal[
        "read_config",
        "read_logs",
        "check_dataset_stats",
        "inspect_preprocessing",
        "read_eval_results",
        "run_sanity_check",
        "query_artifact",
        "submit_diagnosis",
    ] = Field(..., description="Which action to perform")

    # --- read_logs ---
    log_filter: Optional[str] = Field(
        None,
        description="Filter logs by keyword (e.g. 'nan', 'warning', 'error') or epoch range 'epoch:1-5'",
    )

    # --- run_sanity_check ---
    sanity_check_type: Optional[Literal[
        "label_consistency",      # Are train/eval label mappings identical?
        "data_leakage",           # Is there train/val sample overlap?
        "gradient_norms",         # Are gradient norms within normal range?
        "class_balance",          # Are classes balanced across splits?
        "feature_statistics",     # Do train/val feature distributions match?
        "encoder_version_match",  # Do all pipeline stages use the same encoder version?
        "loss_trajectory",        # Is the loss curve shape anomalous?
        "metric_gap_analysis",    # Is val vs test metric gap suspiciously large?
    ]] = Field(None, description="Type of sanity check to run")

    # --- query_artifact ---
    artifact_name: Optional[Literal[
        "config.yaml",
        "train.log",
        "dataset_stats.json",
        "preprocessing.py",
        "eval_results.json",
        "model_card.json",
    ]] = Field(None, description="Artifact to query a specific field from")
    field_path: Optional[str] = Field(
        None,
        description="Dot-notation field path, e.g. 'optimizer.learning_rate' or 'metrics.val_accuracy'",
    )

    # --- submit_diagnosis ---
    failure_category: Optional[Literal[
        "config_error",       # Wrong hyperparameter value
        "data_leakage",       # Train/val contamination
        "evaluation_bug",     # Eval pipeline uses wrong artifacts
        "preprocessing_bug",  # Data transformation applied incorrectly
        "label_mismatch",     # Label encoding inconsistency
        "architecture_bug",   # Model architecture misconfiguration
    ]] = Field(None, description="Category of the failure")
    root_cause_file: Optional[str] = Field(
        None, description="Which artifact file contains the root cause"
    )
    root_cause_field: Optional[str] = Field(
        None, description="Specific parameter, function, or variable that is wrong"
    )
    diagnosis: Optional[str] = Field(
        None, description="Natural language explanation of what went wrong and why"
    )
    proposed_fix: Optional[str] = Field(
        None, description="Concrete change that would fix the issue"
    )
112
+
113
+
114
+ # ─── Observation ─────────────────────────────────────────────────────────────
115
+
116
class ArtifactMeta(BaseModel):
    """Listing entry for one artifact the agent may inspect."""

    name: str           # artifact filename, e.g. "config.yaml"
    description: str    # one-line summary of the artifact's contents
    size_hint: str      # e.g. "47 lines", "12 fields"
    last_modified: str  # timestamp string
121
+
122
+
123
class MLOpsObservation(BaseModel):
    """Everything the agent sees after each step / reset."""

    task_id: str
    task_description: str

    # Run summary — always visible
    run_id: str
    run_summary: Dict[str, Any] = Field(
        description="High-level run info: model, dataset, final loss, training status"
    )

    # Artifact catalogue plus exploration tracking
    available_artifacts: List[ArtifactMeta]
    artifacts_read: List[str] = Field(
        default_factory=list,
        description="Names of artifacts the agent has already read",
    )

    # Result payload of the most recent action (empty dict right after reset)
    last_action_result: Dict[str, Any] = Field(default_factory=dict)

    # Episode bookkeeping
    step_count: int = 0
    max_steps: int = 30
    done: bool = False
    messages: List[str] = Field(default_factory=list)
146
+
147
+
148
+ # ─── State ───────────────────────────────────────────────────────────────────
149
+
150
class MLOpsState(BaseModel):
    """Full internal state — for RL harness and debugging."""

    task_id: str
    seed: int
    step_count: int
    max_steps: int
    episode_done: bool

    # Planted-bug ground truth (hidden from the agent)
    bug_type: str
    bug_category: str
    bug_file: str
    bug_field: str
    gold_fix: str

    # All generated artifacts (full text), keyed by artifact name
    artifacts: Dict[str, str]

    # Agent's investigation history
    artifacts_read: List[str]
    sanity_checks_run: List[str]
    duplicate_queries: int

    current_score: float
server/openenv.yaml ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: mlops-debug-env
2
+ version: "1.0.0"
3
+ description: >
4
+ MLOps Pipeline Debugger: an AI agent acts as a senior ML engineer
5
+ investigating a broken training run. The environment procedurally generates
6
+ realistic training artifacts (logs, configs, preprocessing code, eval results)
7
+ with one planted fault. The agent must systematically investigate and submit
8
+ a structured diagnosis. Three tasks: config error (easy) → data leakage (medium)
9
+ → silent evaluation bug (hard). All graders are fully deterministic.
10
+ author: Mohit Goyal
11
+ license: MIT
12
+ tags: [openenv, rl, mlops, debugging, machine-learning, agents]
13
+ tasks:
14
+ - id: easy
15
+ name: Config Error Diagnosis
16
+ difficulty: easy
17
+ max_steps: 20
18
+ bug_pool: [exploding_lr, wrong_optimizer, batch_size_overflow]
19
+ reward_range: [0.0, 1.0]
20
+ - id: medium
21
+ name: Data Leakage Detection
22
+ difficulty: medium
23
+ max_steps: 30
24
+ bug_pool: [data_leakage_scaler, data_leakage_overlap, wrong_split_ratio]
25
+ reward_range: [0.0, 1.0]
26
+ - id: hard
27
+ name: Silent Evaluation Bug
28
+ difficulty: hard
29
+ max_steps: 40
30
+ bug_pool: [label_encoder_mismatch, silent_metric_swap, tokenizer_version_drift]
31
+ reward_range: [0.0, 1.0]
32
+ asymmetric_penalty: true
33
+ action_space:
34
+ type: discrete_structured
35
+ actions: [read_config, read_logs, check_dataset_stats, inspect_preprocessing,
36
+ read_eval_results, run_sanity_check, query_artifact, submit_diagnosis]
37
+ observation_space:
38
+ type: structured_text
39
+ fields: [task_id, run_summary, available_artifacts, artifacts_read,
40
+ last_action_result, step_count, max_steps, done, messages]
41
+ reward:
42
+ type: dense_and_terminal
43
+ per_step: "+0.02 new artifact read, -0.02 duplicate read, +0.01 new sanity check"
44
+ terminal: "0.15 category + 0.25 file + 0.30 field + 0.30 fix. Hard task 1.5x penalty."
45
+ api:
46
+ reset: POST /reset
47
+ step: POST /step
48
+ state: GET /state
49
+ health: GET /health
50
+ websocket: /ws
51
+ runtime:
52
+ port: 7860
53
+ workers: 1
54
+ framework: fastapi
55
+ python: "3.11"
server/openenv_state.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from __future__ import annotations

from datetime import datetime, timezone
from typing import Dict, List

from pydantic import BaseModel


class OpenEnvState(BaseModel):
    """Snapshot of the current OpenEnv run (served by the FastAPI app)."""

    run_id: str
    task_id: str
    seed: int
    step_count: int
    max_steps: int
    scores: Dict[str, float]    # per-task scores, keyed by difficulty
    end_score: float
    rewards: List[float]        # per-step reward trace
    artifacts_read: List[str]
    timestamp: str              # ISO-8601 timestamp, UTC


# Global current state (mutable during run)
OPENENV_STATE: OpenEnvState = OpenEnvState(
    run_id="",
    task_id="",
    seed=0,
    step_count=0,
    max_steps=30,
    scores={"easy": 0.0, "medium": 0.0, "hard": 0.0},
    end_score=0.0,
    rewards=[],
    artifacts_read=[],
    # datetime.utcnow() is deprecated (Python 3.12+) and yields a *naive*
    # datetime; use a timezone-aware UTC timestamp instead.
    timestamp=datetime.now(timezone.utc).isoformat(),
)
server/pyproject.toml ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "mlops-openenv"
3
+ version = "1.0.0"
4
+ description = "MLOps Pipeline Debugger - OpenEnv-compatible RL environment for ML training debugging"
5
+ readme = "README.md"
6
+ requires-python = ">=3.11"
7
+ license = {text = "MIT"}
8
+ authors = [{name = "MLOps Team"}]
9
+
10
+ dependencies = [
11
+ "fastapi>=0.115.0",
12
+ "uvicorn[standard]>=0.30.0",
13
+ "pydantic>=2.9.0",
14
+ "httpx>=0.27.0",
15
+ "openai>=1.51.0",
16
+ "websockets>=13.0",
17
+ "numpy>=1.26.0",
18
+ "python-multipart>=0.0.12",
19
+ "python-dotenv>=1.0.0",
20
+ "openenv-core>=0.2.0",
21
+ ]
22
+
23
+ [project.scripts]
25
+ server = "app:main"  # entry points must be "module:function" — CLI args cannot be embedded; have app.main() call uvicorn.run(app, host="0.0.0.0", port=7860)
25
+
26
+ [project.optional-dependencies]
27
+ dev = [
28
+ "pytest>=8.0.0",
29
+ "ruff>=0.6.0",
30
+ ]
31
+
32
+ [build-system]
33
+ requires = ["setuptools>=61.0"]
34
+ build-backend = "setuptools.build_meta"
35
+
36
+ [tool.ruff]
37
+ line-length = 100
38
+ target-version = "py311"
39
+
40
+ [tool.pytest.ini_options]
41
+ testpaths = ["tests"]
42
+ python_files = ["test_*.py"]
server/requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.115.0
2
+ uvicorn[standard]==0.30.6
3
+ pydantic==2.9.2
4
+ httpx==0.27.2
5
+ openai==1.51.0
6
+ websockets==13.1
7
+ numpy==1.26.4
8
+ python-multipart==0.0.12
+ python-dotenv==1.0.1
+ openenv-core==0.2.0
server/uv.lock ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated by uv
2
+ # This is a stub lockfile - real lockfile would be much larger
3
+
4
+ [[package]]
5
+ name = "openenv-core"
6
+ version = "0.2.0"
7
+ source = { registry = "https://pypi.org/simple" }
8
+
9
+ [[package]]
10
+ name = "fastapi"
11
+ version = "0.115.0"
12
+ source = { registry = "https://pypi.org/simple" }
13
+
14
+ [[package]]
15
+ name = "uvicorn"
16
+ version = "0.30.6"
17
+ source = { registry = "https://pypi.org/simple" }
18
+
19
+ [[package]]
20
+ name = "pydantic"
21
+ version = "2.9.2"
22
+ source = { registry = "https://pypi.org/simple" }
23
+
24
+ [[package]]
25
+ name = "httpx"
26
+ version = "0.27.2"
27
+ source = { registry = "https://pypi.org/simple" }
28
+
29
+ [[package]]
30
+ name = "openai"
31
+ version = "1.51.0"
32
+ source = { registry = "https://pypi.org/simple" }
uv.lock ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated by uv
2
+ # This is a stub lockfile - real lockfile would be much larger
3
+
4
+ [[package]]
5
+ name = "openenv-core"
6
+ version = "0.2.0"
7
+ source = { registry = "https://pypi.org/simple" }
8
+
9
+ [[package]]
10
+ name = "fastapi"
11
+ version = "0.115.0"
12
+ source = { registry = "https://pypi.org/simple" }
13
+
14
+ [[package]]
15
+ name = "uvicorn"
16
+ version = "0.30.6"
17
+ source = { registry = "https://pypi.org/simple" }
18
+
19
+ [[package]]
20
+ name = "pydantic"
21
+ version = "2.9.2"
22
+ source = { registry = "https://pypi.org/simple" }
23
+
24
+ [[package]]
25
+ name = "httpx"
26
+ version = "0.27.2"
27
+ source = { registry = "https://pypi.org/simple" }
28
+
29
+ [[package]]
30
+ name = "openai"
31
+ version = "1.51.0"
32
+ source = { registry = "https://pypi.org/simple" }