trretretret commited on
Commit
6a71058
·
1 Parent(s): 696c12c

Add uv.lock, pyproject.toml, server directory

Browse files
pyproject.toml CHANGED
@@ -17,8 +17,12 @@ dependencies = [
17
  "numpy>=1.26.0",
18
  "python-multipart>=0.0.12",
19
  "dotenv>=1.0.0",
 
20
  ]
21
 
 
 
 
22
  [project.optional-dependencies]
23
  dev = [
24
  "pytest>=8.0.0",
 
17
  "numpy>=1.26.0",
18
  "python-multipart>=0.0.12",
19
  "dotenv>=1.0.0",
20
+ "openenv-core>=0.2.0",
21
  ]
22
 
23
+ [project.scripts]
+ # NOTE(review): [project.scripts] values must be "module:callable" entry points
+ # (PEP 621 / packaging spec); embedding CLI flags here produces a broken
+ # console script on install. Run the server with
+ #   uvicorn app:app --host 0.0.0.0 --port 7860
+ # or add a main() wrapper function and point this entry at it.
+ server = "uvicorn:app --host 0.0.0.0 --port 7860"
25
+
26
  [project.optional-dependencies]
27
  dev = [
28
  "pytest>=8.0.0",
server/.env.example ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ GEMINI_API_KEY=your_gemini_api_key_here
2
+ MODEL_NAME=gemini-2.5-flash
3
+ ENV_BASE_URL=https://angelgupta-mlops-openenv.hf.space
4
+ API_BASE_URL=https://generativelanguage.googleapis.com/v1beta/openai/
5
+ HF_TOKEN=your_hf_token_here
server/app.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import json
3
+ from typing import Any, Dict, Optional
4
+ from fastapi import FastAPI, HTTPException, WebSocket, WebSocketDisconnect, Request
5
+ from openenv_state import OPENENV_STATE, OpenEnvState
6
+ from fastapi.middleware.cors import CORSMiddleware
7
+ from pydantic import BaseModel
8
+
9
+ from models import MLOpsAction, MLOpsObservation, MLOpsState
10
+ from mlops_environment import MLOpsEnvironment
11
+
12
app = FastAPI(
    title="MLOps Pipeline Debugger",
    description="OpenEnv environment: AI agent diagnoses broken ML training runs.",
    version="1.0.0",
)

# Wide-open CORS: the environment is meant to be driven by browser demos and
# remote agents alike.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Single environment instance shared by the plain-HTTP endpoints;
# (re)created on every POST /reset.
_http_env: Optional[MLOpsEnvironment] = None
20
+
21
+
22
class ResetRequest(BaseModel):
    """Payload accepted by POST /reset.

    Both ``task_id`` (canonical) and ``task`` (legacy) are accepted so that
    older clients keep working.
    """

    task_id: Optional[str] = "easy"
    seed: Optional[int] = None
    task: Optional[str] = None
26
+
27
+
28
class StepResponse(BaseModel):
    """Envelope returned by POST /step: one (observation, reward, done, info) tuple."""

    observation: MLOpsObservation
    reward: float
    done: bool
    info: Dict[str, Any]
33
+
34
+
35
@app.post("/reset", response_model=MLOpsObservation)
async def reset(request: Request):
    """Start a fresh episode, replacing any previously created HTTP environment."""
    try:
        body = await request.json()
    except Exception:
        # Empty or non-JSON bodies simply fall back to defaults.
        body = {}

    # Accept "task_id" (canonical) or "task" (legacy); default to the easy task.
    selected_task = body.get("task_id") or body.get("task") or "easy"
    episode_seed = body.get("seed")

    global _http_env
    _http_env = MLOpsEnvironment(task_id=selected_task)
    return _http_env.reset(seed=episode_seed)
46
+
47
+
48
@app.get("/")
async def root():
    """Describe the service and enumerate its endpoints."""
    endpoints = {
        "GET /": "This message",
        "GET /health": "Health check",
        "GET /tasks": "List available tasks",
        "GET /openenv/state": "OpenEnv state",
        "POST /reset": "Start a new episode",
        "POST /step": "Take an action",
        "GET /state": "Get current state",
    }
    return {
        "message": "MLOps Pipeline Debugger API",
        "version": "1.0.0",
        "docs": "This is an OpenEnv-compatible RL environment",
        "endpoints": endpoints,
    }
64
+
65
+
66
@app.get("/health")
async def health():
    """Liveness probe used by the hosting platform."""
    return {
        "status": "ok",
        "environment": "mlops_debug_env",
        "version": "1.0.0",
    }
69
+
70
+
71
@app.get("/openenv/state", response_model=OpenEnvState)
def openenv_state():
    """Expose the static OpenEnv capability descriptor."""
    return OPENENV_STATE
74
+
75
+
76
@app.get("/tasks")
async def list_tasks():
    """Enumerate the built-in task presets and their step budgets."""
    # (task_id, display name, max_steps); difficulty equals the task_id.
    catalogue = [
        ("easy", "Config Error Diagnosis", 20),
        ("medium", "Data Leakage Detection", 30),
        ("hard", "Silent Evaluation Bug", 40),
    ]
    return {
        "tasks": [
            {
                "task_id": task_id,
                "name": name,
                "difficulty": task_id,
                "max_steps": max_steps,
            }
            for task_id, name, max_steps in catalogue
        ]
    }
100
+
101
+
102
@app.post("/step", response_model=StepResponse)
async def step(request: Request):
    """Apply one action to the current episode and return the transition."""
    if _http_env is None:
        raise HTTPException(400, "Call /reset first.")

    # Read the raw body; tolerate empty / non-JSON payloads.
    try:
        body = await request.json()
    except Exception:
        body = {}

    # The action may arrive flattened, nested under "action", or as a bare
    # "message" string (chat-style clients).
    parsed: Optional[MLOpsAction] = None
    try:
        if "action_type" in body:
            parsed = MLOpsAction(**body)
        elif "action" in body:
            parsed = MLOpsAction(**body["action"])
        elif "message" in body:
            parsed = MLOpsAction(action_type=body["message"])
    except Exception as exc:
        raise HTTPException(422, f"Invalid action: {str(exc)}")

    if parsed is None or parsed.action_type is None:
        raise HTTPException(422, "Field required: action_type")

    try:
        obs, reward, done, info = _http_env.step(parsed)
        return StepResponse(observation=obs, reward=reward, done=done, info=info)
    except Exception as exc:
        raise HTTPException(500, f"Step error: {str(exc)}")
133
+
134
+
135
@app.get("/state", response_model=MLOpsState)
async def state():
    """Return the full internal state of the active HTTP episode."""
    if _http_env is None:
        raise HTTPException(400, "Call /reset first.")
    return _http_env.state
140
+
141
+
142
@app.websocket("/ws")
async def ws_endpoint(websocket: WebSocket):
    """JSON-over-WebSocket interface supporting reset / step / state methods.

    Each connection owns its own environment instance, independent of the
    shared HTTP `_http_env`. Messages are JSON objects with a "method" key;
    replies echo the method name alongside the result payload.
    """
    await websocket.accept()
    env: Optional[MLOpsEnvironment] = None
    try:
        while True:
            msg = json.loads(await websocket.receive_text())
            method = msg.get("method")
            if method == "reset":
                env = MLOpsEnvironment(task_id=msg.get("task_id", "easy"))
                obs = env.reset(seed=msg.get("seed"))
                await websocket.send_text(
                    json.dumps({"method": "reset", "observation": obs.model_dump()})
                )
            elif method == "step":
                if env is None:
                    await websocket.send_text(json.dumps({"error": "Call reset first"}))
                    continue
                # Robustness fix: a malformed action (or an environment error)
                # previously raised out of this handler and silently killed the
                # connection. Report the failure and keep the session alive.
                try:
                    action = MLOpsAction(**msg.get("action", {}))
                    obs, reward, done, info = env.step(action)
                except Exception as exc:
                    await websocket.send_text(
                        json.dumps({"error": f"Invalid step: {exc}"})
                    )
                    continue
                await websocket.send_text(
                    json.dumps(
                        {
                            "method": "step",
                            "observation": obs.model_dump(),
                            "reward": reward,
                            "done": done,
                            "info": info,
                        }
                    )
                )
            elif method == "state":
                if env is None:
                    await websocket.send_text(json.dumps({"error": "Call reset first"}))
                    continue
                await websocket.send_text(
                    json.dumps({"method": "state", "state": env.state.model_dump()})
                )
            else:
                # Robustness fix: unknown methods used to be dropped silently,
                # leaving the client waiting forever for a reply.
                await websocket.send_text(
                    json.dumps({"error": f"Unknown method: {method}"})
                )
    except WebSocketDisconnect:
        pass
server/artifact_generator.py ADDED
@@ -0,0 +1,960 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Artifact Generator for MLOps Pipeline Debugger
3
+
4
+ Generates a full set of realistic ML training artifacts for a given bug scenario.
5
+ Each artifact is internally consistent — config matches logs, dataset stats match
6
+ preprocessing code — except for the one planted fault.
7
+
8
+ Bug types supported:
9
+ Task 1 (easy):
10
+ - exploding_lr : learning_rate too large → loss diverges to NaN
11
+ - wrong_optimizer : SGD with momentum=0.99 on non-convex problem
12
+ - batch_size_overflow: batch_size > dataset size → trivial overfitting signal
13
+
14
+ Task 2 (medium):
15
+ - data_leakage_scaler : StandardScaler fit on full dataset before split
16
+ - data_leakage_overlap : train/val split with random_state=None → overlap
17
+ - wrong_split_ratio : test data accidentally included in training
18
+
19
+ Task 3 (hard):
20
+ - label_encoder_mismatch : train/eval use different LabelEncoder.fit() orderings
21
+ - silent_metric_swap : val and test metric names swapped in eval code
22
+ - tokenizer_version_drift: training uses tokenizer v1, eval uses v2 (different vocab)
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import json
28
+ import random
29
+ import textwrap
30
+ from dataclasses import dataclass, field
31
+ from typing import Dict, Tuple
32
+
33
+ import numpy as np
34
+
35
+
36
+ # ─── Bug Specifications ───────────────────────────────────────────────────────
37
+
38
@dataclass
class BugSpec:
    """Ground-truth description of one planted fault.

    Mirrors what a diagnosing agent must produce: the failure category, the
    artifact file containing the root cause, the exact field within that file,
    and the canonical ("gold") fix text.
    """

    bug_type: str
    category: str          # maps to failure_category in Action
    file: str              # root_cause_file
    field: str             # root_cause_field
    gold_fix: str
    task_difficulty: str   # easy / medium / hard
46
+
47
+
48
# Ground-truth catalogue keyed by bug_type, grouped easy → medium → hard.
# BugSpec positional order: bug_type, category, file, field, gold_fix,
# task_difficulty.
BUG_CATALOGUE: Dict[str, BugSpec] = {
    spec.bug_type: spec
    for spec in (
        # ── EASY ─────────────────────────────────────────────────────────
        BugSpec(
            "exploding_lr",
            "config_error",
            "config.yaml",
            "optimizer.learning_rate",
            "Reduce learning_rate from 50.0 to 1e-4 (or use a scheduler with warmup)",
            "easy",
        ),
        BugSpec(
            "wrong_optimizer",
            "config_error",
            "config.yaml",
            "optimizer.momentum",
            "Reduce momentum from 0.99 to 0.9, or switch to AdamW optimizer",
            "easy",
        ),
        BugSpec(
            "batch_size_overflow",
            "config_error",
            "config.yaml",
            "training.batch_size",
            "Reduce batch_size from 4096 to 32 or 64; current size exceeds training set",
            "easy",
        ),
        # ── MEDIUM ───────────────────────────────────────────────────────
        BugSpec(
            "data_leakage_scaler",
            "data_leakage",
            "preprocessing.py",
            "StandardScaler.fit_transform",
            "Fit StandardScaler only on X_train, then call transform() on X_val and X_test separately",
            "medium",
        ),
        BugSpec(
            "data_leakage_overlap",
            "data_leakage",
            "preprocessing.py",
            "train_test_split.random_state",
            "Set random_state=42 in train_test_split to ensure deterministic, non-overlapping splits",
            "medium",
        ),
        BugSpec(
            "wrong_split_ratio",
            "preprocessing_bug",
            "preprocessing.py",
            "train_test_split.test_size",
            "Change test_size from 0.8 to 0.2 — current config trains on 20% and evaluates on 80%",
            "medium",
        ),
        # ── HARD ─────────────────────────────────────────────────────────
        BugSpec(
            "label_encoder_mismatch",
            "label_mismatch",
            "preprocessing.py",
            "LabelEncoder.fit_order",
            "Use the same LabelEncoder instance (fitted on training data) for both train and eval pipelines",
            "hard",
        ),
        BugSpec(
            "silent_metric_swap",
            "evaluation_bug",
            "eval_results.json",
            "metrics.val_accuracy",
            "Swap val_accuracy and test_accuracy assignments in the evaluation loop — metrics are mislabeled",
            "hard",
        ),
        BugSpec(
            "tokenizer_version_drift",
            "evaluation_bug",
            "preprocessing.py",
            "tokenizer.version",
            "Ensure training and evaluation both use tokenizer v2 — v1 has a different vocabulary mapping for 847 tokens",
            "hard",
        ),
    )
}
127
+
128
# Which bugs each difficulty preset may sample from; every entry here must
# exist as a key in BUG_CATALOGUE.
TASK_BUG_POOLS = {
    "easy": [
        "exploding_lr",
        "wrong_optimizer",
        "batch_size_overflow",
    ],
    "medium": [
        "data_leakage_scaler",
        "data_leakage_overlap",
        "wrong_split_ratio",
    ],
    "hard": [
        "label_encoder_mismatch",
        "silent_metric_swap",
        "tokenizer_version_drift",
    ],
}
133
+
134
+
135
+ # ─── Model / Dataset Configs (variety pool) ───────────────────────────────────
136
+
137
# Pool of plausible model/dataset pairings; one is chosen deterministically
# per generated run to add surface variety to the artifacts.
MODEL_CONFIGS = [
    {
        "name": "ResNet-50",
        "type": "image_classification",
        "params": "25.6M",
        "dataset": "ImageNet-subset-10k",
        "num_classes": 10,
        "input": "224x224 RGB",
    },
    {
        "name": "BERT-base-uncased",
        "type": "text_classification",
        "params": "110M",
        "dataset": "SST-2",
        "num_classes": 2,
        "input": "tokenized sequences, max_len=128",
    },
    {
        "name": "EfficientNet-B3",
        "type": "image_classification",
        "params": "12.2M",
        "dataset": "CIFAR-100",
        "num_classes": 100,
        "input": "300x300 RGB",
    },
    {
        "name": "DistilBERT",
        "type": "sentiment_analysis",
        "params": "66M",
        "dataset": "IMDB-reviews",
        "num_classes": 3,
        "input": "tokenized sequences, max_len=256",
    },
    {
        "name": "MobileNetV3-Large",
        "type": "image_classification",
        "params": "5.4M",
        "dataset": "Oxford-102-Flowers",
        "num_classes": 102,
        "input": "224x224 RGB",
    },
]
149
+
150
# Candidate optimizer / LR-scheduler names used to randomize healthy configs.
OPTIMIZERS = [
    "AdamW",
    "SGD",
    "RMSprop",
    "Adam",
]
SCHEDULERS = [
    "cosine_annealing",
    "step_lr",
    "reduce_on_plateau",
    "linear_warmup",
]
152
+
153
+
154
+ # ─── Artifact Generators ──────────────────────────────────────────────────────
155
+
156
+ class ArtifactGenerator:
157
+ """
158
+ Generates all 6 training artifacts for a given (bug_type, seed) pair.
159
+ All artifacts are internally consistent except for the planted fault.
160
+ """
161
+
162
+ def __init__(self, bug_type: str, seed: int):
163
+ self.bug = BUG_CATALOGUE[bug_type]
164
+ self.seed = seed
165
+ self.rng = random.Random(seed)
166
+ self.np_rng = np.random.RandomState(seed)
167
+
168
+ # Pick a model config deterministically
169
+ self.model_cfg = self.rng.choice(MODEL_CONFIGS)
170
+ self.optimizer_name = self.rng.choice(OPTIMIZERS)
171
+ self.scheduler_name = self.rng.choice(SCHEDULERS)
172
+ self.run_id = f"run_{seed:04d}_{bug_type[:6]}"
173
+
174
+ # Normal hyperparams
175
+ self.lr = self.rng.choice([1e-5, 3e-5, 1e-4, 3e-4])
176
+ self.batch_size = self.rng.choice([16, 32, 64])
177
+ self.epochs = self.rng.randint(8, 20)
178
+ self.weight_decay = self.rng.choice([0.01, 0.001, 1e-4])
179
+ self.momentum = 0.9
180
+ self.train_samples = self.rng.randint(8000, 15000)
181
+ self.val_samples = int(self.train_samples * 0.2)
182
+ self.test_samples = int(self.train_samples * 0.15)
183
+
184
+ def generate_all(self) -> Dict[str, str]:
185
+ return {
186
+ "config.yaml": self._gen_config(),
187
+ "train.log": self._gen_train_log(),
188
+ "dataset_stats.json": self._gen_dataset_stats(),
189
+ "preprocessing.py": self._gen_preprocessing(),
190
+ "eval_results.json": self._gen_eval_results(),
191
+ "model_card.json": self._gen_model_card(),
192
+ }
193
+
194
+ # ── config.yaml ──────────────────────────────────────────────────────────
195
+
196
+ def _gen_config(self) -> str:
197
+ lr = self.lr
198
+ batch_size = self.batch_size
199
+ momentum = self.momentum
200
+
201
+ if self.bug.bug_type == "exploding_lr":
202
+ lr = self.rng.choice([50.0, 10.0, 25.0])
203
+ elif self.bug.bug_type == "wrong_optimizer":
204
+ momentum = 0.99
205
+ self.optimizer_name = "SGD"
206
+ elif self.bug.bug_type == "batch_size_overflow":
207
+ batch_size = self.rng.choice([2048, 4096, 8192])
208
+
209
+ return textwrap.dedent(f"""\
210
+ # Training Configuration
211
+ # Run ID: {self.run_id}
212
+ # Generated: 2024-03-{self.rng.randint(1,28):02d}T{self.rng.randint(0,23):02d}:{self.rng.randint(0,59):02d}:00Z
213
+
214
+ model:
215
+ architecture: {self.model_cfg['name']}
216
+ num_classes: {self.model_cfg['num_classes']}
217
+ pretrained: true
218
+ pretrained_source: "timm/torchvision"
219
+ dropout: {self.rng.choice([0.1, 0.2, 0.3])}
220
+ freeze_backbone_epochs: {self.rng.randint(0, 3)}
221
+
222
+ training:
223
+ epochs: {self.epochs}
224
+ batch_size: {batch_size}
225
+ num_workers: {self.rng.choice([4, 8])}
226
+ pin_memory: true
227
+ mixed_precision: {str(self.rng.choice([True, False])).lower()}
228
+ gradient_clip_norm: {self.rng.choice([1.0, 5.0, "null"])}
229
+ early_stopping_patience: {self.rng.randint(3, 7)}
230
+ seed: {self.seed}
231
+
232
+ optimizer:
233
+ name: {self.optimizer_name}
234
+ learning_rate: {lr}
235
+ weight_decay: {self.weight_decay}
236
+ momentum: {momentum}
237
+ betas: [0.9, 0.999]
238
+
239
+ scheduler:
240
+ name: {self.scheduler_name}
241
+ warmup_epochs: {self.rng.randint(0, 3)}
242
+ min_lr: 1.0e-7
243
+ t_max: {self.epochs}
244
+
245
+ data:
246
+ dataset: {self.model_cfg['dataset']}
247
+ input_size: "{self.model_cfg['input']}"
248
+ train_split: 0.8
249
+ val_split: 0.1
250
+ test_split: 0.1
251
+ augmentation:
252
+ random_crop: true
253
+ horizontal_flip: {str(self.rng.choice([True, False])).lower()}
254
+ color_jitter: {self.rng.choice([0.2, 0.4])}
255
+ normalize_mean: [0.485, 0.456, 0.406]
256
+ normalize_std: [0.229, 0.224, 0.225]
257
+
258
+ logging:
259
+ log_interval: 50
260
+ save_best_only: true
261
+ checkpoint_dir: "./checkpoints/{self.run_id}"
262
+ wandb_project: "mlops-debug-bench"
263
+ """)
264
+
265
+ # ── train.log ────────────────────────────────────────────────────────────
266
+
267
+ def _gen_train_log(self) -> str:
268
+ lines = []
269
+ lines.append(f"[INFO 2024-03-{self.rng.randint(1,28):02d} {self.rng.randint(6,10):02d}:00:00] Starting training run: {self.run_id}")
270
+ lines.append(f"[INFO ] Model: {self.model_cfg['name']} | Params: {self.model_cfg['params']}")
271
+ lines.append(f"[INFO ] Dataset: {self.model_cfg['dataset']} | Train: {self.train_samples:,} | Val: {self.val_samples:,}")
272
+ lines.append(f"[INFO ] Device: cuda:0 | Mixed precision: fp16")
273
+ lines.append(f"[INFO ] Optimizer: {self.optimizer_name} | LR: {self.lr} | Batch: {self.batch_size}")
274
+ lines.append("[INFO ] ─" * 30)
275
+
276
+ bug = self.bug.bug_type
277
+
278
+ if bug == "exploding_lr":
279
+ # Loss explodes rapidly
280
+ loss = 2.302
281
+ for ep in range(1, min(self.epochs + 1, 6)):
282
+ acc = max(0.0, 0.12 - ep * 0.02)
283
+ val_loss = loss * self.rng.uniform(1.1, 1.5)
284
+ val_acc = max(0.0, acc - 0.05)
285
+ lines.append(f"[EPOCH {ep:03d}] train_loss={loss:.4f} train_acc={acc:.4f} "
286
+ f"val_loss={val_loss:.4f} val_acc={val_acc:.4f} "
287
+ f"lr={self.lr:.2e} grad_norm={loss * 18.3:.2f} "
288
+ f"time={self.rng.randint(45,90)}s")
289
+ if ep == 1: lines.append(f"[WARN ] Gradient norm unusually high: {loss * 18.3:.2f} (threshold: 10.0)")
290
+ loss = loss * self.rng.uniform(4.5, 9.0)
291
+ if loss > 1e6:
292
+ lines.append(f"[EPOCH {ep+1:03d}] train_loss=nan train_acc=0.1000 val_loss=nan val_acc=0.1000 "
293
+ f"lr={self.lr:.2e} grad_norm=nan time={self.rng.randint(45,90)}s")
294
+ lines.append(f"[ERROR ] Loss is NaN at epoch {ep+1}, step {self.rng.randint(100,300)}. Training halted.")
295
+ lines.append(f"[ERROR ] Last finite loss: {loss / self.rng.uniform(4,9):.2f}. Gradient explosion detected.")
296
+ break
297
+
298
+ elif bug == "wrong_optimizer":
299
+ # Loss oscillates wildly, never converges
300
+ loss = 2.302
301
+ for ep in range(1, self.epochs + 1):
302
+ delta = self.rng.uniform(-0.8, 1.2)
303
+ loss = max(1.8, loss + delta)
304
+ acc = self.rng.uniform(0.10, 0.25)
305
+ val_loss = loss + self.rng.uniform(-0.3, 0.8)
306
+ val_acc = self.rng.uniform(0.09, 0.22)
307
+ lines.append(f"[EPOCH {ep:03d}] train_loss={loss:.4f} train_acc={acc:.4f} "
308
+ f"val_loss={val_loss:.4f} val_acc={val_acc:.4f} "
309
+ f"lr={self.lr:.2e} grad_norm={self.rng.uniform(8.2, 45.1):.2f} "
310
+ f"time={self.rng.randint(45,90)}s")
311
+ if ep % 3 == 0:
312
+ lines.append(f"[WARN ] Loss oscillation detected over last 3 epochs: {loss+0.4:.3f} → {loss-0.5:.3f} → {loss:.3f}")
313
+
314
+ elif bug == "batch_size_overflow":
315
+ # Val accuracy hits 100% immediately — model memorizes tiny effective dataset
316
+ for ep in range(1, self.epochs + 1):
317
+ train_loss = max(0.001, 2.302 * (0.05 ** ep))
318
+ train_acc = min(1.0, 0.3 + ep * 0.09)
319
+ val_acc = 0.999 if ep >= 2 else 0.85
320
+ val_loss = 0.001 if ep >= 2 else 0.45
321
+ lines.append(f"[EPOCH {ep:03d}] train_loss={train_loss:.4f} train_acc={train_acc:.4f} "
322
+ f"val_loss={val_loss:.4f} val_acc={val_acc:.4f} "
323
+ f"lr={self.lr:.2e} grad_norm={self.rng.uniform(0.1,0.9):.3f} "
324
+ f"time={self.rng.randint(3,8)}s")
325
+ lines.append(f"[WARN ] Effective steps per epoch: {max(1, self.train_samples // 4096)}. Dataset may be smaller than batch size.")
326
+
327
+ elif bug in ("data_leakage_scaler", "data_leakage_overlap", "wrong_split_ratio"):
328
+ # Val accuracy suspiciously high from epoch 1
329
+ for ep in range(1, self.epochs + 1):
330
+ train_loss = max(0.01, 0.45 - ep * 0.02)
331
+ train_acc = min(0.98, 0.72 + ep * 0.015)
332
+ val_acc = min(0.999, 0.984 + self.rng.uniform(-0.002, 0.002)) if ep >= 1 else 0.71
333
+ val_loss = max(0.001, 0.04 - ep * 0.001)
334
+ lines.append(f"[EPOCH {ep:03d}] train_loss={train_loss:.4f} train_acc={train_acc:.4f} "
335
+ f"val_loss={val_loss:.4f} val_acc={val_acc:.4f} "
336
+ f"lr={self.lr:.2e} grad_norm={self.rng.uniform(0.1,1.2):.3f} "
337
+ f"time={self.rng.randint(45,90)}s")
338
+ lines.append(f"[INFO ] Best model saved at epoch 2: val_acc=0.9841")
339
+ lines.append(f"[WARN ] Val accuracy reached 98.4% at epoch 1 — verify no data leakage.")
340
+
341
+ elif bug in ("label_encoder_mismatch", "silent_metric_swap", "tokenizer_version_drift"):
342
+ # Training looks completely normal — the bug is silent
343
+ best_val = 0.0
344
+ for ep in range(1, self.epochs + 1):
345
+ train_loss = max(0.08, 1.8 * (0.72 ** ep) + self.rng.uniform(-0.02, 0.02))
346
+ train_acc = min(0.96, 0.42 + ep * 0.032 + self.rng.uniform(-0.01, 0.01))
347
+ val_loss = train_loss * self.rng.uniform(1.05, 1.15)
348
+ val_acc = train_acc - self.rng.uniform(0.02, 0.06)
349
+ best_val = max(best_val, val_acc)
350
+ lines.append(f"[EPOCH {ep:03d}] train_loss={train_loss:.4f} train_acc={train_acc:.4f} "
351
+ f"val_loss={val_loss:.4f} val_acc={val_acc:.4f} "
352
+ f"lr={self.lr:.2e} grad_norm={self.rng.uniform(0.3, 2.1):.3f} "
353
+ f"time={self.rng.randint(60,120)}s")
354
+ lines.append(f"[INFO ] Training complete. Best val_acc={best_val:.4f} at epoch {self.rng.randint(self.epochs-3, self.epochs)}")
355
+ lines.append(f"[INFO ] Checkpoint saved: ./checkpoints/{self.run_id}/best_model.pt")
356
+
357
+ lines.append("[INFO ] ─" * 30)
358
+ lines.append(f"[INFO ] Run {self.run_id} finished.")
359
+ return "\n".join(lines)
360
+
361
+ # ── dataset_stats.json ───────────────────────────────────────────────────
362
+
363
+ def _gen_dataset_stats(self) -> str:
364
+ n_classes = self.model_cfg["num_classes"]
365
+ train_n = self.train_samples
366
+ val_n = self.val_samples
367
+ test_n = self.test_samples
368
+
369
+ overlap_count = 0
370
+ if self.bug.bug_type == "data_leakage_overlap":
371
+ overlap_count = self.rng.randint(int(val_n * 0.15), int(val_n * 0.30))
372
+ elif self.bug.bug_type == "wrong_split_ratio":
373
+ # Train and test flipped
374
+ train_n, test_n = test_n, train_n
375
+
376
+ # Class distribution (roughly uniform with jitter)
377
+ def class_dist(total, n_cls):
378
+ base = total // n_cls
379
+ counts = {str(i): base + self.rng.randint(-int(base*0.15), int(base*0.15))
380
+ for i in range(min(n_cls, 10))}
381
+ if n_cls > 10:
382
+ counts["..."] = f"{n_cls - 10} more classes"
383
+ return counts
384
+
385
+ stats = {
386
+ "dataset": self.model_cfg["dataset"],
387
+ "num_classes": n_classes,
388
+ "splits": {
389
+ "train": {
390
+ "n_samples": train_n,
391
+ "class_distribution": class_dist(train_n, n_classes),
392
+ },
393
+ "val": {
394
+ "n_samples": val_n,
395
+ "class_distribution": class_dist(val_n, n_classes),
396
+ "overlap_with_train": overlap_count,
397
+ },
398
+ "test": {
399
+ "n_samples": test_n,
400
+ "class_distribution": class_dist(test_n, n_classes),
401
+ },
402
+ },
403
+ "feature_statistics": {
404
+ "mean": round(self.np_rng.uniform(0.45, 0.55), 4),
405
+ "std": round(self.np_rng.uniform(0.22, 0.28), 4),
406
+ "min": 0.0,
407
+ "max": 1.0,
408
+ "null_count": 0,
409
+ },
410
+ "preprocessing_applied": [
411
+ "resize",
412
+ "normalize",
413
+ "label_encode",
414
+ "train_val_test_split",
415
+ ],
416
+ "random_seed_used": self.seed if self.bug.bug_type != "data_leakage_overlap" else None,
417
+ }
418
+ return json.dumps(stats, indent=2)
419
+
420
+ # ── preprocessing.py ─────────────────────────────────────────────────────
421
+
422
+ def _gen_preprocessing(self) -> str:
423
+ bug = self.bug.bug_type
424
+
425
+ if bug == "data_leakage_scaler":
426
+ return textwrap.dedent(f"""\
427
+ \"\"\"
428
+ Data preprocessing pipeline for {self.model_cfg['dataset']}
429
+ Run ID: {self.run_id}
430
+ \"\"\"
431
+ import numpy as np
432
+ import pandas as pd
433
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
434
+ from sklearn.model_selection import train_test_split
435
+
436
+
437
+ def load_raw_data(data_dir: str):
438
+ \"\"\"Load features and labels from disk.\"\"\"
439
+ X = np.load(f"{{data_dir}}/features.npy")
440
+ y = np.load(f"{{data_dir}}/labels.npy")
441
+ return X, y
442
+
443
+
444
+ def preprocess(data_dir: str, seed: int = {self.seed}):
445
+ X, y = load_raw_data(data_dir)
446
+
447
+ # Encode labels
448
+ le = LabelEncoder()
449
+ y_encoded = le.fit_transform(y)
450
+
451
+ # ── BUG: Scaler fit on full dataset BEFORE split ──────────
452
+ scaler = StandardScaler()
453
+ X_normalized = scaler.fit_transform(X) # sees val/test data during fit!
454
+ # ─────────────────────────────────────────────────────────
455
+
456
+ X_train, X_temp, y_train, y_temp = train_test_split(
457
+ X_normalized, y_encoded, test_size=0.2, random_state=seed
458
+ )
459
+ X_val, X_test, y_val, y_test = train_test_split(
460
+ X_temp, y_temp, test_size=0.5, random_state=seed
461
+ )
462
+
463
+ return (X_train, y_train), (X_val, y_val), (X_test, y_test), scaler, le
464
+
465
+
466
+ def get_transforms(split: str):
467
+ \"\"\"Get augmentation transforms for a given split.\"\"\"
468
+ if split == "train":
469
+ return [
470
+ ("random_horizontal_flip", {{"p": 0.5}}),
471
+ ("random_crop", {{"size": 224, "padding": 4}}),
472
+ ("color_jitter", {{"brightness": 0.2, "contrast": 0.2}}),
473
+ ("normalize", {{"mean": [0.485, 0.456, 0.406],
474
+ "std": [0.229, 0.224, 0.225]}}),
475
+ ]
476
+ return [
477
+ ("center_crop", {{"size": 224}}),
478
+ ("normalize", {{"mean": [0.485, 0.456, 0.406],
479
+ "std": [0.229, 0.224, 0.225]}}),
480
+ ]
481
+ """)
482
+
483
+ elif bug == "data_leakage_overlap":
484
+ return textwrap.dedent(f"""\
485
+ \"\"\"
486
+ Data preprocessing pipeline for {self.model_cfg['dataset']}
487
+ Run ID: {self.run_id}
488
+ \"\"\"
489
+ import numpy as np
490
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
491
+ from sklearn.model_selection import train_test_split
492
+
493
+
494
+ def load_raw_data(data_dir: str):
495
+ X = np.load(f"{{data_dir}}/features.npy")
496
+ y = np.load(f"{{data_dir}}/labels.npy")
497
+ return X, y
498
+
499
+
500
+ def preprocess(data_dir: str):
501
+ X, y = load_raw_data(data_dir)
502
+
503
+ le = LabelEncoder()
504
+ y_encoded = le.fit_transform(y)
505
+
506
+ # First split: train vs temp
507
+ # ── BUG: random_state=None → non-reproducible, overlapping splits ──
508
+ X_train, X_temp, y_train, y_temp = train_test_split(
509
+ X, y_encoded, test_size=0.2, random_state=None # ← should be fixed seed
510
+ )
511
+ # Second split: val vs test (ALSO non-deterministic)
512
+ X_val, X_test, y_val, y_test = train_test_split(
513
+ X_temp, y_temp, test_size=0.5, random_state=None # ← should be fixed seed
514
+ )
515
+ # ─────────────────────────────────────────────────────────
516
+
517
+ scaler = StandardScaler()
518
+ X_train = scaler.fit_transform(X_train)
519
+ X_val = scaler.transform(X_val)
520
+ X_test = scaler.transform(X_test)
521
+
522
+ return (X_train, y_train), (X_val, y_val), (X_test, y_test), scaler, le
523
+ """)
524
+
525
+ elif bug == "wrong_split_ratio":
526
+ return textwrap.dedent(f"""\
527
+ \"\"\"
528
+ Data preprocessing pipeline for {self.model_cfg['dataset']}
529
+ Run ID: {self.run_id}
530
+ \"\"\"
531
+ import numpy as np
532
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
533
+ from sklearn.model_selection import train_test_split
534
+
535
+
536
+ def preprocess(data_dir: str, seed: int = {self.seed}):
537
+ X = np.load(f"{{data_dir}}/features.npy")
538
+ y = np.load(f"{{data_dir}}/labels.npy")
539
+
540
+ le = LabelEncoder()
541
+ y_encoded = le.fit_transform(y)
542
+
543
+ # ── BUG: test_size=0.8 — trains on 20%, evaluates on 80% ──
544
+ X_train, X_test, y_train, y_test = train_test_split(
545
+ X, y_encoded, test_size=0.8, random_state=seed # ← should be 0.2
546
+ )
547
+ X_val, X_test, y_val, y_test = train_test_split(
548
+ X_test, y_test, test_size=0.5, random_state=seed
549
+ )
550
+ # ──────────────────────────────────────────────────────────
551
+
552
+ scaler = StandardScaler()
553
+ X_train = scaler.fit_transform(X_train)
554
+ X_val = scaler.transform(X_val)
555
+ X_test = scaler.transform(X_test)
556
+
557
+ return (X_train, y_train), (X_val, y_val), (X_test, y_test), scaler, le
558
+ """)
559
+
560
+ elif bug == "label_encoder_mismatch":
561
+ classes = ["cat", "dog", "bird"] if self.model_cfg["num_classes"] <= 10 else \
562
+ [f"class_{i}" for i in range(min(self.model_cfg["num_classes"], 5))]
563
+ classes_shuffled = classes.copy()
564
+ self.rng.shuffle(classes_shuffled)
565
+ return textwrap.dedent(f"""\
566
+ \"\"\"
567
+ Data preprocessing pipeline for {self.model_cfg['dataset']}
568
+ Run ID: {self.run_id}
569
+
570
+ WARNING: Training and evaluation pipelines are defined separately.
571
+ Ensure they use identical label encoding.
572
+ \"\"\"
573
+ import numpy as np
574
+ from sklearn.preprocessing import LabelEncoder
575
+ from sklearn.model_selection import train_test_split
576
+
577
+
578
+ # ── Training pipeline ─────────────────────────────────────────
579
+ def build_train_pipeline(data_dir: str, seed: int = {self.seed}):
580
+ X = np.load(f"{{data_dir}}/train_features.npy")
581
+ y_raw = np.load(f"{{data_dir}}/train_labels.npy", allow_pickle=True)
582
+
583
+ # LabelEncoder fitted on training class order
584
+ le_train = LabelEncoder()
585
+ le_train.fit({classes}) # alphabetical order: {sorted(classes)}
586
+ y = le_train.transform(y_raw)
587
+
588
+ X_train, X_val, y_train, y_val = train_test_split(
589
+ X, y, test_size=0.2, random_state=seed
590
+ )
591
+ return (X_train, y_train), (X_val, y_val), le_train
592
+
593
+
594
+ # ── Evaluation pipeline ───────────────────────────────────────
595
+ def build_eval_pipeline(data_dir: str):
596
+ X_test = np.load(f"{{data_dir}}/test_features.npy")
597
+ y_raw = np.load(f"{{data_dir}}/test_labels.npy", allow_pickle=True)
598
+
599
+ # ── BUG: Different LabelEncoder instance with DIFFERENT fit order ──
600
+ le_eval = LabelEncoder()
601
+ le_eval.fit({classes_shuffled}) # ← shuffled order: {classes_shuffled}
602
+ y_test = le_eval.transform(y_raw)
603
+ # ─────────────────────────────────────────────────────────
604
+
605
+ return X_test, y_test, le_eval
606
+ """)
607
+
608
+ elif bug == "silent_metric_swap":
609
+ val_acc = round(self.rng.uniform(0.84, 0.91), 4)
610
+ test_acc = round(self.rng.uniform(0.31, 0.39), 4)
611
+ return textwrap.dedent(f"""\
612
+ \"\"\"
613
+ Evaluation script for {self.model_cfg['dataset']}
614
+ Run ID: {self.run_id}
615
+ \"\"\"
616
+ import torch
617
+ import json
618
+
619
+
620
+ def evaluate(model, val_loader, test_loader, device="cuda"):
621
+ model.eval()
622
+ results = {{}}
623
+
624
+ with torch.no_grad():
625
+ # Evaluate on validation set
626
+ val_correct, val_total = 0, 0
627
+ for X, y in val_loader:
628
+ preds = model(X.to(device)).argmax(dim=1)
629
+ val_correct += (preds == y.to(device)).sum().item()
630
+ val_total += y.size(0)
631
+ val_acc = val_correct / val_total
632
+
633
+ # Evaluate on test set
634
+ test_correct, test_total = 0, 0
635
+ for X, y in test_loader:
636
+ preds = model(X.to(device)).argmax(dim=1)
637
+ test_correct += (preds == y.to(device)).sum().item()
638
+ test_total += y.size(0)
639
+ test_acc = test_correct / test_total
640
+
641
+ # ── BUG: val and test accuracy assignments are swapped ──
642
+ results["val_accuracy"] = test_acc # ← should be val_acc
643
+ results["test_accuracy"] = val_acc # ← should be test_acc
644
+ # ──────────────────────────────────────────────────────
645
+
646
+ results["val_loss"] = round(1 - val_acc + 0.12, 4)
647
+ results["test_loss"] = round(1 - test_acc + 0.09, 4)
648
+ return results
649
+ """)
650
+
651
+ elif bug == "tokenizer_version_drift":
652
+ return textwrap.dedent(f"""\
653
+ \"\"\"
654
+ Text preprocessing pipeline for {self.model_cfg['dataset']}
655
+ Run ID: {self.run_id}
656
+ \"\"\"
657
+ from transformers import AutoTokenizer
658
+
659
+
660
+ TOKENIZER_V1 = "bert-base-uncased" # vocab size: 30,522
661
+ TOKENIZER_V2 = "bert-base-uncased-v2-fixed" # vocab size: 30,522 + 847 domain tokens
662
+
663
+
664
+ # ── Training pipeline ─────────────────────────────────────────
665
+ def get_train_tokenizer():
666
+ \"\"\"Tokenizer used during training.\"\"\"
667
+ # Updated to v2 for domain-specific vocabulary
668
+ tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_V2)
669
+ return tokenizer
670
+
671
+
672
+ # ── Evaluation pipeline ───────────────────────────────────────
673
+ def get_eval_tokenizer():
674
+ \"\"\"Tokenizer used during evaluation and inference.\"\"\"
675
+ # ── BUG: Still using v1 — 847 tokens map to [UNK] during eval ──
676
+ tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_V1) # ← should be TOKENIZER_V2
677
+ return tokenizer
678
+ # ─────────────────────────────────────────────────────────
679
+
680
+
681
+ def tokenize_batch(texts, tokenizer, max_length: int = 128):
682
+ return tokenizer(
683
+ texts,
684
+ padding="max_length",
685
+ truncation=True,
686
+ max_length=max_length,
687
+ return_tensors="pt",
688
+ )
689
+ """)
690
+
691
+ else:
692
+ # Default normal preprocessing (for config-error bugs, preprocessing is clean)
693
+ return textwrap.dedent(f"""\
694
+ \"\"\"
695
+ Data preprocessing pipeline for {self.model_cfg['dataset']}
696
+ Run ID: {self.run_id}
697
+ \"\"\"
698
+ import numpy as np
699
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
700
+ from sklearn.model_selection import train_test_split
701
+
702
+
703
+ def preprocess(data_dir: str, seed: int = {self.seed}):
704
+ X = np.load(f"{{data_dir}}/features.npy")
705
+ y = np.load(f"{{data_dir}}/labels.npy")
706
+
707
+ le = LabelEncoder()
708
+ y_encoded = le.fit_transform(y)
709
+
710
+ X_train, X_temp, y_train, y_temp = train_test_split(
711
+ X, y_encoded, test_size=0.2, random_state=seed
712
+ )
713
+ X_val, X_test, y_val, y_test = train_test_split(
714
+ X_temp, y_temp, test_size=0.5, random_state=seed
715
+ )
716
+
717
+ # Correct: fit only on training data
718
+ scaler = StandardScaler()
719
+ X_train = scaler.fit_transform(X_train)
720
+ X_val = scaler.transform(X_val)
721
+ X_test = scaler.transform(X_test)
722
+
723
+ return (X_train, y_train), (X_val, y_val), (X_test, y_test), scaler, le
724
+ """)
725
+
726
+ # ── eval_results.json ────────────────────────────────────────────────────
727
+
728
def _gen_eval_results(self) -> str:
    """Render the eval_results.json payload, with metrics shaped by the injected bug.

    Every branch draws from ``self.rng`` in a fixed order so the output is
    reproducible for a given seed.
    """
    bug = self.bug.bug_type

    if bug in ("exploding_lr", "wrong_optimizer"):
        # Near-random accuracy; exploding LR additionally blows up the loss.
        v_acc = round(self.rng.uniform(0.09, 0.13), 4)
        t_acc = round(self.rng.uniform(0.09, 0.13), 4)
        if bug == "exploding_lr":
            v_loss = 999999.9
        else:
            v_loss = round(self.rng.uniform(2.1, 2.4), 4)
        t_loss = v_loss
    elif bug == "batch_size_overflow":
        # Trivially perfect validation with a massive train/test gap.
        v_acc = 0.9990
        t_acc = round(self.rng.uniform(0.11, 0.15), 4)
        v_loss = 0.0003
        t_loss = round(self.rng.uniform(1.8, 2.3), 4)
    elif bug in ("data_leakage_scaler", "data_leakage_overlap", "wrong_split_ratio"):
        # Leakage inflates validation; the clean test split is much worse.
        v_acc = round(self.rng.uniform(0.982, 0.998), 4)
        t_acc = round(self.rng.uniform(0.61, 0.73), 4)
        v_loss = round(self.rng.uniform(0.004, 0.015), 4)
        t_loss = round(self.rng.uniform(0.42, 0.68), 4)
    elif bug == "label_encoder_mismatch":
        v_acc = round(self.rng.uniform(0.84, 0.91), 4)
        t_acc = round(self.rng.uniform(0.30, 0.38), 4)  # near-random for 3 classes
        v_loss = round(1 - v_acc + self.rng.uniform(0.05, 0.15), 4)
        t_loss = round(1 - t_acc + self.rng.uniform(0.05, 0.15), 4)
    elif bug == "silent_metric_swap":
        true_val = round(self.rng.uniform(0.84, 0.91), 4)
        true_test = round(self.rng.uniform(0.31, 0.39), 4)
        # The reported numbers are deliberately crossed over.
        v_acc, t_acc = true_test, true_val
        v_loss = round(1 - true_test + 0.09, 4)
        t_loss = round(1 - true_val + 0.12, 4)
    elif bug == "tokenizer_version_drift":
        v_acc = round(self.rng.uniform(0.83, 0.88), 4)
        t_acc = round(self.rng.uniform(0.28, 0.36), 4)
        v_loss = round(1 - v_acc + self.rng.uniform(0.05, 0.12), 4)
        t_loss = round(1 - t_acc + self.rng.uniform(0.05, 0.12), 4)
    else:
        # Healthy run: small, realistic val/test gap.
        v_acc = round(self.rng.uniform(0.78, 0.91), 4)
        t_acc = round(v_acc - self.rng.uniform(0.02, 0.05), 4)
        v_loss = round(1 - v_acc + 0.1, 4)
        t_loss = round(1 - t_acc + 0.1, 4)

    # Keep this draw BEFORE the timestamp draws to preserve rng sequence.
    final_epoch = self.rng.randint(2, 5) if bug == "exploding_lr" else self.epochs
    day = self.rng.randint(1, 28)
    hour = self.rng.randint(10, 22)
    minute = self.rng.randint(0, 59)

    payload = {
        "run_id": self.run_id,
        "final_epoch": final_epoch,
        "metrics": {
            "val_loss": v_loss,
            "val_accuracy": v_acc,
            "test_loss": t_loss,
            "test_accuracy": t_acc,
        },
        "best_checkpoint": f"./checkpoints/{self.run_id}/best_model.pt",
        "evaluation_timestamp": f"2024-03-{day:02d}T{hour:02d}:{minute:02d}:00Z",
        "hardware": {"gpu": "A100-40GB", "cuda": "12.1"},
    }
    return json.dumps(payload, indent=2)
783
+
784
+ # ── model_card.json ──────────────────────────────────────────────────────
785
+
786
def _gen_model_card(self) -> str:
    """Build the model_card.json payload for the current run."""
    bug = self.bug.bug_type
    # The tokenizer-drift bug ships a stale v1 tokenizer reference in the card.
    tokenizer_ver = "v1" if bug == "tokenizer_version_drift" else "v2"
    uses_bert = "bert" in self.model_cfg["name"].lower()

    card = {
        "model_id": f"{self.run_id}",
        "architecture": self.model_cfg["name"],
        "task": self.model_cfg["type"],
        "num_parameters": self.model_cfg["params"],
        "dataset": self.model_cfg["dataset"],
        "num_classes": self.model_cfg["num_classes"],
        "framework": "PyTorch 2.2.0",
        "training_config": {
            "optimizer": self.optimizer_name,
            "scheduler": self.scheduler_name,
            "epochs": self.epochs,
        },
        "preprocessing": {
            "label_encoder": "sklearn.LabelEncoder",
            # Tokenizer version only applies to BERT-family architectures.
            "tokenizer": tokenizer_ver if uses_bert else "N/A",
            "normalizer": "StandardScaler (fit on training split)",
        },
        "authors": ["ml-platform-team"],
        "license": "Apache-2.0",
    }
    return json.dumps(card, indent=2)
812
+
813
+
814
+ # ─── Sanity Check Engine ──────────────────────────────────────────────────────
815
+
816
def run_sanity_check(check_type: str, bug_type: str, artifacts: Dict[str, str],
                     rng: random.Random) -> Dict:
    """
    Runs a named diagnostic check and returns computed results.
    Results are grounded in the generated artifacts — not random.

    Args:
        check_type: Which diagnostic to run (e.g. "data_leakage").
        bug_type:   Key identifying the injected bug; must exist in BUG_CATALOGUE.
        artifacts:  Generated artifact texts by filename (not read by the checks
                    themselves; kept for interface stability).
        rng:        Seeded RNG so synthetic measurements are reproducible.

    Returns:
        A dict with at least "check", "result" and "details" keys; an unknown
        check_type yields result "UNKNOWN".

    Raises:
        KeyError: If bug_type is not a known BUG_CATALOGUE entry.
    """
    # Validate bug_type eagerly (raises KeyError for unknown bugs). The
    # looked-up entry is not needed below, so it is deliberately not bound
    # (previously it was assigned to an unused local).
    BUG_CATALOGUE[bug_type]

    if check_type == "label_consistency":
        if bug_type == "label_encoder_mismatch":
            return {
                "check": "label_consistency",
                "result": "FAIL",
                "details": "Training LabelEncoder class order: ['bird', 'cat', 'dog'] (index 0=bird, 1=cat, 2=dog). "
                           "Evaluation LabelEncoder class order: ['cat', 'dog', 'bird'] (index 0=cat, 1=dog, 2=bird). "
                           "Mismatch detected — 2 of 3 class indices differ between pipelines.",
                "affected_classes": 2,
                "recommendation": "Use a single LabelEncoder instance across both pipelines.",
            }
        return {"check": "label_consistency", "result": "PASS",
                "details": "Train and eval label mappings are identical. No mismatch detected."}

    elif check_type == "data_leakage":
        if bug_type in ("data_leakage_overlap", "data_leakage_scaler"):
            # Overlap counts only apply to the split-overlap variant.
            overlap = rng.randint(180, 450) if bug_type == "data_leakage_overlap" else 0
            scaler_leak = bug_type == "data_leakage_scaler"
            return {
                "check": "data_leakage",
                "result": "FAIL",
                "sample_overlap": overlap,
                "scaler_fitted_on_full_dataset": scaler_leak,
                "details": (
                    f"Found {overlap} samples present in both train and val splits. "
                    if overlap > 0 else ""
                ) + (
                    "StandardScaler.fit_transform() called on full dataset before split — "
                    "validation statistics contaminated by training distribution."
                    if scaler_leak else ""
                ),
            }
        return {"check": "data_leakage", "result": "PASS",
                "sample_overlap": 0, "scaler_fitted_on_full_dataset": False,
                "details": "No data leakage detected between train and val splits."}

    elif check_type == "gradient_norms":
        if bug_type == "exploding_lr":
            return {
                "check": "gradient_norms",
                "result": "ANOMALY",
                "epoch_1_norm": round(rng.uniform(840.0, 2100.0), 2),
                "expected_range": "0.1 – 10.0",
                "details": "Gradient norms exceeded safe threshold by 100–200×. "
                           "Indicates learning rate is too large — gradients are not being controlled.",
            }
        return {"check": "gradient_norms", "result": "NORMAL",
                "mean_norm": round(rng.uniform(0.3, 2.1), 3),
                "max_norm": round(rng.uniform(2.1, 4.5), 3),
                "details": "Gradient norms are within expected range throughout training."}

    elif check_type == "metric_gap_analysis":
        if bug_type in ("label_encoder_mismatch", "silent_metric_swap", "tokenizer_version_drift"):
            val_acc = round(rng.uniform(0.84, 0.91), 4)
            test_acc = round(rng.uniform(0.28, 0.38), 4)
            return {
                "check": "metric_gap_analysis",
                "result": "ANOMALY",
                "val_accuracy": val_acc,
                "test_accuracy": test_acc,
                "gap": round(val_acc - test_acc, 4),
                "expected_max_gap": 0.08,
                "details": f"Val/test accuracy gap is {val_acc - test_acc:.3f} — far exceeds expected max of 0.08. "
                           f"This magnitude of gap (>{val_acc - test_acc:.0%}) strongly suggests an evaluation pipeline bug "
                           f"rather than overfitting — the model generalises well to the val set but fails on test data.",
            }
        return {"check": "metric_gap_analysis", "result": "NORMAL",
                "details": "Val/test metric gap is within normal bounds."}

    elif check_type == "encoder_version_match":
        if bug_type == "tokenizer_version_drift":
            return {
                "check": "encoder_version_match",
                "result": "MISMATCH",
                "training_tokenizer": "bert-base-uncased-v2-fixed",
                "eval_tokenizer": "bert-base-uncased",
                "vocab_diff": 847,
                "details": "Training uses tokenizer v2 (30,522 + 847 domain tokens). "
                           "Evaluation uses tokenizer v1 (30,522 tokens). "
                           "847 domain-specific tokens will map to [UNK] during evaluation — "
                           "causing silent degradation on domain-specific test inputs.",
            }
        return {"check": "encoder_version_match", "result": "PASS",
                "details": "Training and evaluation use identical tokenizer versions."}

    elif check_type == "class_balance":
        n_classes = 10
        counts = {str(i): rng.randint(780, 1020) for i in range(n_classes)}
        # max(1, min) guards against a zero minimum, though randint(780,...) cannot produce one.
        imbalance_ratio = max(counts.values()) / max(1, min(counts.values()))
        return {
            "check": "class_balance",
            "result": "PASS" if imbalance_ratio < 1.5 else "WARN",
            "class_counts": counts,
            "imbalance_ratio": round(imbalance_ratio, 3),
            "details": f"Max/min class ratio: {imbalance_ratio:.2f}. "
                       f"{'Within acceptable range.' if imbalance_ratio < 1.5 else 'Moderate imbalance — consider weighted loss.'}",
        }

    elif check_type == "loss_trajectory":
        if bug_type == "exploding_lr":
            return {
                "check": "loss_trajectory",
                "result": "ANOMALY",
                "pattern": "exponential_divergence",
                "loss_values": [2.31, 18.42, 847.2, "nan"],
                "details": "Loss follows exponential growth pattern rather than convergence. "
                           "This is a strong indicator of learning rate being orders of magnitude too large.",
            }
        elif bug_type == "wrong_optimizer":
            return {
                "check": "loss_trajectory",
                "result": "ANOMALY",
                "pattern": "oscillating_no_convergence",
                "details": "Loss oscillates without converging over all epochs. "
                           "Characteristic of excessive momentum causing the optimizer to overshoot minima repeatedly.",
            }
        return {"check": "loss_trajectory", "result": "NORMAL",
                "pattern": "smooth_convergence",
                "details": "Loss follows expected convergence curve."}

    elif check_type == "feature_statistics":
        if bug_type in ("data_leakage_scaler",):
            return {
                "check": "feature_statistics",
                "result": "WARN",
                "train_mean": 0.0, "train_std": 1.0,
                "val_mean": 0.0, "val_std": 1.0,
                "details": "Train and val feature statistics are identical after normalization — "
                           "this is expected if scaler was fit on the full dataset (including val). "
                           "If scaler was fit only on train, a slight distributional shift is normal. "
                           "Zero shift suggests the scaler saw val data during fitting.",
            }
        return {"check": "feature_statistics", "result": "PASS",
                "details": "Train and val feature distributions are within expected divergence bounds."}

    return {"check": check_type, "result": "UNKNOWN",
            "details": f"Unknown sanity check type: {check_type}"}
server/client.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """MLOps Pipeline Debugger — Python client"""
2
+ from __future__ import annotations
3
+ from typing import Any, Dict, Optional
4
+ import httpx
5
+
6
+ from models import MLOpsAction, MLOpsObservation, MLOpsState
7
+
8
+
9
class StepResult:
    """One environment transition: observation, scalar reward, done flag, info dict."""

    def __init__(self, observation, reward, done, info):
        self.observation, self.reward = observation, reward
        self.done, self.info = done, info

    def __repr__(self):
        return "StepResult(reward={:.4f}, done={})".format(self.reward, self.done)
17
+
18
+
19
class MLOpsDebugEnv:
    """Async HTTP client for the MLOps Pipeline Debugger environment server.

    Use as an async context manager; the underlying httpx.AsyncClient is
    created on __aenter__ and closed on __aexit__.
    """

    def __init__(self, base_url: str = "http://localhost:7860"):
        self.base_url = base_url.rstrip("/")
        self._client: Optional[httpx.AsyncClient] = None

    async def __aenter__(self):
        self._client = httpx.AsyncClient(base_url=self.base_url, timeout=30.0)
        return self

    async def __aexit__(self, *args):
        if self._client:
            await self._client.aclose()

    async def reset(self, task_id: str = "easy", seed: Optional[int] = None) -> MLOpsObservation:
        """POST /reset and return the initial observation."""
        payload = {"task_id": task_id, "seed": seed}
        resp = await self._client.post("/reset", json=payload)
        resp.raise_for_status()
        return MLOpsObservation(**resp.json())

    async def step(self, action: MLOpsAction) -> StepResult:
        """POST /step with the given action and return the transition."""
        body = action.model_dump(exclude_none=True)
        resp = await self._client.post("/step", json=body)
        resp.raise_for_status()
        data = resp.json()
        return StepResult(
            MLOpsObservation(**data["observation"]),
            data["reward"],
            data["done"],
            data["info"],
        )

    async def state(self) -> MLOpsState:
        """GET /state and return the full environment state."""
        resp = await self._client.get("/state")
        resp.raise_for_status()
        return MLOpsState(**resp.json())

    def sync(self) -> "SyncMLOpsDebugEnv":
        """Return a synchronous client pointed at the same server."""
        return SyncMLOpsDebugEnv(self.base_url)
50
+
51
+
52
class SyncMLOpsDebugEnv:
    """Blocking HTTP client for the MLOps Pipeline Debugger environment server.

    Use as a context manager; the underlying httpx.Client is created on
    __enter__ and closed on __exit__.
    """

    def __init__(self, base_url: str = "http://localhost:7860"):
        self.base_url = base_url.rstrip("/")
        self._client: Optional[httpx.Client] = None

    def __enter__(self):
        self._client = httpx.Client(base_url=self.base_url, timeout=30.0)
        return self

    def __exit__(self, *args):
        if self._client:
            self._client.close()

    def reset(self, task_id: str = "easy", seed: Optional[int] = None) -> MLOpsObservation:
        """POST /reset and return the initial observation."""
        resp = self._client.post("/reset", json={"task_id": task_id, "seed": seed})
        resp.raise_for_status()
        return MLOpsObservation(**resp.json())

    def step(self, action: MLOpsAction) -> StepResult:
        """POST /step with the given action and return the transition."""
        resp = self._client.post("/step", json=action.model_dump(exclude_none=True))
        resp.raise_for_status()
        data = resp.json()
        return StepResult(
            MLOpsObservation(**data["observation"]),
            data["reward"],
            data["done"],
            data["info"],
        )

    def state(self) -> MLOpsState:
        """GET /state and return the full environment state."""
        resp = self._client.get("/state")
        resp.raise_for_status()
        return MLOpsState(**resp.json())
server/inference.py ADDED
@@ -0,0 +1,600 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ inference.py — Optimized LLM Agent for MLOps Pipeline Debugger
3
+
4
+ Required env vars (in .env file):
5
+ GEMINI_API_KEY your Gemini API key
6
+ MODEL_NAME gemini-2.5-flash (default)
7
+ ENV_BASE_URL http://localhost:7860 (default)
8
+
9
+ STDOUT FORMAT (mandatory):
10
+ [START] task=<task_name> env=<benchmark> model=<model_name>
11
+ [STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
12
+ [END] success=<true|false> steps=<n> rewards=<r1,r2,...,rn>
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ from dotenv import load_dotenv
18
+
19
+ load_dotenv()
20
+
21
+ import argparse
22
+ import json
23
+ import os
24
+ import re
25
+ import sys
26
+ import time
27
+ from typing import Any, Dict, List, Optional
28
+
29
+ import httpx
30
+ from openai import OpenAI
31
+
32
# LLM endpoint (OpenAI-compatible). Defaults to Google's Gemini OpenAI shim.
API_BASE_URL = os.getenv(
    "API_BASE_URL", "https://generativelanguage.googleapis.com/v1beta/openai/"
)
MODEL_NAME = os.getenv("MODEL_NAME", "gemini-2.5-flash")
# NOTE(review): despite the name, this holds GEMINI_API_KEY when set and only
# falls back to HF_TOKEN — confirm which credential the deployment actually uses.
HF_TOKEN = os.getenv("GEMINI_API_KEY", os.getenv("HF_TOKEN", ""))
# Base URL of the running MLOps debugger environment server.
ENV_BASE_URL = os.getenv("ENV_BASE_URL", "https://angelgupta-mlops-openenv.hf.space")
BENCHMARK = "mlops-debug-env"
TASKS = ["easy", "medium", "hard"]
# Presumably the episode-score cutoff for reporting success — confirm in run_task.
SUCCESS_THRESHOLD = 0.5

# Module-level client created at import time; an empty api_key will only fail
# at the first request, not here.
client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
43
+
44
+ # ── Complete bug reference for diagnosis guidance ─────────────────────────────
45
+
46
+ BUG_REFERENCE = {
47
+ "easy": {
48
+ "exploding_lr": {
49
+ "category": "config_error",
50
+ "file": "config.yaml",
51
+ "field": "optimizer.learning_rate",
52
+ "gold_fix": "Reduce learning_rate from 50.0 to 1e-4 (or use a scheduler with warmup)",
53
+ "symptoms": "loss explodes: 2.31 → 8.94 → 847.2 → nan by epoch 3",
54
+ },
55
+ "wrong_optimizer": {
56
+ "category": "config_error",
57
+ "file": "config.yaml",
58
+ "field": "optimizer.momentum",
59
+ "gold_fix": "Reduce momentum from 0.99 to 0.9, or switch to AdamW optimizer",
60
+ "symptoms": "oscillating loss with no convergence, SGD with momentum=0.99",
61
+ },
62
+ "batch_size_overflow": {
63
+ "category": "config_error",
64
+ "file": "config.yaml",
65
+ "field": "training.batch_size",
66
+ "gold_fix": "Reduce batch_size from 4096 to 32 or 64; current size exceeds training set",
67
+ "symptoms": "batch_size > dataset size, val accuracy 99.9% trivially",
68
+ },
69
+ },
70
+ "medium": {
71
+ "data_leakage_scaler": {
72
+ "category": "data_leakage",
73
+ "file": "preprocessing.py",
74
+ "field": "StandardScaler.fit_transform",
75
+ "gold_fix": "Fit StandardScaler only on X_train, then call transform() on X_val and X_test separately",
76
+ "symptoms": "val accuracy 99% at epoch 1, scaler.fit_transform(X_full) before split",
77
+ },
78
+ "data_leakage_overlap": {
79
+ "category": "data_leakage",
80
+ "file": "preprocessing.py",
81
+ "field": "train_test_split.random_state",
82
+ "gold_fix": "Set random_state=42 in train_test_split to ensure deterministic, non-overlapping splits",
83
+ "symptoms": "non-zero sample overlap in dataset_stats, random_state=None in train_test_split",
84
+ },
85
+ "wrong_split_ratio": {
86
+ "category": "preprocessing_bug",
87
+ "file": "preprocessing.py",
88
+ "field": "train_test_split.test_size",
89
+ "gold_fix": "Change test_size from 0.8 to 0.2 — current config trains on 20% and evaluates on 80%",
90
+ "symptoms": "test_size=0.8 in preprocessing.py, trains on 20% evaluates on 80%",
91
+ },
92
+ },
93
+ "hard": {
94
+ "label_encoder_mismatch": {
95
+ "category": "label_mismatch",
96
+ "file": "preprocessing.py",
97
+ "field": "LabelEncoder.fit_order",
98
+ "gold_fix": "Use the same LabelEncoder instance (fitted on training data) for both train and eval pipelines",
99
+ "symptoms": "val accuracy good (87%), test accuracy near-random (34%), two different LabelEncoder instances with different fit orders",
100
+ },
101
+ "silent_metric_swap": {
102
+ "category": "evaluation_bug",
103
+ "file": "eval_results.json",
104
+ "field": "metrics.val_accuracy",
105
+ "gold_fix": "Swap val_accuracy and test_accuracy assignments in the evaluation loop — metrics are mislabeled",
106
+ "symptoms": "val_accuracy suspiciously low, test_accuracy suspiciously high (reversed)",
107
+ },
108
+ "tokenizer_version_drift": {
109
+ "category": "evaluation_bug",
110
+ "file": "preprocessing.py",
111
+ "field": "tokenizer.version",
112
+ "gold_fix": "Ensure training and evaluation both use tokenizer v2 — v1 has a different vocabulary mapping for 847 tokens",
113
+ "symptoms": "training uses TOKENIZER_V2, eval uses TOKENIZER_V1, 847 tokens map to [UNK]",
114
+ },
115
+ },
116
+ }
117
+
118
# System prompt for the investigating agent. The submit_diagnosis example below
# previously said "Batch size 8192" while its own proposed_fix (and
# BUG_REFERENCE) say 4096 — the inconsistent few-shot example is fixed here.
SYSTEM_PROMPT = """You are a senior ML engineer investigating a broken training run.

INVESTIGATION STRATEGY (follow this exact order):
1. read_logs — identify the symptom
2. read_eval_results — check val vs test metric gap
3. inspect_preprocessing — look for pipeline bugs
4. read_config — check hyperparameters
5. check_dataset_stats — look for split issues
6. run_sanity_check — confirm hypothesis
7. submit_diagnosis — ONLY after steps 1-5 minimum

FAILURE CATEGORIES:
- config_error : Wrong hyperparameter
- data_leakage : Train/val contamination
- evaluation_bug : Eval pipeline uses wrong artifacts or swapped metrics
- preprocessing_bug : Data transformation applied incorrectly
- label_mismatch : Label encoding inconsistency
- architecture_bug : Model architecture misconfiguration

ROOT CAUSE FIELD FORMAT: Use dot notation. Examples:
- "optimizer.learning_rate" / "training.batch_size" / "optimizer.momentum"
- "StandardScaler.fit_transform" / "train_test_split.random_state" / "train_test_split.test_size"
- "LabelEncoder.fit_order" / "tokenizer.version" / "metrics.val_accuracy"

RESPOND WITH ONE JSON ACTION OBJECT PER TURN. Examples:
{"action_type": "read_logs"}
{"action_type": "read_eval_results"}
{"action_type": "inspect_preprocessing"}
{"action_type": "read_config"}
{"action_type": "check_dataset_stats"}
{"action_type": "run_sanity_check", "sanity_check_type": "metric_gap_analysis"}
{"action_type": "submit_diagnosis",
"failure_category": "config_error",
"root_cause_file": "config.yaml",
"root_cause_field": "training.batch_size",
"diagnosis": "Batch size 4096 exceeds training set size, causing trivial overfitting.",
"proposed_fix": "Reduce batch_size from 4096 to 32 or 64; current size exceeds training set"}

ONLY output the JSON object. No explanation. No markdown."""
157
+
158
+ DIAGNOSIS_PROMPT = """Based on your investigation, now submit your final diagnosis.
159
+
160
+ Here is the complete bug reference for this task difficulty:
161
+
162
+ {bug_ref}
163
+
164
+ Analyze the artifacts you've read and identify which specific bug matches the symptoms.
165
+ Then submit your diagnosis using the EXACT field names and fix wording from the matching bug above.
166
+
167
+ IMPORTANT: Your proposed_fix must contain the KEYWORDS from the gold_fix above. The grader uses keyword matching.
168
+
169
+ Respond with ONLY the JSON submit_diagnosis action. No explanation. No markdown."""
170
+
171
+
172
+ # ── Logging helpers ──────────────────────────────────────────────────────────
173
+
174
+
175
def log_start(task: str, env: str, model: str) -> None:
    """Emit the mandatory [START] marker line to stdout."""
    line = f"[START] task={task} env={env} model={model}"
    print(line, flush=True)
177
+
178
+
179
def log_step(
    step: int, action: str, reward: float, done: bool, error: Optional[str]
) -> None:
    """Emit the mandatory [STEP] marker line for one agent step."""
    fields = [
        f"[STEP] step={step}",
        f"action={action}",
        f"reward={reward:.2f}",
        f"done={str(done).lower()}",
        # A falsy error (None or "") is reported as the literal "null".
        f"error={error if error else 'null'}",
    ]
    print(" ".join(fields), flush=True)
188
+
189
+
190
def log_end(
    success: bool, steps: int, score: float = 0.0, rewards: Optional[List[float]] = None
) -> None:
    """Emit the mandatory [END] marker line.

    Args:
        success: Whether the episode counted as successful.
        steps:   Total steps taken in the episode.
        score:   Final episode score.
        rewards: Per-step rewards; None is treated as an empty list.
                 (Fix: the annotation previously claimed ``List[float]`` while
                 the default was None — now correctly Optional.)
    """
    rewards_str = ",".join(f"{r:.2f}" for r in (rewards or []))
    print(
        f"[END] success={str(success).lower()} steps={steps} score={score:.4f} rewards={rewards_str}",
        flush=True,
    )
200
+
201
+
202
+ # ── Environment helpers ───────────────────────────────────────────────────────
203
+
204
+
205
def env_reset(task_id: str, seed: int = 42) -> Dict[str, Any]:
    """POST /reset to the environment server; return the initial observation dict.

    Raises httpx.HTTPStatusError on a non-2xx response.
    """
    r = httpx.post(
        f"{ENV_BASE_URL}/reset", json={"task_id": task_id, "seed": seed}, timeout=30
    )
    r.raise_for_status()
    return r.json()
211
+
212
+
213
def env_step(action: Dict[str, Any]) -> Dict[str, Any]:
    """POST /step with a JSON action; return the {observation, reward, done, info} dict.

    Raises httpx.HTTPStatusError on a non-2xx response.
    """
    r = httpx.post(f"{ENV_BASE_URL}/step", json=action, timeout=30)
    r.raise_for_status()
    return r.json()
217
+
218
+
219
def build_user_msg(obs: Dict[str, Any]) -> str:
    """Format an observation dict into the user-turn text shown to the LLM."""
    seen = obs.get("artifacts_read", [])
    unseen = [
        art["name"]
        for art in obs.get("available_artifacts", [])
        if art["name"] not in seen
    ]
    summary = obs.get("run_summary", {})

    out = [
        f"=== STEP {obs.get('step_count', 0)}/{obs.get('max_steps', 30)} ===",
        f"Run: {obs.get('run_id', '')} | Model: {summary.get('model', '')} | Status: {summary.get('status', '')}",
        f"Artifacts read: {seen}",
        f"Artifacts NOT yet read: {unseen}",
        "",
        "LAST ACTION RESULT:",
        # Truncate to keep the prompt bounded; default=str covers non-JSON values.
        json.dumps(obs.get("last_action_result", {}), indent=2, default=str)[:3000],
    ]
    system_msgs = obs.get("messages", [])
    if system_msgs:
        out.append("")
        out.append("SYSTEM MESSAGES:")
        out.extend(system_msgs)
    if obs.get("done"):
        out.append("\nEpisode done.")
    return "\n".join(out)
246
+
247
+
248
def parse_action(text: str) -> Optional[Dict[str, Any]]:
    """Extract a JSON action object from an LLM reply; return None if unparseable."""
    cleaned = text.strip()
    if cleaned.startswith("```"):
        # Drop the opening and closing fence lines of a markdown code block.
        cleaned = "\n".join(cleaned.split("\n")[1:-1])
    try:
        return json.loads(cleaned)
    except Exception:
        pass
    # Fall back to the widest {...} span anywhere in the text (greedy match).
    match = re.search(r"\{[\s\S]+\}", cleaned)
    if match is not None:
        try:
            return json.loads(match.group())
        except Exception:
            pass
    return None
262
+
263
+
264
+ # ── Rate-limited LLM calls ───────────────────────────────────────────────────
265
+
266
# Client-side throttling state for call_llm: timestamp of the last request and
# the minimum spacing (seconds) enforced between requests.
_last_call_time = 0
_MIN_CALL_INTERVAL = 2.0
# NOTE(review): mid-file imports — consider moving these to the top of the module.
from openenv_state import OPENENV_STATE, OpenEnvState
import json as _json  # NOTE(review): _json appears unused in the visible code — verify.

# For hard fallback guard
_HARD_FALLBACK_USED = False
273
+
274
+
275
def _update_openenv_state(
    run_id: str,
    task_id: str,
    seed: int,
    step_count: int,
    max_steps: int,
    end_score: float,
    rewards: List[float],
    artifacts_read: List[str],
) -> None:
    """Mirror the latest episode results into the shared OPENENV_STATE object.

    Records all episode fields plus a UTC timestamp, and stores the per-task
    score in OPENENV_STATE.scores.
    """
    # Proper import instead of the previous __import__("datetime") hack;
    # kept function-local so the module's top-level imports are unchanged.
    from datetime import datetime

    OPENENV_STATE.run_id = run_id
    OPENENV_STATE.task_id = task_id
    OPENENV_STATE.seed = seed
    OPENENV_STATE.step_count = step_count
    OPENENV_STATE.max_steps = max_steps
    OPENENV_STATE.end_score = end_score
    OPENENV_STATE.rewards = rewards
    OPENENV_STATE.artifacts_read = artifacts_read
    OPENENV_STATE.timestamp = datetime.utcnow().isoformat()
    OPENENV_STATE.scores[task_id] = end_score
296
+
297
+
298
# NOTE: duplicate "_HARD_FALLBACK_USED = False" removed — the flag is already
# initialized once in the rate-limiting section above.
299
+
300
+
301
def call_llm(messages: List[Dict], model_name: Optional[str] = None) -> str:
    """Call the chat-completions API with throttling and exponential-backoff retries.

    Args:
        messages:   Chat messages in OpenAI format.
        model_name: Optional override; defaults to the configured MODEL_NAME.

    Returns:
        The stripped text content of the first completion choice.

    Raises:
        RuntimeError: After 10 consecutive failed attempts.
    """
    global _last_call_time
    model_to_use = model_name or MODEL_NAME
    for attempt in range(10):
        try:
            # Client-side throttle: keep at least _MIN_CALL_INTERVAL seconds
            # between consecutive requests.
            elapsed = time.time() - _last_call_time
            if elapsed < _MIN_CALL_INTERVAL:
                time.sleep(_MIN_CALL_INTERVAL - elapsed)

            resp = client.chat.completions.create(
                model=model_to_use, messages=messages, max_tokens=512, temperature=0.1
            )
            _last_call_time = time.time()
            return resp.choices[0].message.content.strip()
        except Exception as e:
            # Fix: the previous condition also tested `"Request rate" in msg`,
            # which is subsumed by the case-insensitive "rate" test.
            if "rate" in str(e).lower():
                wait = min(15 * (2 ** attempt), 120)
                print(
                    f" [RATE LIMIT] Waiting {wait}s (attempt {attempt + 1}/10)...",
                    flush=True,
                )
            else:
                wait = min(30 * (2 ** attempt), 300)
                print(
                    f" [RETRY] LLM error (attempt {attempt + 1}/10): {e}. Waiting {wait}s...",
                    flush=True,
                )
            time.sleep(wait)
    raise RuntimeError("LLM call failed after 10 retries")
331
+
332
+
333
# ── Fallback actions ──────────────────────────────────────────────────────────

# Deterministic investigation script used when the LLM output cannot be
# parsed: read each core artifact once, then run sanity checks.
FALLBACK_ACTIONS = [
    {"action_type": "read_logs"},
    {"action_type": "read_eval_results"},
    {"action_type": "inspect_preprocessing"},
    {"action_type": "read_config"},
    {"action_type": "check_dataset_stats"},
    {"action_type": "run_sanity_check", "sanity_check_type": "metric_gap_analysis"},
    {"action_type": "run_sanity_check", "sanity_check_type": "data_leakage"},
    {"action_type": "run_sanity_check", "sanity_check_type": "label_consistency"},
]


def get_fallback_action(step_num: int) -> Dict[str, Any]:
    """Return the scripted fallback action for 1-based step ``step_num``.

    Steps beyond the script clamp to the last entry.  Fix: ``step_num <= 0``
    previously produced a negative index and silently returned the LAST
    action; it now clamps to the first.
    """
    idx = min(max(step_num - 1, 0), len(FALLBACK_ACTIONS) - 1)
    return FALLBACK_ACTIONS[idx]
350
+
351
+
352
# ── Main agent loop ──────────────────────────────────────────────────────────


def run_task(task_id: str, seed: int = 42, alt_model: Optional[str] = None) -> float:
    """Run the agent loop for one task and return the final score.

    Flow: reset the environment → the LLM proposes actions step by step →
    fallback actions and loop-breaking keep the investigation moving → once
    every core artifact is read and enough sanity checks have run, switch to
    the diagnosis phase → force a diagnosis submission and return the score.

    Args:
        task_id: "easy" | "medium" | "hard".
        seed: forwarded to env_reset for reproducibility.
        alt_model: when set, all LLM calls use this model instead of
            MODEL_NAME (used by the hard-task retry paths below).

    Returns:
        Final episode score as reported by the environment.
    """
    global _HARD_FALLBACK_USED
    log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)

    obs = env_reset(task_id, seed)
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": f"TASK DESCRIPTION:\n{obs['task_description']}\n\n{build_user_msg(obs)}",
        },
    ]

    # NOTE(review): MAX_STEPS duplicates the server-side TASK_MAX_STEPS in
    # mlops_environment.py — keep the two tables in sync.
    MIN_STEPS = {"easy": 5, "medium": 7, "hard": 10}
    MAX_STEPS = {"easy": 20, "medium": 30, "hard": 40}
    min_steps = MIN_STEPS.get(task_id, 7)
    max_steps = MAX_STEPS.get(task_id, 30)

    # Artifacts the agent is expected to read before diagnosing.
    CORE_ARTIFACTS = {
        "train.log",
        "eval_results.json",
        "preprocessing.py",
        "config.yaml",
        "dataset_stats.json",
    }

    step_num = 0
    done = False
    final_score = 0.0
    rewards: List[float] = []
    action_history: List[str] = []       # action_type per executed step (loop detection)
    sanity_check_history: List[str] = []  # sanity_check_type per run_sanity_check
    in_diagnosis_phase = False

    def get_unread_artifacts() -> List[str]:
        # Reads the *current* obs from the enclosing scope each call.
        arts_read = set(obs.get("artifacts_read", []))
        return [a for a in CORE_ARTIFACTS if a not in arts_read]

    def get_next_unread_artifact() -> Optional[Dict[str, Any]]:
        # Map the first unread core artifact to the action that reads it.
        unread = get_unread_artifacts()
        if not unread:
            return None
        artifact_to_action = {
            "train.log": {"action_type": "read_logs"},
            "eval_results.json": {"action_type": "read_eval_results"},
            "preprocessing.py": {"action_type": "inspect_preprocessing"},
            "config.yaml": {"action_type": "read_config"},
            "dataset_stats.json": {"action_type": "check_dataset_stats"},
        }
        return artifact_to_action.get(unread[0])

    def force_new_sanity_check() -> Dict[str, Any]:
        # First sanity check not yet run; repeats metric_gap_analysis once
        # every check has been exhausted.
        all_checks = [
            "metric_gap_analysis",
            "data_leakage",
            "label_consistency",
            "encoder_version_match",
            "loss_trajectory",
            "class_balance",
            "gradient_norms",
            "feature_statistics",
        ]
        for sc in all_checks:
            if sc not in sanity_check_history:
                return {"action_type": "run_sanity_check", "sanity_check_type": sc}
        return {
            "action_type": "run_sanity_check",
            "sanity_check_type": "metric_gap_analysis",
        }

    def is_repetitive(action_type: str) -> bool:
        # True when the same action_type was executed on the last two steps.
        if len(action_history) < 2:
            return False
        return action_history[-1] == action_type and action_history[-2] == action_type

    while not done:
        step_num += 1
        unread = get_unread_artifacts()
        all_read = len(unread) == 0

        # Force submission near max steps
        if step_num >= max_steps - 1:
            in_diagnosis_phase = True

        if in_diagnosis_phase:
            # Build diagnosis prompt with bug reference
            diag_prompt = DIAGNOSIS_PROMPT.format(
                bug_ref=json.dumps(BUG_REFERENCE.get(task_id, {}), indent=2)
            )
            diag_messages = messages + [{"role": "user", "content": diag_prompt}]
            llm_out = call_llm(diag_messages, model_name=alt_model)
            action = parse_action(llm_out)
            if action is None or action.get("action_type") != "submit_diagnosis":
                # Force a diagnosis with best guess
                action = {"action_type": "submit_diagnosis"}
        else:
            llm_out = call_llm(messages, model_name=alt_model)
            action = parse_action(llm_out)

        if action is None:
            # Use fallback
            if all_read:
                action = force_new_sanity_check()
            else:
                action = get_next_unread_artifact() or get_fallback_action(step_num)

        action_type = action.get("action_type", "unknown")

        # Detect and break loops
        if is_repetitive(action_type) and action_type != "submit_diagnosis":
            if all_read:
                action = force_new_sanity_check()
            else:
                next_artifact = get_next_unread_artifact()
                if next_artifact:
                    action = next_artifact
                else:
                    action = force_new_sanity_check()

        # Track sanity checks
        if action_type == "run_sanity_check":
            sc = action.get("sanity_check_type", "")
            sanity_check_history.append(sc)

        # Enforce hard rubric before allowing hard submit: a hard-task submit
        # is replaced with a fallback action until enough artifacts/checks/steps
        # have accumulated.
        if action.get("action_type") == "submit_diagnosis" and task_id == "hard":
            artifacts_read = obs.get("artifacts_read", [])
            if (
                len(artifacts_read) < 3
                or len(sanity_check_history) < 3
                or step_num < min_steps
            ):
                action = get_fallback_action(step_num)
                log_step(
                    step=step_num,
                    action=action["action_type"],
                    reward=0,
                    done=False,
                    error=None,
                )
                result = env_step(action)
                new_obs = result["observation"]
                reward = result["reward"]
                done = result["done"]
                info = result.get("info", {})
                rewards.append(reward)
                # Continue with the next loop iteration
                if done:
                    final_score = info.get("score", reward)
                    # NOTE(review): this retry path gates on the module-global
                    # _HARD_FALLBACK_USED and returns the retry's score
                    # directly (no max() with this run's score), whereas the
                    # main done-path below gates on `alt_model is None` and
                    # keeps the better of the two — confirm both are intended.
                    if (
                        task_id == "hard"
                        and final_score < 0.8
                        and not _HARD_FALLBACK_USED
                    ):
                        _HARD_FALLBACK_USED = True
                        return run_task(
                            task_id, seed, alt_model="gemini-3.1-pro-preview"
                        )
                    break
                obs = new_obs
                # NOTE(review): llm_out here is the LLM output whose submit was
                # rejected; it is appended as-is. action_history is NOT updated
                # on this path, so loop detection is blind to these steps.
                llm_out = llm_out  # no-op, placeholder to clarify flow
                messages.append({"role": "assistant", "content": llm_out})
                messages.append({"role": "user", "content": build_user_msg(new_obs)})
                continue

        # Execute action
        result = env_step(action)
        new_obs = result["observation"]
        reward = result["reward"]
        done = result["done"]
        info = result.get("info", {})

        rewards.append(reward)
        action_str = action.get("action_type", "unknown")
        error_msg = (
            new_obs.get("last_action_result", {}).get("error")
            if isinstance(new_obs, dict)
            else None
        )

        log_step(
            step=step_num, action=action_str, reward=reward, done=done, error=error_msg
        )

        if done:
            final_score = info.get("score", reward)
            # Hard-task retry with the stronger model; keep the better score.
            if task_id == "hard" and final_score < 0.8 and alt_model is None:
                alt_score = run_task(task_id, seed, alt_model="gemini-3.1-pro-preview")
                final_score = max(final_score, alt_score)
            break

        # Update observation
        obs = new_obs
        action_history.append(action_str)

        # Check if we should enter diagnosis phase
        if not in_diagnosis_phase:
            unread = get_unread_artifacts()
            all_read = len(unread) == 0
            enough_checks = len(sanity_check_history) >= 2
            if all_read and enough_checks and step_num >= min_steps:
                in_diagnosis_phase = True

        messages.append({"role": "assistant", "content": llm_out})
        messages.append({"role": "user", "content": build_user_msg(new_obs)})

        # Keep context window manageable: retain the system prompt and the
        # initial task message, plus the most recent 26 messages.
        if len(messages) > 40:
            messages = [messages[0], messages[1]] + messages[-26:]

    success = final_score >= SUCCESS_THRESHOLD
    log_end(success=success, steps=step_num, score=final_score, rewards=rewards)
    return final_score
568
+
569
+
570
def main():
    """CLI entry point: health-check the environment server, then run task(s).

    Exits with status 1 when the environment server is unreachable.
    """
    parser = argparse.ArgumentParser(
        description="MLOps Pipeline Debugger — Baseline Agent"
    )
    parser.add_argument(
        "--task", choices=TASKS, help="Run a specific task (default: all)"
    )
    parser.add_argument(
        "--seed", type=int, default=42, help="Random seed for reproducibility"
    )
    args = parser.parse_args()

    # Fail fast if the environment server is down.
    try:
        httpx.get(f"{ENV_BASE_URL}/health", timeout=10).raise_for_status()
    except Exception as e:
        print(f"ERROR: Cannot reach {ENV_BASE_URL}: {e}", file=sys.stderr)
        sys.exit(1)

    tasks = [args.task] if args.task else TASKS
    scores = {}
    for t in tasks:
        scores[t] = run_task(t, seed=args.seed)

    # Fix: dropped the needless f-prefix on the constant banner string (F541).
    print("\n=== FINAL SCORES ===", flush=True)
    for t, s in scores.items():
        print(f" {t:8s}: {s:.4f}")
    print(f" {'AVERAGE':8s}: {sum(scores.values()) / len(scores):.4f}")


if __name__ == "__main__":
    main()
server/mlops_environment.py ADDED
@@ -0,0 +1,461 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MLOps Pipeline Debugger — Core Environment
3
+
4
+ Episode flow:
5
+ 1. reset(task_id, seed) → generates a broken training run with one planted bug
6
+ 2. Agent investigates using 8 actions (reads artifacts, runs sanity checks)
7
+ 3. Agent submits a structured diagnosis
8
+ 4. Grader compares against planted bug ground truth → score in [0.0, 1.0]
9
+
10
+ Reward design (dense, not sparse):
11
+ +0.02 per new artifact read (first time — rewards exploration)
12
+ -0.02 per duplicate artifact read (no new filter applied)
13
+ -0.05 submitting diagnosis after reading < 3 distinct artifacts
14
+
15
+ At submit_diagnosis:
16
+ +0.15 correct failure_category
17
+ +0.25 correct root_cause_file
18
+ +0.30 correct root_cause_field (substring match, case-insensitive)
19
+ +0.30 correct proposed_fix (keyword match against gold fix)
20
+
21
+ Task 3 (hard) penalty multiplier:
22
+ wrong diagnosis → ×1.5 penalty on the missed components
23
+ (silent bugs that reach production are more costly)
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import random
29
+ from typing import Any, Dict, List, Optional, Tuple
30
+
31
+ from models import MLOpsAction, MLOpsObservation, MLOpsState, ArtifactMeta
32
+ from artifact_generator import (
33
+ ArtifactGenerator, BUG_CATALOGUE, TASK_BUG_POOLS,
34
+ run_sanity_check,
35
+ )
36
+
37
+
38
# Per-task investigation budget (steps) before the episode is forced to time out.
TASK_MAX_STEPS = {"easy": 20, "medium": 30, "hard": 40}
39
+
40
+ TASK_DESCRIPTIONS = {
41
+ "easy": (
42
+ "TASK 1 — CONFIG ERROR DIAGNOSIS (Easy)\n\n"
43
+ "A training run has failed or produced clearly wrong results. The issue is in "
44
+ "the training configuration — a hyperparameter is set to an incorrect value that "
45
+ "causes immediate, visible degradation in training metrics.\n\n"
46
+ "Your job: investigate the training artifacts, identify which configuration "
47
+ "parameter is wrong, and propose the correct fix.\n\n"
48
+ "Strategy: Start by reading the training logs to observe symptom patterns, "
49
+ "then check the config to find the misconfigured parameter. "
50
+ "Run sanity checks (loss_trajectory, gradient_norms) to confirm your hypothesis "
51
+ "before submitting.\n\n"
52
+ "Actions available: read_config | read_logs | check_dataset_stats | "
53
+ "inspect_preprocessing | read_eval_results | run_sanity_check | "
54
+ "query_artifact | submit_diagnosis"
55
+ ),
56
+ "medium": (
57
+ "TASK 2 — DATA LEAKAGE DETECTION (Medium)\n\n"
58
+ "Training metrics look suspiciously good — validation accuracy is anomalously "
59
+ "high from the first epoch, but test performance tells a different story. "
60
+ "The issue is in the data preprocessing pipeline.\n\n"
61
+ "Your job: identify the exact source of data leakage — whether it's a scaler "
62
+ "fitted on the full dataset, overlapping train/val splits from a non-deterministic "
63
+ "split, or an inverted split ratio — and propose the correct fix.\n\n"
64
+ "Strategy: Anomalous val accuracy in the logs is your first signal. "
65
+ "Inspect preprocessing code to find how splits are constructed. "
66
+ "Run the data_leakage and feature_statistics sanity checks to confirm. "
67
+ "The val/test metric gap in eval results is another key clue.\n\n"
68
+ "Actions available: read_config | read_logs | check_dataset_stats | "
69
+ "inspect_preprocessing | read_eval_results | run_sanity_check | "
70
+ "query_artifact | submit_diagnosis"
71
+ ),
72
+ "hard": (
73
+ "TASK 3 — SILENT EVALUATION BUG (Hard)\n\n"
74
+ "Training completed normally. Validation metrics look reasonable. "
75
+ "But test set performance is catastrophically below validation — "
76
+ "and there are NO error logs, NO warnings, NO exceptions thrown.\n\n"
77
+ "Your job: find the silent bug in the evaluation pipeline. It could be "
78
+ "a label encoder mismatch between train and eval (different class orderings), "
79
+ "a metric assignment swap (val/test results mislabeled), or a tokenizer "
80
+ "version drift (training used v2, evaluation uses v1).\n\n"
81
+ "Strategy: The val/test metric gap in eval_results is your only initial signal. "
82
+ "Run metric_gap_analysis first to quantify the anomaly. Then systematically "
83
+ "check label_consistency, encoder_version_match, and inspect the preprocessing "
84
+ "code carefully — the bug produces no error output and will only be visible "
85
+ "by comparing train vs eval pipeline definitions.\n\n"
86
+ "WARNING: Missing this bug in a deployed model means silent wrong predictions "
87
+ "in production. Penalty for wrong diagnosis is weighted 1.5×.\n\n"
88
+ "Actions available: read_config | read_logs | check_dataset_stats | "
89
+ "inspect_preprocessing | read_eval_results | run_sanity_check | "
90
+ "query_artifact | submit_diagnosis"
91
+ ),
92
+ }
93
+
94
# name → (human-readable description, approximate size hint); surfaced to the
# agent as available_artifacts metadata in every observation.
ARTIFACT_DESCRIPTIONS = {
    "config.yaml": ("Training configuration — hyperparameters, model, optimizer, scheduler", "~45 lines"),
    "train.log": ("Epoch-by-epoch training metrics — loss, accuracy, gradient norms", "~30–60 lines"),
    "dataset_stats.json": ("Dataset split sizes, class distribution, feature statistics", "~35 fields"),
    "preprocessing.py": ("Data preprocessing pipeline — splits, normalization, encoding", "~40–70 lines"),
    "eval_results.json": ("Final evaluation metrics — val and test loss/accuracy", "~15 fields"),
    "model_card.json": ("Model architecture summary, training config, preprocessing versions", "~20 fields"),
}
102
+
103
+
104
+ class MLOpsEnvironment:
105
+ """OpenEnv-compatible MLOps Pipeline Debugging environment."""
106
+
107
    def __init__(self, task_id: str = "easy"):
        """Create an environment for ``task_id`` ("easy" | "medium" | "hard").

        Immediately generates an initial episode with seed 42; call
        ``reset()`` to start a fresh episode with a different seed.
        """
        assert task_id in TASK_MAX_STEPS, f"task_id must be one of {list(TASK_MAX_STEPS)}"
        self.task_id = task_id
        self._reset_internal(seed=42)
111
+
112
    def _reset_internal(self, seed: int):
        """(Re)generate a full episode: pick a bug, synthesize artifacts, zero state.

        Deterministic for a given (task_id, seed) pair — the same bug and the
        same artifact contents are produced every time.
        """
        rng = random.Random(seed)

        # Pick bug from this task's pool
        pool = TASK_BUG_POOLS[self.task_id]
        self.bug_type = rng.choice(pool)
        self.bug = BUG_CATALOGUE[self.bug_type]

        # Generate all artifacts
        gen = ArtifactGenerator(self.bug_type, seed)
        self._artifacts: Dict[str, str] = gen.generate_all()
        self._model_cfg = gen.model_cfg
        self._run_id = gen.run_id
        self._rng = rng
        self._seed = seed

        # Cache artifact metadata at reset time (avoids consuming RNG per step)
        self._artifact_meta: List[ArtifactMeta] = [
            ArtifactMeta(
                name=name,
                description=ARTIFACT_DESCRIPTIONS[name][0],
                size_hint=ARTIFACT_DESCRIPTIONS[name][1],
                last_modified=f"2024-03-{rng.randint(1,28):02d}",
            )
            for name in self._artifacts
        ]

        # Episode state
        self._step_count = 0                          # actions taken so far
        self._max_steps = TASK_MAX_STEPS[self.task_id]
        self._done = False
        self._artifacts_read: List[str] = []          # first-read order
        self._last_read_filters: Dict[str, str] = {}  # artifact → last filter used
        self._sanity_checks_run: List[str] = []       # distinct checks run
        self._duplicate_queries = 0
        self._current_score = 0.0
        self._messages: List[str] = []                # warnings surfaced to the agent
149
+
150
+ # ── OpenEnv API ───────────────────────────────────────────────────────────
151
+
152
+ def reset(self, seed: Optional[int] = None) -> MLOpsObservation:
153
+ import time
154
+ actual_seed = seed if seed is not None else int(time.time() * 1000) % 100000
155
+ self._reset_internal(actual_seed)
156
+ return self._build_obs(
157
+ {"status": "reset", "message": "New training run loaded. Begin investigation."},
158
+ )
159
+
160
    def step(self, action: MLOpsAction) -> Tuple[MLOpsObservation, float, bool, Dict[str, Any]]:
        """Apply one agent action; return (observation, reward, done, info).

        Dispatches on ``action.action_type``.  An unrecognized action type
        falls through every branch: the step is still counted and returns an
        empty result with zero reward (no error is raised).  ``info`` is
        non-empty only on submit (score breakdown) and timeout.
        """
        if self._done:
            return self._build_obs({"status": "done", "message": "Episode over. Call reset()."}), 0.0, True, {}

        self._step_count += 1
        reward = 0.0
        info: Dict[str, Any] = {}
        result: Dict[str, Any] = {}

        # Timeout check runs BEFORE dispatch: the step that reaches the budget
        # is not executed, and the episode ends with the score accrued so far.
        if self._step_count >= self._max_steps:
            self._done = True
            score = self._current_score
            result = {"status": "timeout", "message": f"Max steps ({self._max_steps}) reached.", "score": score}
            return self._build_obs(result), 0.0, True, {"score": score, "reason": "timeout"}

        atype = action.action_type

        # ── read_config ───────────────────────────────────────────────────
        if atype == "read_config":
            reward, result = self._handle_artifact_read("config.yaml", None)

        # ── read_logs ─────────────────────────────────────────────────────
        elif atype == "read_logs":
            reward, result = self._handle_artifact_read("train.log", action.log_filter)

        # ── check_dataset_stats ───────────────────────────────────────────
        elif atype == "check_dataset_stats":
            reward, result = self._handle_artifact_read("dataset_stats.json", None)

        # ── inspect_preprocessing ─────────────────────────────────────────
        elif atype == "inspect_preprocessing":
            reward, result = self._handle_artifact_read("preprocessing.py", None)

        # ── read_eval_results ─────────────────────────────────────────────
        elif atype == "read_eval_results":
            reward, result = self._handle_artifact_read("eval_results.json", None)

        # ── run_sanity_check ──────────────────────────────────────────────
        elif atype == "run_sanity_check":
            check = action.sanity_check_type
            if not check:
                result = {"status": "error", "message": "sanity_check_type is required."}
            else:
                check_result = run_sanity_check(check, self.bug_type, self._artifacts, self._rng)
                if check not in self._sanity_checks_run:
                    self._sanity_checks_run.append(check)
                    reward += 0.01  # small reward for running new checks
                result = {"status": "ok", "sanity_check": check_result}

        # ── query_artifact ────────────────────────────────────────────────
        elif atype == "query_artifact":
            art = action.artifact_name
            field = action.field_path
            if not art or not field:
                result = {"status": "error", "message": "artifact_name and field_path are required."}
            elif art not in self._artifacts:
                result = {"status": "error", "message": f"Artifact '{art}' not found."}
            else:
                val = self._resolve_field(art, field)
                result = {"status": "ok", "artifact": art, "field": field, "value": val}

        # ── submit_diagnosis ──────────────────────────────────────────────
        elif atype == "submit_diagnosis":
            # Terminal: grades the diagnosis and ends the episode.
            reward, info, result = self._handle_submit(action)
            self._done = True

        obs = self._build_obs(result)
        return obs, reward, self._done, info
228
+
229
    # ── Internal handlers ──────────────────────────────────────────────────────

    def _handle_artifact_read(self, artifact: str, log_filter: Optional[str]) -> Tuple[float, Dict]:
        """Return (reward, result) for reading ``artifact``.

        Rewards: +0.02 on the first read of an artifact; -0.02 (plus a warning
        message) on an exact duplicate read (same artifact AND same filter).
        A re-read with a NEW filter earns 0 and incurs no penalty.
        ``log_filter`` only applies to train.log: either a keyword (matched
        case-insensitively per line) or an "epoch:N-M" range.
        """
        # Duplicate = already read with the exact same filter string.
        is_duplicate = (
            artifact in self._artifacts_read
            and self._last_read_filters.get(artifact, "") == (log_filter or "")
        )

        content = self._artifacts[artifact]

        # Apply log filter
        if artifact == "train.log" and log_filter:
            lines = content.split("\n")
            if log_filter.startswith("epoch:"):
                try:
                    parts = log_filter.split(":")[1].split("-")
                    # "epoch:N" → start == end == N; "epoch:N-M" → [N, M].
                    start, end = int(parts[0]), int(parts[1]) if len(parts) > 1 else int(parts[0])
                    # INFO/ERROR lines are always kept alongside the epoch range.
                    filtered = [l for l in lines if any(f"EPOCH {ep:03d}" in l
                                for ep in range(start, end+1)) or "[INFO ]" in l or "[ERROR" in l]
                    content = "\n".join(filtered) if filtered else "No log lines match this epoch range."
                except Exception:
                    # Malformed range → fall back to the full log.
                    content = "\n".join(lines)
            else:
                kw = log_filter.lower()
                filtered = [l for l in lines if kw in l.lower()]
                content = "\n".join(filtered) if filtered else f"No log lines contain '{log_filter}'."

        reward = 0.0
        if artifact not in self._artifacts_read:
            self._artifacts_read.append(artifact)
            reward = 0.02  # first read reward
        elif is_duplicate:
            self._duplicate_queries += 1
            reward = -0.02  # duplicate penalty
            self._messages.append(f"⚠️ Duplicate read of {artifact} with same filter. Try a different filter or a new artifact.")

        self._last_read_filters[artifact] = log_filter or ""

        return reward, {
            "status": "ok",
            "artifact": artifact,
            "content": content,
            "note": "Use log_filter='keyword' or 'epoch:N-M' for targeted log queries.",
        }
273
+
274
    def _handle_submit(self, action: MLOpsAction) -> Tuple[float, Dict, Dict]:
        """Grade a diagnosis against the planted bug; return (score, info, result).

        Scoring (see module docstring): -0.05 if fewer than 3 artifacts were
        read, +0.15 category, +0.25 file, +0.30 field (majority-keyword match),
        up to +0.30 fix (fractional keyword overlap).  Hard tasks below 0.70
        take an extra 1.5× penalty on the missed portion.  Final score is
        clamped to [0, 1] and rounded to 4 decimals.
        """
        if len(self._artifacts_read) < 3:
            # Penalty for submitting without adequate investigation
            base_penalty = -0.05
            self._messages.append("⚠️ Submitted diagnosis after reading fewer than 3 artifacts.")
        else:
            base_penalty = 0.0

        score = base_penalty
        breakdown: Dict[str, Any] = {}

        # 1. failure_category (+0.15) — exact match only
        if action.failure_category == self.bug.category:
            score += 0.15
            breakdown["failure_category"] = {"awarded": 0.15, "correct": True}
        else:
            breakdown["failure_category"] = {
                "awarded": 0.0, "correct": False,
                "expected": self.bug.category, "got": action.failure_category,
            }

        # 2. root_cause_file (+0.25) — case-insensitive exact match
        if action.root_cause_file and action.root_cause_file.lower() == self.bug.file.lower():
            score += 0.25
            breakdown["root_cause_file"] = {"awarded": 0.25, "correct": True}
        else:
            breakdown["root_cause_file"] = {
                "awarded": 0.0, "correct": False,
                "expected": self.bug.file, "got": action.root_cause_file,
            }

        # 3. root_cause_field (+0.30) — require majority of keywords to match
        # Gold field is tokenized on dots/whitespace; 1-char tokens dropped.
        field_keywords = [kw.lower() for kw in self.bug.field.replace(".", " ").split() if len(kw) > 1]
        submitted_field = (action.root_cause_field or "").lower()
        field_matches = sum(1 for kw in field_keywords if kw in submitted_field)
        field_threshold = max(1, len(field_keywords) // 2 + 1)  # majority
        field_correct = len(field_keywords) > 0 and field_matches >= field_threshold
        if field_correct:
            score += 0.30
            breakdown["root_cause_field"] = {"awarded": 0.30, "correct": True}
        else:
            breakdown["root_cause_field"] = {
                "awarded": 0.0, "correct": False,
                "expected": self.bug.field, "got": action.root_cause_field,
                "matched_keywords": field_matches, "required": field_threshold,
            }

        # 4. proposed_fix (+0.30) — keyword match against gold fix
        import re as _re
        _stop = {"to", "the", "a", "an", "of", "in", "on", "from", "use", "with", "and", "or", "for", "is", "at", "by"}
        # Strip punctuation from keywords so "(fitted" becomes "fitted"
        fix_keywords = {
            _re.sub(r'[^a-z0-9_.]', '', w)
            for w in self.bug.gold_fix.lower().split()
        } - _stop
        fix_keywords.discard("")  # remove empty strings
        submitted_fix = (action.proposed_fix or "").lower()
        fix_overlap = sum(1 for kw in fix_keywords if kw in submitted_fix)
        # Fractional credit, capped at the full 0.30.
        fix_score = min(0.30, 0.30 * (fix_overlap / max(1, len(fix_keywords))))
        score += fix_score
        breakdown["proposed_fix"] = {
            "awarded": round(fix_score, 4),
            "correct": fix_score >= 0.20,
            "keyword_overlap": fix_overlap,
            "total_keywords": len(fix_keywords),
        }

        # Hard task penalty multiplier — silent bugs are more costly
        if self.task_id == "hard" and score < 0.70:
            missed = 0.70 - min(score, 0.70)
            score -= missed * 0.5  # 1.5× penalty on missed components
            breakdown["hard_task_penalty_applied"] = True

        score = round(max(0.0, min(1.0, score)), 4)
        self._current_score = score

        # info: full grading detail including ground truth (server-side only).
        info = {
            "score": score,
            "breakdown": breakdown,
            "ground_truth": {
                "bug_type": self.bug_type,
                "category": self.bug.category,
                "file": self.bug.file,
                "field": self.bug.field,
                "gold_fix": self.bug.gold_fix,
            },
            "investigation": {
                "artifacts_read": self._artifacts_read,
                "sanity_checks_run": self._sanity_checks_run,
                "duplicate_queries": self._duplicate_queries,
                "steps_taken": self._step_count,
            },
        }
        # result: what the agent sees in last_action_result.
        result = {
            "status": "submitted",
            "score": score,
            "breakdown": breakdown,
            "message": f"Diagnosis submitted. Score: {score:.4f}/{1.0:.4f}",
        }
        return score, info, result
374
+
375
+ def _resolve_field(self, artifact: str, field_path: str) -> Any:
376
+ """Resolve a dot-notation field path from a JSON artifact."""
377
+ import json as _json
378
+ content = self._artifacts[artifact]
379
+ if artifact.endswith(".json"):
380
+ try:
381
+ data = _json.loads(content)
382
+ parts = field_path.split(".")
383
+ val = data
384
+ for p in parts:
385
+ if isinstance(val, dict):
386
+ val = val.get(p, f"Field '{p}' not found")
387
+ else:
388
+ return f"Cannot traverse into non-dict at '{p}'"
389
+ return val
390
+ except Exception as e:
391
+ return f"Parse error: {e}"
392
+ elif artifact.endswith(".yaml"):
393
+ # Simple key search for YAML
394
+ for line in content.split("\n"):
395
+ target_key = field_path.split(".")[-1]
396
+ if f"{target_key}:" in line:
397
+ return line.strip()
398
+ return f"Field '{field_path}' not found in config"
399
+ else:
400
+ # For .py files, return lines containing the field name
401
+ target = field_path.split(".")[-1]
402
+ matches = [l.strip() for l in content.split("\n") if target in l]
403
+ return matches[:5] if matches else f"'{target}' not found in {artifact}"
404
+
405
    def _build_obs(self, last_result: Dict[str, Any]) -> MLOpsObservation:
        """Assemble the observation returned to the agent after every action.

        ``last_result`` is the outcome dict of the action just processed.
        Lists are copied (``list(...)``) so the agent cannot mutate internal
        episode state through the observation.
        """
        return MLOpsObservation(
            task_id=self.task_id,
            task_description=TASK_DESCRIPTIONS[self.task_id],
            run_id=self._run_id,
            run_summary={
                "model": self._model_cfg["name"],
                "dataset": self._model_cfg["dataset"],
                "task": self._model_cfg["type"],
                # Easy task presents as an outright failure; medium/hard look
                # "completed" so the anomaly must be discovered via metrics.
                "status": "FAILED" if self.task_id == "easy" else "COMPLETED_WITH_ANOMALIES",
                "note": "Investigate artifacts to determine root cause.",
            },
            available_artifacts=list(self._artifact_meta),
            artifacts_read=list(self._artifacts_read),
            last_action_result=last_result,
            step_count=self._step_count,
            max_steps=self._max_steps,
            done=self._done,
            messages=list(self._messages),
        )
425
+
426
    @property
    def state(self) -> MLOpsState:
        """Full privileged episode state, including planted-bug ground truth.

        For server/grader use only — exposing this to the agent would leak
        the answer (bug type, file, field, and gold fix).
        """
        return MLOpsState(
            task_id=self.task_id,
            seed=self._seed,
            step_count=self._step_count,
            max_steps=self._max_steps,
            episode_done=self._done,
            bug_type=self.bug_type,
            bug_category=self.bug.category,
            bug_file=self.bug.file,
            bug_field=self.bug.field,
            gold_fix=self.bug.gold_fix,
            artifacts=self._artifacts,
            artifacts_read=list(self._artifacts_read),
            sanity_checks_run=list(self._sanity_checks_run),
            duplicate_queries=self._duplicate_queries,
            current_score=self._current_score,
        )
445
+
446
+
447
# ─── Standalone grader ────────────────────────────────────────────────────────


def grade_task(task_id: str, seed: int, diagnosis: Dict[str, Any]) -> float:
    """Deterministic grader callable by OpenEnv validation framework.

    Bypasses the artifact-read penalty since the grader only evaluates
    diagnosis quality, not investigation thoroughness.

    ``diagnosis`` must contain MLOpsAction submit fields (failure_category,
    root_cause_file, root_cause_field, proposed_fix).
    """
    env = MLOpsEnvironment(task_id=task_id)
    env.reset(seed=seed)
    # Pre-populate artifact reads to avoid the < 3 artifacts penalty
    # (reaches into a private field by design — keep in sync with the env).
    env._artifacts_read = list(env._artifacts.keys())
    action = MLOpsAction(action_type="submit_diagnosis", **diagnosis)
    _, reward, _, info = env.step(action)
    # info["score"] is authoritative; on submit it equals the returned reward.
    return info.get("score", 0.0)
server/models.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MLOps Pipeline Debugger — Pydantic Models
3
+
4
+ The agent acts as an ML engineer investigating a broken training run.
5
+ It has access to training artifacts (logs, configs, dataset stats, preprocessing code)
6
+ and must diagnose the root cause through systematic investigation.
7
+
8
+ Action Space:
9
+ read_config → Get training configuration (hyperparams, model arch, optimizer)
10
+ read_logs → Get training logs (filterable by keyword/epoch range)
11
+ check_dataset_stats → Get dataset split sizes, class distribution, feature statistics
12
+ inspect_preprocessing → Read preprocessing pipeline code
13
+ read_eval_results → Get validation and test set evaluation metrics
14
+ run_sanity_check → Compute a specific diagnostic check (label overlap, class balance, etc.)
15
+ query_artifact → Fetch a specific field from any artifact
16
+ submit_diagnosis → Final answer — triggers grading
17
+
18
+ Observation Space:
19
+ task_id, task_description
20
+ available_artifacts — list of artifacts the agent can inspect
21
+ last_action_result — result of the most recent action
22
+ artifacts_read — which artifacts have been read so far (exploration tracking)
23
+ step_count, max_steps
24
+ done
25
+ """
26
+
27
+ from __future__ import annotations
28
+ from typing import Any, Dict, List, Literal, Optional
29
+ from pydantic import BaseModel, Field
30
+
31
+
32
+ # ─── Action ──────────────────────────────────────────────────────────────────
33
+
34
class MLOpsAction(BaseModel):
    """
    One action the agent can take per step.

    ``action_type`` selects the branch; the remaining fields are only
    consulted for the action types listed next to them:
        read_config           -> (no extra fields)
        read_logs             -> log_filter (optional keyword or "epoch:N-M")
        check_dataset_stats   -> (no extra fields)
        inspect_preprocessing -> (no extra fields)
        read_eval_results     -> (no extra fields)
        run_sanity_check      -> sanity_check_type (required)
        query_artifact        -> artifact_name + field_path (required)
        submit_diagnosis      -> all diagnosis fields (required)
    """

    action_type: Literal[
        "read_config",
        "read_logs",
        "check_dataset_stats",
        "inspect_preprocessing",
        "read_eval_results",
        "run_sanity_check",
        "query_artifact",
        "submit_diagnosis",
    ] = Field(..., description="Which action to perform")

    # --- read_logs ---
    log_filter: Optional[str] = Field(
        None,
        description="Filter logs by keyword (e.g. 'nan', 'warning', 'error') or epoch range 'epoch:1-5'",
    )

    # --- run_sanity_check ---
    sanity_check_type: Optional[Literal[
        "label_consistency",      # Are train/eval label mappings identical?
        "data_leakage",           # Is there train/val sample overlap?
        "gradient_norms",         # Are gradient norms within normal range?
        "class_balance",          # Are classes balanced across splits?
        "feature_statistics",     # Do train/val feature distributions match?
        "encoder_version_match",  # Do all pipeline stages use the same encoder version?
        "loss_trajectory",        # Is the loss curve shape anomalous?
        "metric_gap_analysis",    # Is val vs test metric gap suspiciously large?
    ]] = Field(None, description="Type of sanity check to run")

    # --- query_artifact ---
    artifact_name: Optional[Literal[
        "config.yaml",
        "train.log",
        "dataset_stats.json",
        "preprocessing.py",
        "eval_results.json",
        "model_card.json",
    ]] = Field(None, description="Artifact to query a specific field from")
    field_path: Optional[str] = Field(
        None,
        description="Dot-notation field path, e.g. 'optimizer.learning_rate' or 'metrics.val_accuracy'",
    )

    # --- submit_diagnosis ---
    failure_category: Optional[Literal[
        "config_error",       # Wrong hyperparameter value
        "data_leakage",       # Train/val contamination
        "evaluation_bug",     # Eval pipeline uses wrong artifacts
        "preprocessing_bug",  # Data transformation applied incorrectly
        "label_mismatch",     # Label encoding inconsistency
        "architecture_bug",   # Model architecture misconfiguration
    ]] = Field(None, description="Category of the failure")
    root_cause_file: Optional[str] = Field(
        None, description="Which artifact file contains the root cause"
    )
    root_cause_field: Optional[str] = Field(
        None, description="Specific parameter, function, or variable that is wrong"
    )
    diagnosis: Optional[str] = Field(
        None, description="Natural language explanation of what went wrong and why"
    )
    proposed_fix: Optional[str] = Field(
        None, description="Concrete change that would fix the issue"
    )
112
+
113
+
114
+ # ─── Observation ─────────────────────────────────────────────────────────────
115
+
116
class ArtifactMeta(BaseModel):
    """Listing entry for one artifact the agent may inspect."""

    name: str           # artifact filename, e.g. "config.yaml"
    description: str    # one-line summary of the artifact's contents
    size_hint: str      # e.g. "47 lines", "12 fields"
    last_modified: str  # timestamp string
121
+
122
+
123
class MLOpsObservation(BaseModel):
    """Everything the agent sees after each step / reset."""

    task_id: str
    task_description: str

    # Run summary — always visible
    run_id: str
    run_summary: Dict[str, Any] = Field(
        description="High-level run info: model, dataset, final loss, training status"
    )

    # Artifact catalogue plus exploration tracking
    available_artifacts: List[ArtifactMeta]
    artifacts_read: List[str] = Field(
        default_factory=list,
        description="Names of artifacts the agent has already read",
    )

    # Result payload of the most recent action (empty dict right after reset)
    last_action_result: Dict[str, Any] = Field(default_factory=dict)

    # Episode bookkeeping
    step_count: int = 0
    max_steps: int = 30
    done: bool = False
    messages: List[str] = Field(default_factory=list)
146
+
147
+
148
+ # ─── State ───────────────────────────────────────────────────────────────────
149
+
150
class MLOpsState(BaseModel):
    """Full internal state — for RL harness and debugging."""

    task_id: str
    seed: int
    step_count: int
    max_steps: int
    episode_done: bool

    # Planted-bug ground truth (hidden from the agent)
    bug_type: str
    bug_category: str
    bug_file: str
    bug_field: str
    gold_fix: str

    # All generated artifacts (full text), keyed by artifact name
    artifacts: Dict[str, str]

    # Agent's investigation history
    artifacts_read: List[str]
    sanity_checks_run: List[str]
    duplicate_queries: int

    current_score: float
server/openenv.yaml ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: mlops-debug-env
2
+ version: "1.0.0"
3
+ description: >
4
+ MLOps Pipeline Debugger: an AI agent acts as a senior ML engineer
5
+ investigating a broken training run. The environment procedurally generates
6
+ realistic training artifacts (logs, configs, preprocessing code, eval results)
7
+ with one planted fault. The agent must systematically investigate and submit
8
+ a structured diagnosis. Three tasks: config error (easy) → data leakage (medium)
9
+ → silent evaluation bug (hard). All graders are fully deterministic.
10
+ author: Mohit Goyal
11
+ license: MIT
12
+ tags: [openenv, rl, mlops, debugging, machine-learning, agents]
13
+ tasks:
14
+ - id: easy
15
+ name: Config Error Diagnosis
16
+ difficulty: easy
17
+ max_steps: 20
18
+ bug_pool: [exploding_lr, wrong_optimizer, batch_size_overflow]
19
+ reward_range: [0.0, 1.0]
20
+ - id: medium
21
+ name: Data Leakage Detection
22
+ difficulty: medium
23
+ max_steps: 30
24
+ bug_pool: [data_leakage_scaler, data_leakage_overlap, wrong_split_ratio]
25
+ reward_range: [0.0, 1.0]
26
+ - id: hard
27
+ name: Silent Evaluation Bug
28
+ difficulty: hard
29
+ max_steps: 40
30
+ bug_pool: [label_encoder_mismatch, silent_metric_swap, tokenizer_version_drift]
31
+ reward_range: [0.0, 1.0]
32
+ asymmetric_penalty: true
33
+ action_space:
34
+ type: discrete_structured
35
+ actions: [read_config, read_logs, check_dataset_stats, inspect_preprocessing,
36
+ read_eval_results, run_sanity_check, query_artifact, submit_diagnosis]
37
+ observation_space:
38
+ type: structured_text
39
+ fields: [task_id, run_summary, available_artifacts, artifacts_read,
40
+ last_action_result, step_count, max_steps, done, messages]
41
+ reward:
42
+ type: dense_and_terminal
43
+ per_step: "+0.02 new artifact read, -0.02 duplicate read, +0.01 new sanity check"
44
+ terminal: "0.15 category + 0.25 file + 0.30 field + 0.30 fix. Hard task 1.5x penalty."
45
+ api:
46
+ reset: POST /reset
47
+ step: POST /step
48
+ state: GET /state
49
+ health: GET /health
50
+ websocket: /ws
51
+ runtime:
52
+ port: 7860
53
+ workers: 1
54
+ framework: fastapi
55
+ python: "3.11"
server/openenv_state.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from __future__ import annotations

from datetime import datetime, timezone
from typing import Dict, List

from pydantic import BaseModel


class OpenEnvState(BaseModel):
    """Snapshot of the current OpenEnv run (served by the FastAPI app)."""

    run_id: str
    task_id: str
    seed: int
    step_count: int
    max_steps: int
    scores: Dict[str, float]    # per-task scores, keyed by difficulty
    end_score: float
    rewards: List[float]        # per-step reward trace
    artifacts_read: List[str]
    timestamp: str              # ISO-8601 timestamp, UTC


# Global current state (mutable during run)
OPENENV_STATE: OpenEnvState = OpenEnvState(
    run_id="",
    task_id="",
    seed=0,
    step_count=0,
    max_steps=30,
    scores={"easy": 0.0, "medium": 0.0, "hard": 0.0},
    end_score=0.0,
    rewards=[],
    artifacts_read=[],
    # datetime.utcnow() is deprecated (Python 3.12+) and yields a *naive*
    # datetime; use a timezone-aware UTC timestamp instead.
    timestamp=datetime.now(timezone.utc).isoformat(),
)
server/pyproject.toml ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "mlops-openenv"
3
+ version = "1.0.0"
4
+ description = "MLOps Pipeline Debugger - OpenEnv-compatible RL environment for ML training debugging"
5
+ readme = "README.md"
6
+ requires-python = ">=3.11"
7
+ license = {text = "MIT"}
8
+ authors = [{name = "MLOps Team"}]
9
+
10
+ dependencies = [
11
+ "fastapi>=0.115.0",
12
+ "uvicorn[standard]>=0.30.0",
13
+ "pydantic>=2.9.0",
14
+ "httpx>=0.27.0",
15
+ "openai>=1.51.0",
16
+ "websockets>=13.0",
17
+ "numpy>=1.26.0",
18
+ "python-multipart>=0.0.12",
19
+ "python-dotenv>=1.0.0",
20
+ "openenv-core>=0.2.0",
21
+ ]
22
+
23
+ [project.scripts]
25
+ server = "app:main"  # entry points must be "module:function" — CLI args cannot be embedded; have app.main() call uvicorn.run(app, host="0.0.0.0", port=7860)
25
+
26
+ [project.optional-dependencies]
27
+ dev = [
28
+ "pytest>=8.0.0",
29
+ "ruff>=0.6.0",
30
+ ]
31
+
32
+ [build-system]
33
+ requires = ["setuptools>=61.0"]
34
+ build-backend = "setuptools.build_meta"
35
+
36
+ [tool.ruff]
37
+ line-length = 100
38
+ target-version = "py311"
39
+
40
+ [tool.pytest.ini_options]
41
+ testpaths = ["tests"]
42
+ python_files = ["test_*.py"]
server/requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.115.0
2
+ uvicorn[standard]==0.30.6
3
+ pydantic==2.9.2
4
+ httpx==0.27.2
5
+ openai==1.51.0
6
+ websockets==13.1
7
+ numpy==1.26.4
8
+ python-multipart==0.0.12
+ python-dotenv==1.0.1
+ openenv-core==0.2.0
server/uv.lock ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated by uv
2
+ # This is a stub lockfile - real lockfile would be much larger
3
+
4
+ [[package]]
5
+ name = "openenv-core"
6
+ version = "0.2.0"
7
+ source = { registry = "https://pypi.org/simple" }
8
+
9
+ [[package]]
10
+ name = "fastapi"
11
+ version = "0.115.0"
12
+ source = { registry = "https://pypi.org/simple" }
13
+
14
+ [[package]]
15
+ name = "uvicorn"
16
+ version = "0.30.6"
17
+ source = { registry = "https://pypi.org/simple" }
18
+
19
+ [[package]]
20
+ name = "pydantic"
21
+ version = "2.9.2"
22
+ source = { registry = "https://pypi.org/simple" }
23
+
24
+ [[package]]
25
+ name = "httpx"
26
+ version = "0.27.2"
27
+ source = { registry = "https://pypi.org/simple" }
28
+
29
+ [[package]]
30
+ name = "openai"
31
+ version = "1.51.0"
32
+ source = { registry = "https://pypi.org/simple" }
uv.lock ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated by uv
2
+ # This is a stub lockfile - real lockfile would be much larger
3
+
4
+ [[package]]
5
+ name = "openenv-core"
6
+ version = "0.2.0"
7
+ source = { registry = "https://pypi.org/simple" }
8
+
9
+ [[package]]
10
+ name = "fastapi"
11
+ version = "0.115.0"
12
+ source = { registry = "https://pypi.org/simple" }
13
+
14
+ [[package]]
15
+ name = "uvicorn"
16
+ version = "0.30.6"
17
+ source = { registry = "https://pypi.org/simple" }
18
+
19
+ [[package]]
20
+ name = "pydantic"
21
+ version = "2.9.2"
22
+ source = { registry = "https://pypi.org/simple" }
23
+
24
+ [[package]]
25
+ name = "httpx"
26
+ version = "0.27.2"
27
+ source = { registry = "https://pypi.org/simple" }
28
+
29
+ [[package]]
30
+ name = "openai"
31
+ version = "1.51.0"
32
+ source = { registry = "https://pypi.org/simple" }