Spaces:
Sleeping
Sleeping
Commit ·
6a71058
1
Parent(s): 696c12c
Add uv.lock, pyproject.toml, server directory
Browse files- pyproject.toml +4 -0
- server/.env.example +5 -0
- server/app.py +181 -0
- server/artifact_generator.py +960 -0
- server/client.py +79 -0
- server/inference.py +600 -0
- server/mlops_environment.py +461 -0
- server/models.py +173 -0
- server/openenv.yaml +55 -0
- server/openenv_state.py +34 -0
- server/pyproject.toml +42 -0
- server/requirements.txt +8 -0
- server/uv.lock +32 -0
- uv.lock +32 -0
pyproject.toml
CHANGED
|
@@ -17,8 +17,12 @@ dependencies = [
|
|
| 17 |
"numpy>=1.26.0",
|
| 18 |
"python-multipart>=0.0.12",
|
| 19 |
"dotenv>=1.0.0",
|
|
|
|
| 20 |
]
|
| 21 |
|
|
|
|
|
|
|
|
|
|
| 22 |
[project.optional-dependencies]
|
| 23 |
dev = [
|
| 24 |
"pytest>=8.0.0",
|
|
|
|
| 17 |
"numpy>=1.26.0",
|
| 18 |
"python-multipart>=0.0.12",
|
| 19 |
"dotenv>=1.0.0",
|
| 20 |
+
"openenv-core>=0.2.0",
|
| 21 |
]
|
| 22 |
|
| 23 |
+
[project.scripts]
|
| 24 |
+
server = "server.app:app"  # NOTE(review): [project.scripts] entries must be "module:attr"; CLI flags like --host/--port cannot go here — launch with `uvicorn server.app:app --host 0.0.0.0 --port 7860` instead
|
| 25 |
+
|
| 26 |
[project.optional-dependencies]
|
| 27 |
dev = [
|
| 28 |
"pytest>=8.0.0",
|
server/.env.example
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
GEMINI_API_KEY=your_gemini_api_key_here
|
| 2 |
+
MODEL_NAME=gemini-2.5-flash
|
| 3 |
+
ENV_BASE_URL=https://angelgupta-mlops-openenv.hf.space
|
| 4 |
+
API_BASE_URL=https://generativelanguage.googleapis.com/v1beta/openai/
|
| 5 |
+
HF_TOKEN=your_hf_token_here
|
server/app.py
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
import json
|
| 3 |
+
from typing import Any, Dict, Optional
|
| 4 |
+
from fastapi import FastAPI, HTTPException, WebSocket, WebSocketDisconnect, Request
|
| 5 |
+
from openenv_state import OPENENV_STATE, OpenEnvState
|
| 6 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 7 |
+
from pydantic import BaseModel
|
| 8 |
+
|
| 9 |
+
from models import MLOpsAction, MLOpsObservation, MLOpsState
|
| 10 |
+
from mlops_environment import MLOpsEnvironment
|
| 11 |
+
|
| 12 |
+
# FastAPI application object plus the shared HTTP-mode environment handle.
app = FastAPI(
    title="MLOps Pipeline Debugger",
    description="OpenEnv environment: AI agent diagnoses broken ML training runs.",
    version="1.0.0",
)
# Wide-open CORS: this service is published as a public OpenEnv endpoint.
app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])

# Single environment instance shared by the plain-HTTP endpoints.
# Created lazily by POST /reset; websocket clients get their own instance.
_http_env: Optional[MLOpsEnvironment] = None
class ResetRequest(BaseModel):
    """Body accepted by POST /reset.

    Clients may send either ``task_id`` or the legacy alias ``task``;
    both default to the "easy" scenario when omitted.
    """
    task_id: Optional[str] = "easy"
    seed: Optional[int] = None
    task: Optional[str] = None  # legacy alias for task_id — both are honored
class StepResponse(BaseModel):
    """Envelope returned by POST /step: one (observation, reward, done, info) tuple."""
    observation: MLOpsObservation
    reward: float
    done: bool
    info: Dict[str, Any]
@app.post("/reset", response_model=MLOpsObservation)
async def reset(request: Request):
    """Start a fresh episode, replacing any environment from a previous /reset.

    The body is read leniently: an empty or non-JSON body falls back to the
    default "easy" task, and both "task_id" and the legacy "task" key work.
    """
    global _http_env
    try:
        payload = await request.json()
    except Exception:  # empty / malformed body → use defaults
        payload = {}
    chosen_task = payload.get("task_id") or payload.get("task") or "easy"
    _http_env = MLOpsEnvironment(task_id=chosen_task)
    return _http_env.reset(seed=payload.get("seed"))
+
|
| 48 |
+
@app.get("/")
async def root():
    """Service banner with a human-readable index of the available endpoints."""
    endpoint_index = {
        "GET /": "This message",
        "GET /health": "Health check",
        "GET /tasks": "List available tasks",
        "GET /openenv/state": "OpenEnv state",
        "POST /reset": "Start a new episode",
        "POST /step": "Take an action",
        "GET /state": "Get current state",
    }
    return {
        "message": "MLOps Pipeline Debugger API",
        "version": "1.0.0",
        "docs": "This is an OpenEnv-compatible RL environment",
        "endpoints": endpoint_index,
    }
+
|
| 66 |
+
@app.get("/health")
async def health():
    """Liveness probe: static identity payload for the hosting platform."""
    payload = {"status": "ok", "environment": "mlops_debug_env", "version": "1.0.0"}
    return payload
+
|
| 71 |
+
@app.get("/openenv/state", response_model=OpenEnvState)
def openenv_state():
    """Expose the module-level OpenEnv state object (sync handler — no awaits)."""
    return OPENENV_STATE
+
|
| 76 |
+
@app.get("/tasks")
async def list_tasks():
    """Enumerate the three built-in debugging scenarios.

    Each entry's difficulty equals its task_id (easy/medium/hard); only the
    display name and step budget vary.
    """
    catalogue = [
        ("easy", "Config Error Diagnosis", 20),
        ("medium", "Data Leakage Detection", 30),
        ("hard", "Silent Evaluation Bug", 40),
    ]
    return {
        "tasks": [
            {
                "task_id": tier,
                "name": title,
                "difficulty": tier,
                "max_steps": budget,
            }
            for tier, title, budget in catalogue
        ]
    }
+
|
| 102 |
+
@app.post("/step", response_model=StepResponse)
async def step(request: Request):
    """Apply one agent action to the active HTTP-mode episode.

    Raises 400 when no episode exists, 422 when the action cannot be parsed,
    and 500 when the environment itself fails mid-step.
    """
    if _http_env is None:
        raise HTTPException(400, "Call /reset first.")

    # Read the body leniently — an unparsable body behaves like an empty one.
    try:
        payload = await request.json()
    except Exception:
        payload = {}

    # Clients send actions in three shapes: flat fields, nested under
    # "action", or a bare "message" string used as the action_type.
    parsed: Optional[MLOpsAction] = None
    try:
        if "action_type" in payload:
            parsed = MLOpsAction(**payload)
        elif "action" in payload:
            parsed = MLOpsAction(**payload["action"])
        elif "message" in payload:
            parsed = MLOpsAction(action_type=payload["message"])
    except Exception as exc:
        raise HTTPException(422, f"Invalid action: {str(exc)}")

    if parsed is None or parsed.action_type is None:
        raise HTTPException(422, "Field required: action_type")

    try:
        obs, reward, done, info = _http_env.step(parsed)
        return StepResponse(observation=obs, reward=reward, done=done, info=info)
    except Exception as exc:
        raise HTTPException(500, f"Step error: {str(exc)}")
+
|
| 135 |
+
@app.get("/state", response_model=MLOpsState)
async def state():
    """Return the full internal state of the active HTTP-mode episode (400 if none)."""
    if _http_env is None:
        raise HTTPException(400, "Call /reset first.")
    return _http_env.state
+
|
| 142 |
+
@app.websocket("/ws")
async def ws_endpoint(websocket: WebSocket):
    """JSON-message websocket transport supporting methods "reset", "step", "state".

    Each connection owns a private environment instance, independent of the
    HTTP-mode `_http_env`. The loop runs until the client disconnects.
    """
    await websocket.accept()
    env: Optional[MLOpsEnvironment] = None
    try:
        while True:
            incoming = json.loads(await websocket.receive_text())
            method = incoming.get("method")

            if method == "reset":
                env = MLOpsEnvironment(task_id=incoming.get("task_id", "easy"))
                obs = env.reset(seed=incoming.get("seed"))
                reply = {"method": "reset", "observation": obs.model_dump()}
                await websocket.send_text(json.dumps(reply))

            elif method == "step":
                if env is None:
                    await websocket.send_text(json.dumps({"error": "Call reset first"}))
                    continue
                action = MLOpsAction(**incoming.get("action", {}))
                obs, reward, done, info = env.step(action)
                reply = {
                    "method": "step",
                    "observation": obs.model_dump(),
                    "reward": reward,
                    "done": done,
                    "info": info,
                }
                await websocket.send_text(json.dumps(reply))

            elif method == "state":
                if env is None:
                    await websocket.send_text(json.dumps({"error": "Call reset first"}))
                    continue
                reply = {"method": "state", "state": env.state.model_dump()}
                await websocket.send_text(json.dumps(reply))
    except WebSocketDisconnect:
        pass  # client hung up — nothing to clean up
server/artifact_generator.py
ADDED
|
@@ -0,0 +1,960 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Artifact Generator for MLOps Pipeline Debugger
|
| 3 |
+
|
| 4 |
+
Generates a full set of realistic ML training artifacts for a given bug scenario.
|
| 5 |
+
Each artifact is internally consistent — config matches logs, dataset stats match
|
| 6 |
+
preprocessing code — except for the one planted fault.
|
| 7 |
+
|
| 8 |
+
Bug types supported:
|
| 9 |
+
Task 1 (easy):
|
| 10 |
+
- exploding_lr : learning_rate too large → loss diverges to NaN
|
| 11 |
+
- wrong_optimizer : SGD with momentum=0.99 on non-convex problem
|
| 12 |
+
- batch_size_overflow: batch_size > dataset size → trivial overfitting signal
|
| 13 |
+
|
| 14 |
+
Task 2 (medium):
|
| 15 |
+
- data_leakage_scaler : StandardScaler fit on full dataset before split
|
| 16 |
+
- data_leakage_overlap : train/val split with random_state=None → overlap
|
| 17 |
+
- wrong_split_ratio : test data accidentally included in training
|
| 18 |
+
|
| 19 |
+
Task 3 (hard):
|
| 20 |
+
- label_encoder_mismatch : train/eval use different LabelEncoder.fit() orderings
|
| 21 |
+
- silent_metric_swap : val and test metric names swapped in eval code
|
| 22 |
+
- tokenizer_version_drift: training uses tokenizer v1, eval uses v2 (different vocab)
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
from __future__ import annotations
|
| 26 |
+
|
| 27 |
+
import json
|
| 28 |
+
import random
|
| 29 |
+
import textwrap
|
| 30 |
+
from dataclasses import dataclass, field
|
| 31 |
+
from typing import Dict, Tuple
|
| 32 |
+
|
| 33 |
+
import numpy as np
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# ─── Bug Specifications ───────────────────────────────────────────────────────
|
| 37 |
+
|
| 38 |
+
@dataclass
class BugSpec:
    """Static description of one plantable fault and its ground-truth diagnosis."""
    bug_type: str
    category: str         # expected failure_category in the agent's Action
    file: str             # expected root_cause_file
    field: str            # expected root_cause_field
    gold_fix: str         # reference remediation text
    task_difficulty: str  # one of: easy / medium / hard
+
|
| 48 |
+
# One row per plantable fault:
# (bug_type, category, root_cause_file, root_cause_field, gold_fix, difficulty)
_BUG_ROWS = [
    # ── EASY ──────────────────────────────────────────────────────────────────
    ("exploding_lr", "config_error", "config.yaml", "optimizer.learning_rate",
     "Reduce learning_rate from 50.0 to 1e-4 (or use a scheduler with warmup)", "easy"),
    ("wrong_optimizer", "config_error", "config.yaml", "optimizer.momentum",
     "Reduce momentum from 0.99 to 0.9, or switch to AdamW optimizer", "easy"),
    ("batch_size_overflow", "config_error", "config.yaml", "training.batch_size",
     "Reduce batch_size from 4096 to 32 or 64; current size exceeds training set", "easy"),
    # ── MEDIUM ────────────────────────────────────────────────────────────────
    ("data_leakage_scaler", "data_leakage", "preprocessing.py", "StandardScaler.fit_transform",
     "Fit StandardScaler only on X_train, then call transform() on X_val and X_test separately", "medium"),
    ("data_leakage_overlap", "data_leakage", "preprocessing.py", "train_test_split.random_state",
     "Set random_state=42 in train_test_split to ensure deterministic, non-overlapping splits", "medium"),
    ("wrong_split_ratio", "preprocessing_bug", "preprocessing.py", "train_test_split.test_size",
     "Change test_size from 0.8 to 0.2 — current config trains on 20% and evaluates on 80%", "medium"),
    # ── HARD ──────────────────────────────────────────────────────────────────
    ("label_encoder_mismatch", "label_mismatch", "preprocessing.py", "LabelEncoder.fit_order",
     "Use the same LabelEncoder instance (fitted on training data) for both train and eval pipelines", "hard"),
    ("silent_metric_swap", "evaluation_bug", "eval_results.json", "metrics.val_accuracy",
     "Swap val_accuracy and test_accuracy assignments in the evaluation loop — metrics are mislabeled", "hard"),
    ("tokenizer_version_drift", "evaluation_bug", "preprocessing.py", "tokenizer.version",
     "Ensure training and evaluation both use tokenizer v2 — v1 has a different vocabulary mapping for 847 tokens", "hard"),
]

# Catalogue keyed by bug_type, built from the rows above.
BUG_CATALOGUE: Dict[str, BugSpec] = {
    bug_type: BugSpec(
        bug_type=bug_type,
        category=category,
        file=file,
        field=field,
        gold_fix=gold_fix,
        task_difficulty=difficulty,
    )
    for bug_type, category, file, field, gold_fix, difficulty in _BUG_ROWS
}
+
|
| 128 |
+
# Which faults each difficulty tier may draw from when an episode is seeded.
# Keys mirror the task_id values accepted by the server ("easy"/"medium"/"hard").
TASK_BUG_POOLS = {
    "easy": ["exploding_lr", "wrong_optimizer", "batch_size_overflow"],
    "medium": ["data_leakage_scaler", "data_leakage_overlap", "wrong_split_ratio"],
    "hard": ["label_encoder_mismatch", "silent_metric_swap", "tokenizer_version_drift"],
}
+
|
| 134 |
+
|
| 135 |
+
# ─── Model / Dataset Configs (variety pool) ───────────────────────────────────
|
| 136 |
+
|
| 137 |
+
# Pool of realistic model/dataset pairings; ArtifactGenerator picks one per seed.
MODEL_CONFIGS = [
    {"name": "ResNet-50", "type": "image_classification", "params": "25.6M",
     "dataset": "ImageNet-subset-10k", "num_classes": 10, "input": "224x224 RGB"},
    {"name": "BERT-base-uncased", "type": "text_classification", "params": "110M",
     "dataset": "SST-2", "num_classes": 2, "input": "tokenized sequences, max_len=128"},
    {"name": "EfficientNet-B3", "type": "image_classification", "params": "12.2M",
     "dataset": "CIFAR-100", "num_classes": 100, "input": "300x300 RGB"},
    {"name": "DistilBERT", "type": "sentiment_analysis", "params": "66M",
     "dataset": "IMDB-reviews", "num_classes": 3, "input": "tokenized sequences, max_len=256"},
    {"name": "MobileNetV3-Large", "type": "image_classification", "params": "5.4M",
     "dataset": "Oxford-102-Flowers", "num_classes": 102, "input": "224x224 RGB"},
]

# Optimizer / scheduler names sampled when writing config.yaml artifacts.
OPTIMIZERS = ["AdamW", "SGD", "RMSprop", "Adam"]
SCHEDULERS = ["cosine_annealing", "step_lr", "reduce_on_plateau", "linear_warmup"]
+
|
| 153 |
+
|
| 154 |
+
# ─── Artifact Generators ──────────────────────────────────────────────────────
|
| 155 |
+
|
| 156 |
+
class ArtifactGenerator:
|
| 157 |
+
"""
|
| 158 |
+
Generates all 6 training artifacts for a given (bug_type, seed) pair.
|
| 159 |
+
All artifacts are internally consistent except for the planted fault.
|
| 160 |
+
"""
|
| 161 |
+
|
| 162 |
+
def __init__(self, bug_type: str, seed: int):
|
| 163 |
+
self.bug = BUG_CATALOGUE[bug_type]
|
| 164 |
+
self.seed = seed
|
| 165 |
+
self.rng = random.Random(seed)
|
| 166 |
+
self.np_rng = np.random.RandomState(seed)
|
| 167 |
+
|
| 168 |
+
# Pick a model config deterministically
|
| 169 |
+
self.model_cfg = self.rng.choice(MODEL_CONFIGS)
|
| 170 |
+
self.optimizer_name = self.rng.choice(OPTIMIZERS)
|
| 171 |
+
self.scheduler_name = self.rng.choice(SCHEDULERS)
|
| 172 |
+
self.run_id = f"run_{seed:04d}_{bug_type[:6]}"
|
| 173 |
+
|
| 174 |
+
# Normal hyperparams
|
| 175 |
+
self.lr = self.rng.choice([1e-5, 3e-5, 1e-4, 3e-4])
|
| 176 |
+
self.batch_size = self.rng.choice([16, 32, 64])
|
| 177 |
+
self.epochs = self.rng.randint(8, 20)
|
| 178 |
+
self.weight_decay = self.rng.choice([0.01, 0.001, 1e-4])
|
| 179 |
+
self.momentum = 0.9
|
| 180 |
+
self.train_samples = self.rng.randint(8000, 15000)
|
| 181 |
+
self.val_samples = int(self.train_samples * 0.2)
|
| 182 |
+
self.test_samples = int(self.train_samples * 0.15)
|
| 183 |
+
|
| 184 |
+
def generate_all(self) -> Dict[str, str]:
|
| 185 |
+
return {
|
| 186 |
+
"config.yaml": self._gen_config(),
|
| 187 |
+
"train.log": self._gen_train_log(),
|
| 188 |
+
"dataset_stats.json": self._gen_dataset_stats(),
|
| 189 |
+
"preprocessing.py": self._gen_preprocessing(),
|
| 190 |
+
"eval_results.json": self._gen_eval_results(),
|
| 191 |
+
"model_card.json": self._gen_model_card(),
|
| 192 |
+
}
|
| 193 |
+
|
| 194 |
+
# ── config.yaml ──────────────────────────────────────────────────────────
|
| 195 |
+
|
| 196 |
+
def _gen_config(self) -> str:
|
| 197 |
+
lr = self.lr
|
| 198 |
+
batch_size = self.batch_size
|
| 199 |
+
momentum = self.momentum
|
| 200 |
+
|
| 201 |
+
if self.bug.bug_type == "exploding_lr":
|
| 202 |
+
lr = self.rng.choice([50.0, 10.0, 25.0])
|
| 203 |
+
elif self.bug.bug_type == "wrong_optimizer":
|
| 204 |
+
momentum = 0.99
|
| 205 |
+
self.optimizer_name = "SGD"
|
| 206 |
+
elif self.bug.bug_type == "batch_size_overflow":
|
| 207 |
+
batch_size = self.rng.choice([2048, 4096, 8192])
|
| 208 |
+
|
| 209 |
+
return textwrap.dedent(f"""\
|
| 210 |
+
# Training Configuration
|
| 211 |
+
# Run ID: {self.run_id}
|
| 212 |
+
# Generated: 2024-03-{self.rng.randint(1,28):02d}T{self.rng.randint(0,23):02d}:{self.rng.randint(0,59):02d}:00Z
|
| 213 |
+
|
| 214 |
+
model:
|
| 215 |
+
architecture: {self.model_cfg['name']}
|
| 216 |
+
num_classes: {self.model_cfg['num_classes']}
|
| 217 |
+
pretrained: true
|
| 218 |
+
pretrained_source: "timm/torchvision"
|
| 219 |
+
dropout: {self.rng.choice([0.1, 0.2, 0.3])}
|
| 220 |
+
freeze_backbone_epochs: {self.rng.randint(0, 3)}
|
| 221 |
+
|
| 222 |
+
training:
|
| 223 |
+
epochs: {self.epochs}
|
| 224 |
+
batch_size: {batch_size}
|
| 225 |
+
num_workers: {self.rng.choice([4, 8])}
|
| 226 |
+
pin_memory: true
|
| 227 |
+
mixed_precision: {str(self.rng.choice([True, False])).lower()}
|
| 228 |
+
gradient_clip_norm: {self.rng.choice([1.0, 5.0, "null"])}
|
| 229 |
+
early_stopping_patience: {self.rng.randint(3, 7)}
|
| 230 |
+
seed: {self.seed}
|
| 231 |
+
|
| 232 |
+
optimizer:
|
| 233 |
+
name: {self.optimizer_name}
|
| 234 |
+
learning_rate: {lr}
|
| 235 |
+
weight_decay: {self.weight_decay}
|
| 236 |
+
momentum: {momentum}
|
| 237 |
+
betas: [0.9, 0.999]
|
| 238 |
+
|
| 239 |
+
scheduler:
|
| 240 |
+
name: {self.scheduler_name}
|
| 241 |
+
warmup_epochs: {self.rng.randint(0, 3)}
|
| 242 |
+
min_lr: 1.0e-7
|
| 243 |
+
t_max: {self.epochs}
|
| 244 |
+
|
| 245 |
+
data:
|
| 246 |
+
dataset: {self.model_cfg['dataset']}
|
| 247 |
+
input_size: "{self.model_cfg['input']}"
|
| 248 |
+
train_split: 0.8
|
| 249 |
+
val_split: 0.1
|
| 250 |
+
test_split: 0.1
|
| 251 |
+
augmentation:
|
| 252 |
+
random_crop: true
|
| 253 |
+
horizontal_flip: {str(self.rng.choice([True, False])).lower()}
|
| 254 |
+
color_jitter: {self.rng.choice([0.2, 0.4])}
|
| 255 |
+
normalize_mean: [0.485, 0.456, 0.406]
|
| 256 |
+
normalize_std: [0.229, 0.224, 0.225]
|
| 257 |
+
|
| 258 |
+
logging:
|
| 259 |
+
log_interval: 50
|
| 260 |
+
save_best_only: true
|
| 261 |
+
checkpoint_dir: "./checkpoints/{self.run_id}"
|
| 262 |
+
wandb_project: "mlops-debug-bench"
|
| 263 |
+
""")
|
| 264 |
+
|
| 265 |
+
# ── train.log ────────────────────────────────────────────────────────────
|
| 266 |
+
|
| 267 |
+
def _gen_train_log(self) -> str:
|
| 268 |
+
lines = []
|
| 269 |
+
lines.append(f"[INFO 2024-03-{self.rng.randint(1,28):02d} {self.rng.randint(6,10):02d}:00:00] Starting training run: {self.run_id}")
|
| 270 |
+
lines.append(f"[INFO ] Model: {self.model_cfg['name']} | Params: {self.model_cfg['params']}")
|
| 271 |
+
lines.append(f"[INFO ] Dataset: {self.model_cfg['dataset']} | Train: {self.train_samples:,} | Val: {self.val_samples:,}")
|
| 272 |
+
lines.append(f"[INFO ] Device: cuda:0 | Mixed precision: fp16")
|
| 273 |
+
lines.append(f"[INFO ] Optimizer: {self.optimizer_name} | LR: {self.lr} | Batch: {self.batch_size}")
|
| 274 |
+
lines.append("[INFO ] ─" * 30)
|
| 275 |
+
|
| 276 |
+
bug = self.bug.bug_type
|
| 277 |
+
|
| 278 |
+
if bug == "exploding_lr":
|
| 279 |
+
# Loss explodes rapidly
|
| 280 |
+
loss = 2.302
|
| 281 |
+
for ep in range(1, min(self.epochs + 1, 6)):
|
| 282 |
+
acc = max(0.0, 0.12 - ep * 0.02)
|
| 283 |
+
val_loss = loss * self.rng.uniform(1.1, 1.5)
|
| 284 |
+
val_acc = max(0.0, acc - 0.05)
|
| 285 |
+
lines.append(f"[EPOCH {ep:03d}] train_loss={loss:.4f} train_acc={acc:.4f} "
|
| 286 |
+
f"val_loss={val_loss:.4f} val_acc={val_acc:.4f} "
|
| 287 |
+
f"lr={self.lr:.2e} grad_norm={loss * 18.3:.2f} "
|
| 288 |
+
f"time={self.rng.randint(45,90)}s")
|
| 289 |
+
if ep == 1: lines.append(f"[WARN ] Gradient norm unusually high: {loss * 18.3:.2f} (threshold: 10.0)")
|
| 290 |
+
loss = loss * self.rng.uniform(4.5, 9.0)
|
| 291 |
+
if loss > 1e6:
|
| 292 |
+
lines.append(f"[EPOCH {ep+1:03d}] train_loss=nan train_acc=0.1000 val_loss=nan val_acc=0.1000 "
|
| 293 |
+
f"lr={self.lr:.2e} grad_norm=nan time={self.rng.randint(45,90)}s")
|
| 294 |
+
lines.append(f"[ERROR ] Loss is NaN at epoch {ep+1}, step {self.rng.randint(100,300)}. Training halted.")
|
| 295 |
+
lines.append(f"[ERROR ] Last finite loss: {loss / self.rng.uniform(4,9):.2f}. Gradient explosion detected.")
|
| 296 |
+
break
|
| 297 |
+
|
| 298 |
+
elif bug == "wrong_optimizer":
|
| 299 |
+
# Loss oscillates wildly, never converges
|
| 300 |
+
loss = 2.302
|
| 301 |
+
for ep in range(1, self.epochs + 1):
|
| 302 |
+
delta = self.rng.uniform(-0.8, 1.2)
|
| 303 |
+
loss = max(1.8, loss + delta)
|
| 304 |
+
acc = self.rng.uniform(0.10, 0.25)
|
| 305 |
+
val_loss = loss + self.rng.uniform(-0.3, 0.8)
|
| 306 |
+
val_acc = self.rng.uniform(0.09, 0.22)
|
| 307 |
+
lines.append(f"[EPOCH {ep:03d}] train_loss={loss:.4f} train_acc={acc:.4f} "
|
| 308 |
+
f"val_loss={val_loss:.4f} val_acc={val_acc:.4f} "
|
| 309 |
+
f"lr={self.lr:.2e} grad_norm={self.rng.uniform(8.2, 45.1):.2f} "
|
| 310 |
+
f"time={self.rng.randint(45,90)}s")
|
| 311 |
+
if ep % 3 == 0:
|
| 312 |
+
lines.append(f"[WARN ] Loss oscillation detected over last 3 epochs: {loss+0.4:.3f} → {loss-0.5:.3f} → {loss:.3f}")
|
| 313 |
+
|
| 314 |
+
elif bug == "batch_size_overflow":
|
| 315 |
+
# Val accuracy hits 100% immediately — model memorizes tiny effective dataset
|
| 316 |
+
for ep in range(1, self.epochs + 1):
|
| 317 |
+
train_loss = max(0.001, 2.302 * (0.05 ** ep))
|
| 318 |
+
train_acc = min(1.0, 0.3 + ep * 0.09)
|
| 319 |
+
val_acc = 0.999 if ep >= 2 else 0.85
|
| 320 |
+
val_loss = 0.001 if ep >= 2 else 0.45
|
| 321 |
+
lines.append(f"[EPOCH {ep:03d}] train_loss={train_loss:.4f} train_acc={train_acc:.4f} "
|
| 322 |
+
f"val_loss={val_loss:.4f} val_acc={val_acc:.4f} "
|
| 323 |
+
f"lr={self.lr:.2e} grad_norm={self.rng.uniform(0.1,0.9):.3f} "
|
| 324 |
+
f"time={self.rng.randint(3,8)}s")
|
| 325 |
+
lines.append(f"[WARN ] Effective steps per epoch: {max(1, self.train_samples // 4096)}. Dataset may be smaller than batch size.")
|
| 326 |
+
|
| 327 |
+
elif bug in ("data_leakage_scaler", "data_leakage_overlap", "wrong_split_ratio"):
|
| 328 |
+
# Val accuracy suspiciously high from epoch 1
|
| 329 |
+
for ep in range(1, self.epochs + 1):
|
| 330 |
+
train_loss = max(0.01, 0.45 - ep * 0.02)
|
| 331 |
+
train_acc = min(0.98, 0.72 + ep * 0.015)
|
| 332 |
+
val_acc = min(0.999, 0.984 + self.rng.uniform(-0.002, 0.002)) if ep >= 1 else 0.71
|
| 333 |
+
val_loss = max(0.001, 0.04 - ep * 0.001)
|
| 334 |
+
lines.append(f"[EPOCH {ep:03d}] train_loss={train_loss:.4f} train_acc={train_acc:.4f} "
|
| 335 |
+
f"val_loss={val_loss:.4f} val_acc={val_acc:.4f} "
|
| 336 |
+
f"lr={self.lr:.2e} grad_norm={self.rng.uniform(0.1,1.2):.3f} "
|
| 337 |
+
f"time={self.rng.randint(45,90)}s")
|
| 338 |
+
lines.append(f"[INFO ] Best model saved at epoch 2: val_acc=0.9841")
|
| 339 |
+
lines.append(f"[WARN ] Val accuracy reached 98.4% at epoch 1 — verify no data leakage.")
|
| 340 |
+
|
| 341 |
+
elif bug in ("label_encoder_mismatch", "silent_metric_swap", "tokenizer_version_drift"):
|
| 342 |
+
# Training looks completely normal — the bug is silent
|
| 343 |
+
best_val = 0.0
|
| 344 |
+
for ep in range(1, self.epochs + 1):
|
| 345 |
+
train_loss = max(0.08, 1.8 * (0.72 ** ep) + self.rng.uniform(-0.02, 0.02))
|
| 346 |
+
train_acc = min(0.96, 0.42 + ep * 0.032 + self.rng.uniform(-0.01, 0.01))
|
| 347 |
+
val_loss = train_loss * self.rng.uniform(1.05, 1.15)
|
| 348 |
+
val_acc = train_acc - self.rng.uniform(0.02, 0.06)
|
| 349 |
+
best_val = max(best_val, val_acc)
|
| 350 |
+
lines.append(f"[EPOCH {ep:03d}] train_loss={train_loss:.4f} train_acc={train_acc:.4f} "
|
| 351 |
+
f"val_loss={val_loss:.4f} val_acc={val_acc:.4f} "
|
| 352 |
+
f"lr={self.lr:.2e} grad_norm={self.rng.uniform(0.3, 2.1):.3f} "
|
| 353 |
+
f"time={self.rng.randint(60,120)}s")
|
| 354 |
+
lines.append(f"[INFO ] Training complete. Best val_acc={best_val:.4f} at epoch {self.rng.randint(self.epochs-3, self.epochs)}")
|
| 355 |
+
lines.append(f"[INFO ] Checkpoint saved: ./checkpoints/{self.run_id}/best_model.pt")
|
| 356 |
+
|
| 357 |
+
lines.append("[INFO ] ─" * 30)
|
| 358 |
+
lines.append(f"[INFO ] Run {self.run_id} finished.")
|
| 359 |
+
return "\n".join(lines)
|
| 360 |
+
|
| 361 |
+
# ── dataset_stats.json ───────────────────────────────────────────────────
|
| 362 |
+
|
| 363 |
+
def _gen_dataset_stats(self) -> str:
|
| 364 |
+
n_classes = self.model_cfg["num_classes"]
|
| 365 |
+
train_n = self.train_samples
|
| 366 |
+
val_n = self.val_samples
|
| 367 |
+
test_n = self.test_samples
|
| 368 |
+
|
| 369 |
+
overlap_count = 0
|
| 370 |
+
if self.bug.bug_type == "data_leakage_overlap":
|
| 371 |
+
overlap_count = self.rng.randint(int(val_n * 0.15), int(val_n * 0.30))
|
| 372 |
+
elif self.bug.bug_type == "wrong_split_ratio":
|
| 373 |
+
# Train and test flipped
|
| 374 |
+
train_n, test_n = test_n, train_n
|
| 375 |
+
|
| 376 |
+
# Class distribution (roughly uniform with jitter)
|
| 377 |
+
def class_dist(total, n_cls):
|
| 378 |
+
base = total // n_cls
|
| 379 |
+
counts = {str(i): base + self.rng.randint(-int(base*0.15), int(base*0.15))
|
| 380 |
+
for i in range(min(n_cls, 10))}
|
| 381 |
+
if n_cls > 10:
|
| 382 |
+
counts["..."] = f"{n_cls - 10} more classes"
|
| 383 |
+
return counts
|
| 384 |
+
|
| 385 |
+
stats = {
|
| 386 |
+
"dataset": self.model_cfg["dataset"],
|
| 387 |
+
"num_classes": n_classes,
|
| 388 |
+
"splits": {
|
| 389 |
+
"train": {
|
| 390 |
+
"n_samples": train_n,
|
| 391 |
+
"class_distribution": class_dist(train_n, n_classes),
|
| 392 |
+
},
|
| 393 |
+
"val": {
|
| 394 |
+
"n_samples": val_n,
|
| 395 |
+
"class_distribution": class_dist(val_n, n_classes),
|
| 396 |
+
"overlap_with_train": overlap_count,
|
| 397 |
+
},
|
| 398 |
+
"test": {
|
| 399 |
+
"n_samples": test_n,
|
| 400 |
+
"class_distribution": class_dist(test_n, n_classes),
|
| 401 |
+
},
|
| 402 |
+
},
|
| 403 |
+
"feature_statistics": {
|
| 404 |
+
"mean": round(self.np_rng.uniform(0.45, 0.55), 4),
|
| 405 |
+
"std": round(self.np_rng.uniform(0.22, 0.28), 4),
|
| 406 |
+
"min": 0.0,
|
| 407 |
+
"max": 1.0,
|
| 408 |
+
"null_count": 0,
|
| 409 |
+
},
|
| 410 |
+
"preprocessing_applied": [
|
| 411 |
+
"resize",
|
| 412 |
+
"normalize",
|
| 413 |
+
"label_encode",
|
| 414 |
+
"train_val_test_split",
|
| 415 |
+
],
|
| 416 |
+
"random_seed_used": self.seed if self.bug.bug_type != "data_leakage_overlap" else None,
|
| 417 |
+
}
|
| 418 |
+
return json.dumps(stats, indent=2)
|
| 419 |
+
|
| 420 |
+
# ── preprocessing.py ─────────────────────────────────────────────────────
|
| 421 |
+
|
| 422 |
+
def _gen_preprocessing(self) -> str:
    """Generate the preprocessing.py artifact for this run.

    Returns Python source text as a string. For the data/eval-pipeline
    bug types, the injected defect is embedded directly in the emitted
    code (marked with "BUG" banner comments); for all other bug types a
    clean, correct pipeline is emitted. The banner comments inside the
    templates are part of the generated artifact text, not of this file.
    """
    bug = self.bug.bug_type

    if bug == "data_leakage_scaler":
        # Defect: StandardScaler is fit on the FULL dataset before the
        # split, so val/test statistics leak into normalization.
        return textwrap.dedent(f"""\
            \"\"\"
            Data preprocessing pipeline for {self.model_cfg['dataset']}
            Run ID: {self.run_id}
            \"\"\"
            import numpy as np
            import pandas as pd
            from sklearn.preprocessing import StandardScaler, LabelEncoder
            from sklearn.model_selection import train_test_split


            def load_raw_data(data_dir: str):
                \"\"\"Load features and labels from disk.\"\"\"
                X = np.load(f"{{data_dir}}/features.npy")
                y = np.load(f"{{data_dir}}/labels.npy")
                return X, y


            def preprocess(data_dir: str, seed: int = {self.seed}):
                X, y = load_raw_data(data_dir)

                # Encode labels
                le = LabelEncoder()
                y_encoded = le.fit_transform(y)

                # ── BUG: Scaler fit on full dataset BEFORE split ──────────
                scaler = StandardScaler()
                X_normalized = scaler.fit_transform(X)  # sees val/test data during fit!
                # ─────────────────────────────────────────────────────────

                X_train, X_temp, y_train, y_temp = train_test_split(
                    X_normalized, y_encoded, test_size=0.2, random_state=seed
                )
                X_val, X_test, y_val, y_test = train_test_split(
                    X_temp, y_temp, test_size=0.5, random_state=seed
                )

                return (X_train, y_train), (X_val, y_val), (X_test, y_test), scaler, le


            def get_transforms(split: str):
                \"\"\"Get augmentation transforms for a given split.\"\"\"
                if split == "train":
                    return [
                        ("random_horizontal_flip", {{"p": 0.5}}),
                        ("random_crop", {{"size": 224, "padding": 4}}),
                        ("color_jitter", {{"brightness": 0.2, "contrast": 0.2}}),
                        ("normalize", {{"mean": [0.485, 0.456, 0.406],
                                        "std": [0.229, 0.224, 0.225]}}),
                    ]
                return [
                    ("center_crop", {{"size": 224}}),
                    ("normalize", {{"mean": [0.485, 0.456, 0.406],
                                    "std": [0.229, 0.224, 0.225]}}),
                ]
            """)

    elif bug == "data_leakage_overlap":
        # Defect: both splits use random_state=None, so repeated runs are
        # non-reproducible and splits can overlap between runs.
        return textwrap.dedent(f"""\
            \"\"\"
            Data preprocessing pipeline for {self.model_cfg['dataset']}
            Run ID: {self.run_id}
            \"\"\"
            import numpy as np
            from sklearn.preprocessing import StandardScaler, LabelEncoder
            from sklearn.model_selection import train_test_split


            def load_raw_data(data_dir: str):
                X = np.load(f"{{data_dir}}/features.npy")
                y = np.load(f"{{data_dir}}/labels.npy")
                return X, y


            def preprocess(data_dir: str):
                X, y = load_raw_data(data_dir)

                le = LabelEncoder()
                y_encoded = le.fit_transform(y)

                # First split: train vs temp
                # ── BUG: random_state=None → non-reproducible, overlapping splits ──
                X_train, X_temp, y_train, y_temp = train_test_split(
                    X, y_encoded, test_size=0.2, random_state=None  # ← should be fixed seed
                )
                # Second split: val vs test (ALSO non-deterministic)
                X_val, X_test, y_val, y_test = train_test_split(
                    X_temp, y_temp, test_size=0.5, random_state=None  # ← should be fixed seed
                )
                # ─────────────────────────────────────────────────────────

                scaler = StandardScaler()
                X_train = scaler.fit_transform(X_train)
                X_val = scaler.transform(X_val)
                X_test = scaler.transform(X_test)

                return (X_train, y_train), (X_val, y_val), (X_test, y_test), scaler, le
            """)

    elif bug == "wrong_split_ratio":
        # Defect: test_size=0.8 — the model trains on 20% of the data.
        return textwrap.dedent(f"""\
            \"\"\"
            Data preprocessing pipeline for {self.model_cfg['dataset']}
            Run ID: {self.run_id}
            \"\"\"
            import numpy as np
            from sklearn.preprocessing import StandardScaler, LabelEncoder
            from sklearn.model_selection import train_test_split


            def preprocess(data_dir: str, seed: int = {self.seed}):
                X = np.load(f"{{data_dir}}/features.npy")
                y = np.load(f"{{data_dir}}/labels.npy")

                le = LabelEncoder()
                y_encoded = le.fit_transform(y)

                # ── BUG: test_size=0.8 — trains on 20%, evaluates on 80% ──
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y_encoded, test_size=0.8, random_state=seed  # ← should be 0.2
                )
                X_val, X_test, y_val, y_test = train_test_split(
                    X_test, y_test, test_size=0.5, random_state=seed
                )
                # ──────────────────────────────────────────────────────────

                scaler = StandardScaler()
                X_train = scaler.fit_transform(X_train)
                X_val = scaler.transform(X_val)
                X_test = scaler.transform(X_test)

                return (X_train, y_train), (X_val, y_val), (X_test, y_test), scaler, le
            """)

    elif bug == "label_encoder_mismatch":
        # Defect: train and eval pipelines fit separate LabelEncoders in
        # different class orders, so predicted indices are misaligned.
        classes = ["cat", "dog", "bird"] if self.model_cfg["num_classes"] <= 10 else \
                  [f"class_{i}" for i in range(min(self.model_cfg["num_classes"], 5))]
        classes_shuffled = classes.copy()
        self.rng.shuffle(classes_shuffled)  # interpolated into the eval pipeline below
        return textwrap.dedent(f"""\
            \"\"\"
            Data preprocessing pipeline for {self.model_cfg['dataset']}
            Run ID: {self.run_id}

            WARNING: Training and evaluation pipelines are defined separately.
            Ensure they use identical label encoding.
            \"\"\"
            import numpy as np
            from sklearn.preprocessing import LabelEncoder
            from sklearn.model_selection import train_test_split


            # ── Training pipeline ─────────────────────────────────────────
            def build_train_pipeline(data_dir: str, seed: int = {self.seed}):
                X = np.load(f"{{data_dir}}/train_features.npy")
                y_raw = np.load(f"{{data_dir}}/train_labels.npy", allow_pickle=True)

                # LabelEncoder fitted on training class order
                le_train = LabelEncoder()
                le_train.fit({classes})  # alphabetical order: {sorted(classes)}
                y = le_train.transform(y_raw)

                X_train, X_val, y_train, y_val = train_test_split(
                    X, y, test_size=0.2, random_state=seed
                )
                return (X_train, y_train), (X_val, y_val), le_train


            # ── Evaluation pipeline ───────────────────────────────────────
            def build_eval_pipeline(data_dir: str):
                X_test = np.load(f"{{data_dir}}/test_features.npy")
                y_raw = np.load(f"{{data_dir}}/test_labels.npy", allow_pickle=True)

                # ── BUG: Different LabelEncoder instance with DIFFERENT fit order ──
                le_eval = LabelEncoder()
                le_eval.fit({classes_shuffled})  # ← shuffled order: {classes_shuffled}
                y_test = le_eval.transform(y_raw)
                # ─────────────────────────────────────────────────────────

                return X_test, y_test, le_eval
            """)

    elif bug == "silent_metric_swap":
        # Defect (in the generated eval script): val/test accuracy keys swapped.
        # NOTE(review): val_acc/test_acc below are never interpolated into the
        # template — they only advance self.rng. Confirm this is intentional.
        val_acc = round(self.rng.uniform(0.84, 0.91), 4)
        test_acc = round(self.rng.uniform(0.31, 0.39), 4)
        return textwrap.dedent(f"""\
            \"\"\"
            Evaluation script for {self.model_cfg['dataset']}
            Run ID: {self.run_id}
            \"\"\"
            import torch
            import json


            def evaluate(model, val_loader, test_loader, device="cuda"):
                model.eval()
                results = {{}}

                with torch.no_grad():
                    # Evaluate on validation set
                    val_correct, val_total = 0, 0
                    for X, y in val_loader:
                        preds = model(X.to(device)).argmax(dim=1)
                        val_correct += (preds == y.to(device)).sum().item()
                        val_total += y.size(0)
                    val_acc = val_correct / val_total

                    # Evaluate on test set
                    test_correct, test_total = 0, 0
                    for X, y in test_loader:
                        preds = model(X.to(device)).argmax(dim=1)
                        test_correct += (preds == y.to(device)).sum().item()
                        test_total += y.size(0)
                    test_acc = test_correct / test_total

                # ── BUG: val and test accuracy assignments are swapped ──
                results["val_accuracy"] = test_acc   # ← should be val_acc
                results["test_accuracy"] = val_acc   # ← should be test_acc
                # ──────────────────────────────────────────────────────

                results["val_loss"] = round(1 - val_acc + 0.12, 4)
                results["test_loss"] = round(1 - test_acc + 0.09, 4)
                return results
            """)

    elif bug == "tokenizer_version_drift":
        # Defect: training uses tokenizer v2, evaluation still loads v1.
        return textwrap.dedent(f"""\
            \"\"\"
            Text preprocessing pipeline for {self.model_cfg['dataset']}
            Run ID: {self.run_id}
            \"\"\"
            from transformers import AutoTokenizer


            TOKENIZER_V1 = "bert-base-uncased"           # vocab size: 30,522
            TOKENIZER_V2 = "bert-base-uncased-v2-fixed"  # vocab size: 30,522 + 847 domain tokens


            # ── Training pipeline ─────────────────────────────────────────
            def get_train_tokenizer():
                \"\"\"Tokenizer used during training.\"\"\"
                # Updated to v2 for domain-specific vocabulary
                tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_V2)
                return tokenizer


            # ── Evaluation pipeline ───────────────────────────────────────
            def get_eval_tokenizer():
                \"\"\"Tokenizer used during evaluation and inference.\"\"\"
                # ── BUG: Still using v1 — 847 tokens map to [UNK] during eval ──
                tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_V1)  # ← should be TOKENIZER_V2
                return tokenizer
            # ─────────────────────────────────────────────────────────


            def tokenize_batch(texts, tokenizer, max_length: int = 128):
                return tokenizer(
                    texts,
                    padding="max_length",
                    truncation=True,
                    max_length=max_length,
                    return_tensors="pt",
                )
            """)

    else:
        # Default normal preprocessing (for config-error bugs, preprocessing is clean)
        return textwrap.dedent(f"""\
            \"\"\"
            Data preprocessing pipeline for {self.model_cfg['dataset']}
            Run ID: {self.run_id}
            \"\"\"
            import numpy as np
            from sklearn.preprocessing import StandardScaler, LabelEncoder
            from sklearn.model_selection import train_test_split


            def preprocess(data_dir: str, seed: int = {self.seed}):
                X = np.load(f"{{data_dir}}/features.npy")
                y = np.load(f"{{data_dir}}/labels.npy")

                le = LabelEncoder()
                y_encoded = le.fit_transform(y)

                X_train, X_temp, y_train, y_temp = train_test_split(
                    X, y_encoded, test_size=0.2, random_state=seed
                )
                X_val, X_test, y_val, y_test = train_test_split(
                    X_temp, y_temp, test_size=0.5, random_state=seed
                )

                # Correct: fit only on training data
                scaler = StandardScaler()
                X_train = scaler.fit_transform(X_train)
                X_val = scaler.transform(X_val)
                X_test = scaler.transform(X_test)

                return (X_train, y_train), (X_val, y_val), (X_test, y_test), scaler, le
            """)
|
| 725 |
+
|
| 726 |
+
# ── eval_results.json ────────────────────────────────────────────────────
|
| 727 |
+
|
| 728 |
+
def _gen_eval_results(self) -> str:
|
| 729 |
+
bug = self.bug.bug_type
|
| 730 |
+
|
| 731 |
+
if bug in ("exploding_lr", "wrong_optimizer"):
|
| 732 |
+
val_acc = round(self.rng.uniform(0.09, 0.13), 4)
|
| 733 |
+
test_acc = round(self.rng.uniform(0.09, 0.13), 4)
|
| 734 |
+
val_loss = 999999.9 if bug == "exploding_lr" else round(self.rng.uniform(2.1, 2.4), 4)
|
| 735 |
+
test_loss = val_loss
|
| 736 |
+
elif bug == "batch_size_overflow":
|
| 737 |
+
val_acc = 0.9990
|
| 738 |
+
test_acc = round(self.rng.uniform(0.11, 0.15), 4) # massive train/test gap
|
| 739 |
+
val_loss, test_loss = 0.0003, round(self.rng.uniform(1.8, 2.3), 4)
|
| 740 |
+
elif bug in ("data_leakage_scaler", "data_leakage_overlap", "wrong_split_ratio"):
|
| 741 |
+
val_acc = round(self.rng.uniform(0.982, 0.998), 4)
|
| 742 |
+
test_acc = round(self.rng.uniform(0.61, 0.73), 4) # test is much worse (no leakage)
|
| 743 |
+
val_loss = round(self.rng.uniform(0.004, 0.015), 4)
|
| 744 |
+
test_loss = round(self.rng.uniform(0.42, 0.68), 4)
|
| 745 |
+
elif bug == "label_encoder_mismatch":
|
| 746 |
+
val_acc = round(self.rng.uniform(0.84, 0.91), 4)
|
| 747 |
+
test_acc = round(self.rng.uniform(0.30, 0.38), 4) # near random for 3-class
|
| 748 |
+
val_loss = round(1 - val_acc + self.rng.uniform(0.05, 0.15), 4)
|
| 749 |
+
test_loss = round(1 - test_acc + self.rng.uniform(0.05, 0.15), 4)
|
| 750 |
+
elif bug == "silent_metric_swap":
|
| 751 |
+
real_val = round(self.rng.uniform(0.84, 0.91), 4)
|
| 752 |
+
real_test = round(self.rng.uniform(0.31, 0.39), 4)
|
| 753 |
+
# Swapped in output
|
| 754 |
+
val_acc = real_test
|
| 755 |
+
test_acc = real_val
|
| 756 |
+
val_loss = round(1 - real_test + 0.09, 4)
|
| 757 |
+
test_loss = round(1 - real_val + 0.12, 4)
|
| 758 |
+
elif bug == "tokenizer_version_drift":
|
| 759 |
+
val_acc = round(self.rng.uniform(0.83, 0.88), 4)
|
| 760 |
+
test_acc = round(self.rng.uniform(0.28, 0.36), 4)
|
| 761 |
+
val_loss = round(1 - val_acc + self.rng.uniform(0.05, 0.12), 4)
|
| 762 |
+
test_loss = round(1 - test_acc + self.rng.uniform(0.05, 0.12), 4)
|
| 763 |
+
else:
|
| 764 |
+
val_acc = round(self.rng.uniform(0.78, 0.91), 4)
|
| 765 |
+
test_acc = round(val_acc - self.rng.uniform(0.02, 0.05), 4)
|
| 766 |
+
val_loss = round(1 - val_acc + 0.1, 4)
|
| 767 |
+
test_loss = round(1 - test_acc + 0.1, 4)
|
| 768 |
+
|
| 769 |
+
result = {
|
| 770 |
+
"run_id": self.run_id,
|
| 771 |
+
"final_epoch": self.epochs if bug not in ("exploding_lr",) else self.rng.randint(2,5),
|
| 772 |
+
"metrics": {
|
| 773 |
+
"val_loss": val_loss,
|
| 774 |
+
"val_accuracy": val_acc,
|
| 775 |
+
"test_loss": test_loss,
|
| 776 |
+
"test_accuracy": test_acc,
|
| 777 |
+
},
|
| 778 |
+
"best_checkpoint": f"./checkpoints/{self.run_id}/best_model.pt",
|
| 779 |
+
"evaluation_timestamp": f"2024-03-{self.rng.randint(1,28):02d}T{self.rng.randint(10,22):02d}:{self.rng.randint(0,59):02d}:00Z",
|
| 780 |
+
"hardware": {"gpu": "A100-40GB", "cuda": "12.1"},
|
| 781 |
+
}
|
| 782 |
+
return json.dumps(result, indent=2)
|
| 783 |
+
|
| 784 |
+
# ── model_card.json ──────────────────────────────────────────────────────
|
| 785 |
+
|
| 786 |
+
def _gen_model_card(self) -> str:
|
| 787 |
+
bug = self.bug.bug_type
|
| 788 |
+
tokenizer_ver = "v1" if bug == "tokenizer_version_drift" else "v2"
|
| 789 |
+
|
| 790 |
+
card = {
|
| 791 |
+
"model_id": f"{self.run_id}",
|
| 792 |
+
"architecture": self.model_cfg["name"],
|
| 793 |
+
"task": self.model_cfg["type"],
|
| 794 |
+
"num_parameters": self.model_cfg["params"],
|
| 795 |
+
"dataset": self.model_cfg["dataset"],
|
| 796 |
+
"num_classes": self.model_cfg["num_classes"],
|
| 797 |
+
"framework": "PyTorch 2.2.0",
|
| 798 |
+
"training_config": {
|
| 799 |
+
"optimizer": self.optimizer_name,
|
| 800 |
+
"scheduler": self.scheduler_name,
|
| 801 |
+
"epochs": self.epochs,
|
| 802 |
+
},
|
| 803 |
+
"preprocessing": {
|
| 804 |
+
"label_encoder": "sklearn.LabelEncoder",
|
| 805 |
+
"tokenizer": tokenizer_ver if "bert" in self.model_cfg["name"].lower() else "N/A",
|
| 806 |
+
"normalizer": "StandardScaler (fit on training split)",
|
| 807 |
+
},
|
| 808 |
+
"authors": ["ml-platform-team"],
|
| 809 |
+
"license": "Apache-2.0",
|
| 810 |
+
}
|
| 811 |
+
return json.dumps(card, indent=2)
|
| 812 |
+
|
| 813 |
+
|
| 814 |
+
# ─── Sanity Check Engine ──────────────────────────────────────────────────────
|
| 815 |
+
|
| 816 |
+
def run_sanity_check(check_type: str, bug_type: str, artifacts: Dict[str, str],
                     rng: random.Random) -> Dict:
    """
    Runs a named diagnostic check and returns computed results.
    Results are grounded in the generated artifacts — not random.

    Args:
        check_type: Name of the diagnostic to run (e.g. "label_consistency",
            "data_leakage", "gradient_norms", ...). Unknown names yield a
            result of "UNKNOWN" rather than raising.
        bug_type: The injected bug's identifier; decides which branch
            reports a failure/anomaly.
        artifacts: Generated run artifacts. NOTE(review): not read by any
            branch below — confirm whether it is reserved for future checks.
        rng: Seeded RNG used to produce plausible numeric details.

    Returns:
        A dict with at least "check", "result" and "details" keys; extra
        keys vary per check type.
    """
    # NOTE(review): `bug` is unused below; the lookup's only effect is to
    # raise KeyError for an unknown bug_type — confirm that is intended.
    bug = BUG_CATALOGUE[bug_type]

    if check_type == "label_consistency":
        # Fails only for the label-encoder mismatch scenario.
        if bug_type == "label_encoder_mismatch":
            return {
                "check": "label_consistency",
                "result": "FAIL",
                "details": "Training LabelEncoder class order: ['bird', 'cat', 'dog'] (index 0=bird, 1=cat, 2=dog). "
                           "Evaluation LabelEncoder class order: ['cat', 'dog', 'bird'] (index 0=cat, 1=dog, 2=bird). "
                           "Mismatch detected — 2 of 3 class indices differ between pipelines.",
                "affected_classes": 2,
                "recommendation": "Use a single LabelEncoder instance across both pipelines.",
            }
        return {"check": "label_consistency", "result": "PASS",
                "details": "Train and eval label mappings are identical. No mismatch detected."}

    elif check_type == "data_leakage":
        # Two leakage flavours: duplicated samples vs scaler contamination.
        if bug_type in ("data_leakage_overlap", "data_leakage_scaler"):
            overlap = rng.randint(180, 450) if bug_type == "data_leakage_overlap" else 0
            scaler_leak = bug_type == "data_leakage_scaler"
            return {
                "check": "data_leakage",
                "result": "FAIL",
                "sample_overlap": overlap,
                "scaler_fitted_on_full_dataset": scaler_leak,
                # Details string is assembled from the two (mutually
                # exclusive here) leakage descriptions.
                "details": (
                    f"Found {overlap} samples present in both train and val splits. "
                    if overlap > 0 else ""
                ) + (
                    "StandardScaler.fit_transform() called on full dataset before split — "
                    "validation statistics contaminated by training distribution."
                    if scaler_leak else ""
                ),
            }
        return {"check": "data_leakage", "result": "PASS",
                "sample_overlap": 0, "scaler_fitted_on_full_dataset": False,
                "details": "No data leakage detected between train and val splits."}

    elif check_type == "gradient_norms":
        # Only an exploding learning rate produces anomalous norms.
        if bug_type == "exploding_lr":
            return {
                "check": "gradient_norms",
                "result": "ANOMALY",
                "epoch_1_norm": round(rng.uniform(840.0, 2100.0), 2),
                "expected_range": "0.1 – 10.0",
                "details": "Gradient norms exceeded safe threshold by 100–200×. "
                           "Indicates learning rate is too large — gradients are not being controlled.",
            }
        return {"check": "gradient_norms", "result": "NORMAL",
                "mean_norm": round(rng.uniform(0.3, 2.1), 3),
                "max_norm": round(rng.uniform(2.1, 4.5), 3),
                "details": "Gradient norms are within expected range throughout training."}

    elif check_type == "metric_gap_analysis":
        # Eval-pipeline bugs show a val/test gap far beyond overfitting.
        if bug_type in ("label_encoder_mismatch", "silent_metric_swap", "tokenizer_version_drift"):
            val_acc = round(rng.uniform(0.84, 0.91), 4)
            test_acc = round(rng.uniform(0.28, 0.38), 4)
            return {
                "check": "metric_gap_analysis",
                "result": "ANOMALY",
                "val_accuracy": val_acc,
                "test_accuracy": test_acc,
                "gap": round(val_acc - test_acc, 4),
                "expected_max_gap": 0.08,
                "details": f"Val/test accuracy gap is {val_acc - test_acc:.3f} — far exceeds expected max of 0.08. "
                           f"This magnitude of gap (>{val_acc - test_acc:.0%}) strongly suggests an evaluation pipeline bug "
                           f"rather than overfitting — the model generalises well to the val set but fails on test data.",
            }
        return {"check": "metric_gap_analysis", "result": "NORMAL",
                "details": "Val/test metric gap is within normal bounds."}

    elif check_type == "encoder_version_match":
        # Flags the train/eval tokenizer drift scenario.
        if bug_type == "tokenizer_version_drift":
            return {
                "check": "encoder_version_match",
                "result": "MISMATCH",
                "training_tokenizer": "bert-base-uncased-v2-fixed",
                "eval_tokenizer": "bert-base-uncased",
                "vocab_diff": 847,
                "details": "Training uses tokenizer v2 (30,522 + 847 domain tokens). "
                           "Evaluation uses tokenizer v1 (30,522 tokens). "
                           "847 domain-specific tokens will map to [UNK] during evaluation — "
                           "causing silent degradation on domain-specific test inputs.",
            }
        return {"check": "encoder_version_match", "result": "PASS",
                "details": "Training and evaluation use identical tokenizer versions."}

    elif check_type == "class_balance":
        # Always synthesised as roughly balanced; never bug-dependent.
        n_classes = 10
        counts = {str(i): rng.randint(780, 1020) for i in range(n_classes)}
        imbalance_ratio = max(counts.values()) / max(1, min(counts.values()))
        return {
            "check": "class_balance",
            "result": "PASS" if imbalance_ratio < 1.5 else "WARN",
            "class_counts": counts,
            "imbalance_ratio": round(imbalance_ratio, 3),
            "details": f"Max/min class ratio: {imbalance_ratio:.2f}. "
                       f"{'Within acceptable range.' if imbalance_ratio < 1.5 else 'Moderate imbalance — consider weighted loss.'}",
        }

    elif check_type == "loss_trajectory":
        # Divergence patterns differ between the two optimizer-side bugs.
        if bug_type == "exploding_lr":
            return {
                "check": "loss_trajectory",
                "result": "ANOMALY",
                "pattern": "exponential_divergence",
                "loss_values": [2.31, 18.42, 847.2, "nan"],
                "details": "Loss follows exponential growth pattern rather than convergence. "
                           "This is a strong indicator of learning rate being orders of magnitude too large.",
            }
        elif bug_type == "wrong_optimizer":
            return {
                "check": "loss_trajectory",
                "result": "ANOMALY",
                "pattern": "oscillating_no_convergence",
                "details": "Loss oscillates without converging over all epochs. "
                           "Characteristic of excessive momentum causing the optimizer to overshoot minima repeatedly.",
            }
        return {"check": "loss_trajectory", "result": "NORMAL",
                "pattern": "smooth_convergence",
                "details": "Loss follows expected convergence curve."}

    elif check_type == "feature_statistics":
        # Identical train/val stats hint at a scaler fit on the full set.
        if bug_type in ("data_leakage_scaler",):
            return {
                "check": "feature_statistics",
                "result": "WARN",
                "train_mean": 0.0, "train_std": 1.0,
                "val_mean": 0.0, "val_std": 1.0,
                "details": "Train and val feature statistics are identical after normalization — "
                           "this is expected if scaler was fit on the full dataset (including val). "
                           "If scaler was fit only on train, a slight distributional shift is normal. "
                           "Zero shift suggests the scaler saw val data during fitting.",
            }
        return {"check": "feature_statistics", "result": "PASS",
                "details": "Train and val feature distributions are within expected divergence bounds."}

    # Fallback for unrecognised check names — non-fatal by design.
    return {"check": check_type, "result": "UNKNOWN",
            "details": f"Unknown sanity check type: {check_type}"}
|
server/client.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""MLOps Pipeline Debugger — Python client"""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
from typing import Any, Dict, Optional
|
| 4 |
+
import httpx
|
| 5 |
+
|
| 6 |
+
from models import MLOpsAction, MLOpsObservation, MLOpsState
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class StepResult:
    """Container for one environment transition returned by /step."""

    def __init__(self, observation, reward, done, info):
        """Store the observation, scalar reward, done flag and info dict."""
        self.observation = observation
        self.reward = reward
        self.done = done
        self.info = info

    def __repr__(self):
        # Reward shown to four decimals for compact log lines.
        return "StepResult(reward={:.4f}, done={})".format(self.reward, self.done)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class MLOpsDebugEnv:
    """Async HTTP client for the MLOps Pipeline Debugger environment.

    Use as an async context manager; the underlying httpx client is
    created on __aenter__ and closed on __aexit__.
    """

    def __init__(self, base_url: str = "http://localhost:7860"):
        self.base_url = base_url.rstrip("/")
        self._client: Optional[httpx.AsyncClient] = None

    async def __aenter__(self):
        self._client = httpx.AsyncClient(base_url=self.base_url, timeout=30.0)
        return self

    async def __aexit__(self, *args):
        if self._client:
            await self._client.aclose()

    async def reset(self, task_id: str = "easy", seed: Optional[int] = None) -> MLOpsObservation:
        """Start a new episode and return the initial observation."""
        resp = await self._client.post("/reset", json={"task_id": task_id, "seed": seed})
        resp.raise_for_status()
        return MLOpsObservation(**resp.json())

    async def step(self, action: MLOpsAction) -> StepResult:
        """Apply one action and return the resulting StepResult."""
        resp = await self._client.post("/step", json=action.model_dump(exclude_none=True))
        resp.raise_for_status()
        payload = resp.json()
        return StepResult(
            MLOpsObservation(**payload["observation"]),
            payload["reward"],
            payload["done"],
            payload["info"],
        )

    async def state(self) -> MLOpsState:
        """Fetch the full environment state from /state."""
        resp = await self._client.get("/state")
        resp.raise_for_status()
        return MLOpsState(**resp.json())

    def sync(self) -> "SyncMLOpsDebugEnv":
        """Return a synchronous twin client pointed at the same server."""
        return SyncMLOpsDebugEnv(self.base_url)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class SyncMLOpsDebugEnv:
    """Blocking HTTP client mirroring MLOpsDebugEnv's endpoints.

    Use as a context manager; the httpx.Client is opened on __enter__
    and closed on __exit__.
    """

    def __init__(self, base_url: str = "http://localhost:7860"):
        self.base_url = base_url.rstrip("/")
        self._client: Optional[httpx.Client] = None

    def __enter__(self):
        self._client = httpx.Client(base_url=self.base_url, timeout=30.0)
        return self

    def __exit__(self, *args):
        if self._client:
            self._client.close()

    def reset(self, task_id: str = "easy", seed: Optional[int] = None) -> MLOpsObservation:
        """Start a new episode and return the initial observation."""
        resp = self._client.post("/reset", json={"task_id": task_id, "seed": seed})
        resp.raise_for_status()
        return MLOpsObservation(**resp.json())

    def step(self, action: MLOpsAction) -> StepResult:
        """Apply one action and return the resulting StepResult."""
        resp = self._client.post("/step", json=action.model_dump(exclude_none=True))
        resp.raise_for_status()
        payload = resp.json()
        return StepResult(
            MLOpsObservation(**payload["observation"]),
            payload["reward"],
            payload["done"],
            payload["info"],
        )

    def state(self) -> MLOpsState:
        """Fetch the full environment state from /state."""
        resp = self._client.get("/state")
        resp.raise_for_status()
        return MLOpsState(**resp.json())
|
server/inference.py
ADDED
|
@@ -0,0 +1,600 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
inference.py — Optimized LLM Agent for MLOps Pipeline Debugger
|
| 3 |
+
|
| 4 |
+
Required env vars (in .env file):
|
| 5 |
+
GEMINI_API_KEY your Gemini API key
|
| 6 |
+
MODEL_NAME gemini-2.5-flash (default)
|
| 7 |
+
ENV_BASE_URL http://localhost:7860 (default)
|
| 8 |
+
|
| 9 |
+
STDOUT FORMAT (mandatory):
|
| 10 |
+
[START] task=<task_name> env=<benchmark> model=<model_name>
|
| 11 |
+
[STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
|
| 12 |
+
[END] success=<true|false> steps=<n> rewards=<r1,r2,...,rn>
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from __future__ import annotations
|
| 16 |
+
|
| 17 |
+
from dotenv import load_dotenv
|
| 18 |
+
|
| 19 |
+
load_dotenv()
|
| 20 |
+
|
| 21 |
+
import argparse
|
| 22 |
+
import json
|
| 23 |
+
import os
|
| 24 |
+
import re
|
| 25 |
+
import sys
|
| 26 |
+
import time
|
| 27 |
+
from typing import Any, Dict, List, Optional
|
| 28 |
+
|
| 29 |
+
import httpx
|
| 30 |
+
from openai import OpenAI
|
| 31 |
+
|
| 32 |
+
API_BASE_URL = os.getenv(
|
| 33 |
+
"API_BASE_URL", "https://generativelanguage.googleapis.com/v1beta/openai/"
|
| 34 |
+
)
|
| 35 |
+
MODEL_NAME = os.getenv("MODEL_NAME", "gemini-2.5-flash")
|
| 36 |
+
HF_TOKEN = os.getenv("GEMINI_API_KEY", os.getenv("HF_TOKEN", ""))
|
| 37 |
+
ENV_BASE_URL = os.getenv("ENV_BASE_URL", "https://angelgupta-mlops-openenv.hf.space")
|
| 38 |
+
BENCHMARK = "mlops-debug-env"
|
| 39 |
+
TASKS = ["easy", "medium", "hard"]
|
| 40 |
+
SUCCESS_THRESHOLD = 0.5
|
| 41 |
+
|
| 42 |
+
client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
|
| 43 |
+
|
| 44 |
+
# ── Complete bug reference for diagnosis guidance ─────────────────────────────
|
| 45 |
+
|
| 46 |
+
BUG_REFERENCE = {
|
| 47 |
+
"easy": {
|
| 48 |
+
"exploding_lr": {
|
| 49 |
+
"category": "config_error",
|
| 50 |
+
"file": "config.yaml",
|
| 51 |
+
"field": "optimizer.learning_rate",
|
| 52 |
+
"gold_fix": "Reduce learning_rate from 50.0 to 1e-4 (or use a scheduler with warmup)",
|
| 53 |
+
"symptoms": "loss explodes: 2.31 → 8.94 → 847.2 → nan by epoch 3",
|
| 54 |
+
},
|
| 55 |
+
"wrong_optimizer": {
|
| 56 |
+
"category": "config_error",
|
| 57 |
+
"file": "config.yaml",
|
| 58 |
+
"field": "optimizer.momentum",
|
| 59 |
+
"gold_fix": "Reduce momentum from 0.99 to 0.9, or switch to AdamW optimizer",
|
| 60 |
+
"symptoms": "oscillating loss with no convergence, SGD with momentum=0.99",
|
| 61 |
+
},
|
| 62 |
+
"batch_size_overflow": {
|
| 63 |
+
"category": "config_error",
|
| 64 |
+
"file": "config.yaml",
|
| 65 |
+
"field": "training.batch_size",
|
| 66 |
+
"gold_fix": "Reduce batch_size from 4096 to 32 or 64; current size exceeds training set",
|
| 67 |
+
"symptoms": "batch_size > dataset size, val accuracy 99.9% trivially",
|
| 68 |
+
},
|
| 69 |
+
},
|
| 70 |
+
"medium": {
|
| 71 |
+
"data_leakage_scaler": {
|
| 72 |
+
"category": "data_leakage",
|
| 73 |
+
"file": "preprocessing.py",
|
| 74 |
+
"field": "StandardScaler.fit_transform",
|
| 75 |
+
"gold_fix": "Fit StandardScaler only on X_train, then call transform() on X_val and X_test separately",
|
| 76 |
+
"symptoms": "val accuracy 99% at epoch 1, scaler.fit_transform(X_full) before split",
|
| 77 |
+
},
|
| 78 |
+
"data_leakage_overlap": {
|
| 79 |
+
"category": "data_leakage",
|
| 80 |
+
"file": "preprocessing.py",
|
| 81 |
+
"field": "train_test_split.random_state",
|
| 82 |
+
"gold_fix": "Set random_state=42 in train_test_split to ensure deterministic, non-overlapping splits",
|
| 83 |
+
"symptoms": "non-zero sample overlap in dataset_stats, random_state=None in train_test_split",
|
| 84 |
+
},
|
| 85 |
+
"wrong_split_ratio": {
|
| 86 |
+
"category": "preprocessing_bug",
|
| 87 |
+
"file": "preprocessing.py",
|
| 88 |
+
"field": "train_test_split.test_size",
|
| 89 |
+
"gold_fix": "Change test_size from 0.8 to 0.2 — current config trains on 20% and evaluates on 80%",
|
| 90 |
+
"symptoms": "test_size=0.8 in preprocessing.py, trains on 20% evaluates on 80%",
|
| 91 |
+
},
|
| 92 |
+
},
|
| 93 |
+
"hard": {
|
| 94 |
+
"label_encoder_mismatch": {
|
| 95 |
+
"category": "label_mismatch",
|
| 96 |
+
"file": "preprocessing.py",
|
| 97 |
+
"field": "LabelEncoder.fit_order",
|
| 98 |
+
"gold_fix": "Use the same LabelEncoder instance (fitted on training data) for both train and eval pipelines",
|
| 99 |
+
"symptoms": "val accuracy good (87%), test accuracy near-random (34%), two different LabelEncoder instances with different fit orders",
|
| 100 |
+
},
|
| 101 |
+
"silent_metric_swap": {
|
| 102 |
+
"category": "evaluation_bug",
|
| 103 |
+
"file": "eval_results.json",
|
| 104 |
+
"field": "metrics.val_accuracy",
|
| 105 |
+
"gold_fix": "Swap val_accuracy and test_accuracy assignments in the evaluation loop — metrics are mislabeled",
|
| 106 |
+
"symptoms": "val_accuracy suspiciously low, test_accuracy suspiciously high (reversed)",
|
| 107 |
+
},
|
| 108 |
+
"tokenizer_version_drift": {
|
| 109 |
+
"category": "evaluation_bug",
|
| 110 |
+
"file": "preprocessing.py",
|
| 111 |
+
"field": "tokenizer.version",
|
| 112 |
+
"gold_fix": "Ensure training and evaluation both use tokenizer v2 — v1 has a different vocabulary mapping for 847 tokens",
|
| 113 |
+
"symptoms": "training uses TOKENIZER_V2, eval uses TOKENIZER_V1, 847 tokens map to [UNK]",
|
| 114 |
+
},
|
| 115 |
+
},
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
SYSTEM_PROMPT = """You are a senior ML engineer investigating a broken training run.
|
| 119 |
+
|
| 120 |
+
INVESTIGATION STRATEGY (follow this exact order):
|
| 121 |
+
1. read_logs — identify the symptom
|
| 122 |
+
2. read_eval_results — check val vs test metric gap
|
| 123 |
+
3. inspect_preprocessing — look for pipeline bugs
|
| 124 |
+
4. read_config — check hyperparameters
|
| 125 |
+
5. check_dataset_stats — look for split issues
|
| 126 |
+
6. run_sanity_check — confirm hypothesis
|
| 127 |
+
7. submit_diagnosis — ONLY after steps 1-5 minimum
|
| 128 |
+
|
| 129 |
+
FAILURE CATEGORIES:
|
| 130 |
+
- config_error : Wrong hyperparameter
|
| 131 |
+
- data_leakage : Train/val contamination
|
| 132 |
+
- evaluation_bug : Eval pipeline uses wrong artifacts or swapped metrics
|
| 133 |
+
- preprocessing_bug : Data transformation applied incorrectly
|
| 134 |
+
- label_mismatch : Label encoding inconsistency
|
| 135 |
+
- architecture_bug : Model architecture misconfiguration
|
| 136 |
+
|
| 137 |
+
ROOT CAUSE FIELD FORMAT: Use dot notation. Examples:
|
| 138 |
+
- "optimizer.learning_rate" / "training.batch_size" / "optimizer.momentum"
|
| 139 |
+
- "StandardScaler.fit_transform" / "train_test_split.random_state" / "train_test_split.test_size"
|
| 140 |
+
- "LabelEncoder.fit_order" / "tokenizer.version" / "metrics.val_accuracy"
|
| 141 |
+
|
| 142 |
+
RESPOND WITH ONE JSON ACTION OBJECT PER TURN. Examples:
|
| 143 |
+
{"action_type": "read_logs"}
|
| 144 |
+
{"action_type": "read_eval_results"}
|
| 145 |
+
{"action_type": "inspect_preprocessing"}
|
| 146 |
+
{"action_type": "read_config"}
|
| 147 |
+
{"action_type": "check_dataset_stats"}
|
| 148 |
+
{"action_type": "run_sanity_check", "sanity_check_type": "metric_gap_analysis"}
|
| 149 |
+
{"action_type": "submit_diagnosis",
|
| 150 |
+
"failure_category": "config_error",
|
| 151 |
+
"root_cause_file": "config.yaml",
|
| 152 |
+
"root_cause_field": "training.batch_size",
|
| 153 |
+
"diagnosis": "Batch size 8192 exceeds training set size, causing trivial overfitting.",
|
| 154 |
+
"proposed_fix": "Reduce batch_size from 4096 to 32 or 64; current size exceeds training set"}
|
| 155 |
+
|
| 156 |
+
ONLY output the JSON object. No explanation. No markdown."""
|
| 157 |
+
|
| 158 |
+
DIAGNOSIS_PROMPT = """Based on your investigation, now submit your final diagnosis.
|
| 159 |
+
|
| 160 |
+
Here is the complete bug reference for this task difficulty:
|
| 161 |
+
|
| 162 |
+
{bug_ref}
|
| 163 |
+
|
| 164 |
+
Analyze the artifacts you've read and identify which specific bug matches the symptoms.
|
| 165 |
+
Then submit your diagnosis using the EXACT field names and fix wording from the matching bug above.
|
| 166 |
+
|
| 167 |
+
IMPORTANT: Your proposed_fix must contain the KEYWORDS from the gold_fix above. The grader uses keyword matching.
|
| 168 |
+
|
| 169 |
+
Respond with ONLY the JSON submit_diagnosis action. No explanation. No markdown."""
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
# ── Logging helpers ──────────────────────────────────────────────────────────
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
def log_start(task: str, env: str, model: str) -> None:
    """Emit the mandatory [START] marker line to stdout."""
    marker = f"[START] task={task} env={env} model={model}"
    print(marker, flush=True)
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def log_step(
    step: int, action: str, reward: float, done: bool, error: Optional[str]
) -> None:
    """Emit one mandatory [STEP] marker line to stdout.

    A missing error is rendered as the literal token "null"; done is
    lower-cased so the grader sees "true"/"false".
    """
    shown_error = error or "null"
    print(
        f"[STEP] step={step} action={action} reward={reward:.2f} "
        f"done={str(done).lower()} error={shown_error}",
        flush=True,
    )
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
def log_end(
    success: bool, steps: int, score: float = 0.0, rewards: List[float] = None
) -> None:
    """Emit the mandatory [END] marker line to stdout.

    rewards may be None (treated as empty); each reward is formatted to
    two decimals and comma-joined.
    """
    reward_list = [] if rewards is None else rewards
    joined = ",".join(f"{r:.2f}" for r in reward_list)
    print(
        f"[END] success={str(success).lower()} steps={steps} "
        f"score={score:.4f} rewards={joined}",
        flush=True,
    )
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
# ── Environment helpers ───────────────────────────────────────────────────────
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
def env_reset(task_id: str, seed: int = 42) -> Dict[str, Any]:
    """POST /reset on the environment server; return the initial observation dict."""
    payload = {"task_id": task_id, "seed": seed}
    resp = httpx.post(f"{ENV_BASE_URL}/reset", json=payload, timeout=30)
    resp.raise_for_status()
    return resp.json()
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
def env_step(action: Dict[str, Any]) -> Dict[str, Any]:
    """POST one action dict to /step; return the raw step-result dict."""
    resp = httpx.post(f"{ENV_BASE_URL}/step", json=action, timeout=30)
    resp.raise_for_status()
    return resp.json()
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
def build_user_msg(obs: Dict[str, Any]) -> str:
    """Render an observation dict into the user-turn prompt text.

    The message shows step progress, run metadata, which artifacts have
    and have not been read, the last action's result (JSON, truncated to
    3000 chars), any system messages, and a terminal marker when done.
    """
    already_read = obs.get("artifacts_read", [])
    not_yet_read = [
        artifact["name"]
        for artifact in obs.get("available_artifacts", [])
        if artifact["name"] not in already_read
    ]
    summary = obs.get("run_summary", {})

    parts = [f"=== STEP {obs.get('step_count', 0)}/{obs.get('max_steps', 30)} ==="]
    parts.append(
        f"Run: {obs.get('run_id', '')} | Model: {summary.get('model', '')} | Status: {summary.get('status', '')}"
    )
    parts.append(f"Artifacts read: {already_read}")
    parts.append(f"Artifacts NOT yet read: {not_yet_read}")
    parts.append("")
    parts.append("LAST ACTION RESULT:")
    # default=str keeps non-JSON-native values printable; cap the size.
    parts.append(json.dumps(obs.get("last_action_result", {}), indent=2, default=str)[:3000])

    system_msgs = obs.get("messages", [])
    if system_msgs:
        parts.append("")
        parts.append("SYSTEM MESSAGES:")
        parts.extend(system_msgs)
    if obs.get("done"):
        parts.append("\nEpisode done.")
    return "\n".join(parts)
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
def parse_action(text: str) -> Optional[Dict[str, Any]]:
    """Parse an LLM reply into an action dict.

    Strips a surrounding markdown code fence if present, tries a direct
    json.loads, then falls back to extracting the outermost {...} span.
    Returns None when nothing parseable is found.
    """
    cleaned = text.strip()
    if cleaned.startswith("```"):
        # Drop the opening and closing fence lines, keep the payload.
        cleaned = "\n".join(cleaned.split("\n")[1:-1])
    try:
        return json.loads(cleaned)
    except Exception:
        pass
    candidate = re.search(r"\{[\s\S]+\}", cleaned)
    if candidate is None:
        return None
    try:
        return json.loads(candidate.group())
    except Exception:
        return None
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
# ── Rate-limited LLM calls ───────────────────────────────────────────────────
|
| 265 |
+
|
| 266 |
+
_last_call_time = 0
|
| 267 |
+
_MIN_CALL_INTERVAL = 2.0
|
| 268 |
+
from openenv_state import OPENENV_STATE, OpenEnvState
|
| 269 |
+
import json as _json
|
| 270 |
+
|
| 271 |
+
# For hard fallback guard
|
| 272 |
+
_HARD_FALLBACK_USED = False
|
| 273 |
+
|
| 274 |
+
|
| 275 |
+
def _update_openenv_state(
    run_id: str,
    task_id: str,
    seed: int,
    step_count: int,
    max_steps: int,
    end_score: float,
    rewards: List[float],
    artifacts_read: List[str],
) -> None:
    """Mirror the latest episode's status into the shared OPENENV_STATE singleton.

    Copies episode bookkeeping (run id, task, seed, progress, rewards,
    artifacts read), stamps the current UTC time, and records the final
    score under this task in OPENENV_STATE.scores.
    """
    # Previously used __import__("datetime") inline; a normal local
    # import is equivalent and far clearer.
    from datetime import datetime

    OPENENV_STATE.run_id = run_id
    OPENENV_STATE.task_id = task_id
    OPENENV_STATE.seed = seed
    OPENENV_STATE.step_count = step_count
    OPENENV_STATE.max_steps = max_steps
    OPENENV_STATE.end_score = end_score
    OPENENV_STATE.rewards = rewards
    OPENENV_STATE.artifacts_read = artifacts_read
    # NOTE(review): utcnow() is naive (no tzinfo) and deprecated in 3.12;
    # kept for output compatibility with the existing timestamp format.
    OPENENV_STATE.timestamp = datetime.utcnow().isoformat()
    OPENENV_STATE.scores[task_id] = end_score
|
| 297 |
+
|
| 298 |
+
# NOTE(review): redundant — _HARD_FALLBACK_USED was already initialized to
# False next to the rate-limit globals above; this re-assignment is a no-op.
_HARD_FALLBACK_USED = False
|
| 299 |
+
|
| 300 |
+
|
| 301 |
+
def call_llm(messages: List[Dict], model_name: Optional[str] = None) -> str:
    """Call the chat-completions API with throttling and retry.

    Enforces a minimum interval between calls (_MIN_CALL_INTERVAL seconds,
    tracked via the module-global _last_call_time), retries up to 10 times
    with exponential backoff, and returns the stripped message content of
    the first choice.

    Rate-limit errors (detected by substring match on the exception text)
    back off 15s * 2**attempt capped at 120s; all other errors back off
    30s * 2**attempt capped at 300s.

    Raises RuntimeError after 10 failed attempts.
    """
    global _last_call_time
    # Caller may override the default model (used for the "hard" escalation).
    model_to_use = model_name or MODEL_NAME
    for attempt in range(10):
        try:
            # Throttle: sleep off whatever remains of the minimum interval.
            elapsed = time.time() - _last_call_time
            if elapsed < _MIN_CALL_INTERVAL:
                time.sleep(_MIN_CALL_INTERVAL - elapsed)

            resp = client.chat.completions.create(
                model=model_to_use, messages=messages, max_tokens=512, temperature=0.1
            )
            _last_call_time = time.time()
            return resp.choices[0].message.content.strip()
        except Exception as e:
            err_msg = str(e)
            # Heuristic rate-limit detection on the exception message.
            if "rate" in err_msg.lower() or "Request rate" in err_msg:
                wait = min(15 * (2**attempt), 120)
                print(
                    f" [RATE LIMIT] Waiting {wait}s (attempt {attempt + 1}/10)...",
                    flush=True,
                )
            else:
                wait = min(30 * (2**attempt), 300)
                print(
                    f" [RETRY] LLM error (attempt {attempt + 1}/10): {e}. Waiting {wait}s...",
                    flush=True,
                )
            time.sleep(wait)
    raise RuntimeError("LLM call failed after 10 retries")
|
| 331 |
+
|
| 332 |
+
|
| 333 |
+
# ── Fallback actions ──────────────────────────────────────────────────────────
|
| 334 |
+
|
| 335 |
+
# Deterministic investigation sequence used when the LLM produces no
# parseable action: read every core artifact, then run sanity checks.
FALLBACK_ACTIONS = [
    {"action_type": "read_logs"},
    {"action_type": "read_eval_results"},
    {"action_type": "inspect_preprocessing"},
    {"action_type": "read_config"},
    {"action_type": "check_dataset_stats"},
    {"action_type": "run_sanity_check", "sanity_check_type": "metric_gap_analysis"},
    {"action_type": "run_sanity_check", "sanity_check_type": "data_leakage"},
    {"action_type": "run_sanity_check", "sanity_check_type": "label_consistency"},
]


def get_fallback_action(step_num: int) -> Dict[str, Any]:
    """Return the scripted action for 1-based step_num, clamped to the last entry."""
    last_index = len(FALLBACK_ACTIONS) - 1
    return FALLBACK_ACTIONS[min(step_num - 1, last_index)]
|
| 350 |
+
|
| 351 |
+
|
| 352 |
+
# ── Main agent loop ──────────────────────────────────────────────────────────
|
| 353 |
+
|
| 354 |
+
|
| 355 |
+
def run_task(task_id: str, seed: int = 42, alt_model: Optional[str] = None) -> float:
    """Run one investigate-then-diagnose episode against the environment.

    Flow: reset the env, loop — ask the LLM for an action (switching to a
    diagnosis-only prompt once enough evidence is gathered or max steps
    approach), apply guard rails (fallback actions, loop breaking, a
    stricter submit gate on the "hard" task), step the env, and log each
    step. On a low "hard" score the episode is retried once with an
    alternate model. Returns the final score in [0.0, 1.0].
    """
    global _HARD_FALLBACK_USED
    log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)

    obs = env_reset(task_id, seed)
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": f"TASK DESCRIPTION:\n{obs['task_description']}\n\n{build_user_msg(obs)}",
        },
    ]

    # Per-difficulty bounds on how early/late the agent may submit.
    MIN_STEPS = {"easy": 5, "medium": 7, "hard": 10}
    MAX_STEPS = {"easy": 20, "medium": 30, "hard": 40}
    min_steps = MIN_STEPS.get(task_id, 7)
    max_steps = MAX_STEPS.get(task_id, 30)

    # Artifacts the agent is expected to read before diagnosing.
    CORE_ARTIFACTS = {
        "train.log",
        "eval_results.json",
        "preprocessing.py",
        "config.yaml",
        "dataset_stats.json",
    }

    step_num = 0
    done = False
    final_score = 0.0
    rewards: List[float] = []
    action_history: List[str] = []
    sanity_check_history: List[str] = []
    in_diagnosis_phase = False

    def get_unread_artifacts() -> List[str]:
        # Reads `obs` from the enclosing scope — reflects the latest observation.
        arts_read = set(obs.get("artifacts_read", []))
        return [a for a in CORE_ARTIFACTS if a not in arts_read]

    def get_next_unread_artifact() -> Optional[Dict[str, Any]]:
        # Map each unread core artifact to the action that reads it.
        unread = get_unread_artifacts()
        if not unread:
            return None
        artifact_to_action = {
            "train.log": {"action_type": "read_logs"},
            "eval_results.json": {"action_type": "read_eval_results"},
            "preprocessing.py": {"action_type": "inspect_preprocessing"},
            "config.yaml": {"action_type": "read_config"},
            "dataset_stats.json": {"action_type": "check_dataset_stats"},
        }
        return artifact_to_action.get(unread[0])

    def force_new_sanity_check() -> Dict[str, Any]:
        # Pick the first sanity check not yet attempted; repeat the first
        # one if all eight have been used.
        all_checks = [
            "metric_gap_analysis",
            "data_leakage",
            "label_consistency",
            "encoder_version_match",
            "loss_trajectory",
            "class_balance",
            "gradient_norms",
            "feature_statistics",
        ]
        for sc in all_checks:
            if sc not in sanity_check_history:
                return {"action_type": "run_sanity_check", "sanity_check_type": sc}
        return {
            "action_type": "run_sanity_check",
            "sanity_check_type": "metric_gap_analysis",
        }

    def is_repetitive(action_type: str) -> bool:
        # True when the same action would be taken three times in a row.
        if len(action_history) < 2:
            return False
        return action_history[-1] == action_type and action_history[-2] == action_type

    while not done:
        step_num += 1
        unread = get_unread_artifacts()
        all_read = len(unread) == 0

        # Force submission near max steps
        if step_num >= max_steps - 1:
            in_diagnosis_phase = True

        if in_diagnosis_phase:
            # Build diagnosis prompt with bug reference
            diag_prompt = DIAGNOSIS_PROMPT.format(
                bug_ref=json.dumps(BUG_REFERENCE.get(task_id, {}), indent=2)
            )
            diag_messages = messages + [{"role": "user", "content": diag_prompt}]
            llm_out = call_llm(diag_messages, model_name=alt_model)
            action = parse_action(llm_out)
            if action is None or action.get("action_type") != "submit_diagnosis":
                # Force a diagnosis with best guess
                action = {"action_type": "submit_diagnosis"}
        else:
            llm_out = call_llm(messages, model_name=alt_model)
            action = parse_action(llm_out)

            if action is None:
                # Unparseable LLM output: fall back to scripted investigation.
                if all_read:
                    action = force_new_sanity_check()
                else:
                    action = get_next_unread_artifact() or get_fallback_action(step_num)

        action_type = action.get("action_type", "unknown")

        # Detect and break loops: three identical actions in a row get
        # redirected to an unread artifact or a fresh sanity check.
        if is_repetitive(action_type) and action_type != "submit_diagnosis":
            if all_read:
                action = force_new_sanity_check()
            else:
                next_artifact = get_next_unread_artifact()
                if next_artifact:
                    action = next_artifact
                else:
                    action = force_new_sanity_check()

        # Track sanity checks
        if action_type == "run_sanity_check":
            sc = action.get("sanity_check_type", "")
            sanity_check_history.append(sc)

        # Enforce hard rubric before allowing hard submit: on "hard",
        # premature diagnoses are replaced with more investigation.
        if action.get("action_type") == "submit_diagnosis" and task_id == "hard":
            artifacts_read = obs.get("artifacts_read", [])
            if (
                len(artifacts_read) < 3
                or len(sanity_check_history) < 3
                or step_num < min_steps
            ):
                action = get_fallback_action(step_num)
                # NOTE(review): this logs reward=0 BEFORE stepping the env,
                # so the actual reward for this step never reaches stdout.
                log_step(
                    step=step_num,
                    action=action["action_type"],
                    reward=0,
                    done=False,
                    error=None,
                )
                result = env_step(action)
                new_obs = result["observation"]
                reward = result["reward"]
                done = result["done"]
                info = result.get("info", {})
                rewards.append(reward)
                # Continue with the next loop iteration
                if done:
                    final_score = info.get("score", reward)
                    # NOTE(review): this escalation duplicates the one in the
                    # main done-branch below but returns the retry's score
                    # directly without taking the max or logging [END].
                    # "gemini-3.1-pro-preview" — verify this model id exists.
                    if (
                        task_id == "hard"
                        and final_score < 0.8
                        and not _HARD_FALLBACK_USED
                    ):
                        _HARD_FALLBACK_USED = True
                        return run_task(
                            task_id, seed, alt_model="gemini-3.1-pro-preview"
                        )
                    break
                obs = new_obs
                # llm_out is always bound here: submit_diagnosis actions only
                # arise from an LLM call earlier in this iteration.
                llm_out = llm_out  # no-op, placeholder to clarify flow
                messages.append({"role": "assistant", "content": llm_out})
                messages.append({"role": "user", "content": build_user_msg(new_obs)})
                continue

        # Execute action
        result = env_step(action)
        new_obs = result["observation"]
        reward = result["reward"]
        done = result["done"]
        info = result.get("info", {})

        rewards.append(reward)
        action_str = action.get("action_type", "unknown")
        error_msg = (
            new_obs.get("last_action_result", {}).get("error")
            if isinstance(new_obs, dict)
            else None
        )

        log_step(
            step=step_num, action=action_str, reward=reward, done=done, error=error_msg
        )

        if done:
            final_score = info.get("score", reward)
            # Escalate low "hard" scores once with the alternate model and
            # keep the better of the two scores.
            if task_id == "hard" and final_score < 0.8 and alt_model is None:
                alt_score = run_task(task_id, seed, alt_model="gemini-3.1-pro-preview")
                final_score = max(final_score, alt_score)
            break

        # Update observation
        obs = new_obs
        action_history.append(action_str)

        # Check if we should enter diagnosis phase
        if not in_diagnosis_phase:
            unread = get_unread_artifacts()
            all_read = len(unread) == 0
            enough_checks = len(sanity_check_history) >= 2
            if all_read and enough_checks and step_num >= min_steps:
                in_diagnosis_phase = True

        messages.append({"role": "assistant", "content": llm_out})
        messages.append({"role": "user", "content": build_user_msg(new_obs)})

        # Keep context window manageable: retain system + first user turn
        # plus the most recent 26 messages.
        if len(messages) > 40:
            messages = [messages[0], messages[1]] + messages[-26:]

    success = final_score >= SUCCESS_THRESHOLD
    log_end(success=success, steps=step_num, score=final_score, rewards=rewards)
    return final_score
|
| 568 |
+
|
| 569 |
+
|
| 570 |
+
def main():
    """CLI entry point: health-check the env server, then run the task suite."""
    parser = argparse.ArgumentParser(
        description="MLOps Pipeline Debugger — Baseline Agent"
    )
    parser.add_argument(
        "--task", choices=TASKS, help="Run a specific task (default: all)"
    )
    parser.add_argument(
        "--seed", type=int, default=42, help="Random seed for reproducibility"
    )
    args = parser.parse_args()

    # Fail fast if the environment server is unreachable.
    try:
        httpx.get(f"{ENV_BASE_URL}/health", timeout=10).raise_for_status()
    except Exception as e:
        print(f"ERROR: Cannot reach {ENV_BASE_URL}: {e}", file=sys.stderr)
        sys.exit(1)

    selected = [args.task] if args.task else TASKS
    scores = {}
    for name in selected:
        scores[name] = run_task(name, seed=args.seed)

    print(f"\n=== FINAL SCORES ===", flush=True)
    for name, value in scores.items():
        print(f" {name:8s}: {value:.4f}")
    print(f" {'AVERAGE':8s}: {sum(scores.values()) / len(scores):.4f}")
|
| 597 |
+
|
| 598 |
+
|
| 599 |
+
if __name__ == "__main__":
|
| 600 |
+
main()
|
server/mlops_environment.py
ADDED
|
@@ -0,0 +1,461 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
MLOps Pipeline Debugger — Core Environment
|
| 3 |
+
|
| 4 |
+
Episode flow:
|
| 5 |
+
1. reset(task_id, seed) → generates a broken training run with one planted bug
|
| 6 |
+
2. Agent investigates using 8 actions (reads artifacts, runs sanity checks)
|
| 7 |
+
3. Agent submits a structured diagnosis
|
| 8 |
+
4. Grader compares against planted bug ground truth → score in [0.0, 1.0]
|
| 9 |
+
|
| 10 |
+
Reward design (dense, not sparse):
|
| 11 |
+
+0.02 per new artifact read (first time — rewards exploration)
|
| 12 |
+
-0.02 per duplicate artifact read (no new filter applied)
|
| 13 |
+
-0.05 submitting diagnosis after reading < 3 distinct artifacts
|
| 14 |
+
|
| 15 |
+
At submit_diagnosis:
|
| 16 |
+
+0.15 correct failure_category
|
| 17 |
+
+0.25 correct root_cause_file
|
| 18 |
+
+0.30 correct root_cause_field (substring match, case-insensitive)
|
| 19 |
+
+0.30 correct proposed_fix (keyword match against gold fix)
|
| 20 |
+
|
| 21 |
+
Task 3 (hard) penalty multiplier:
|
| 22 |
+
wrong diagnosis → ×1.5 penalty on the missed components
|
| 23 |
+
(silent bugs that reach production are more costly)
|
| 24 |
+
"""
|
| 25 |
+
|
| 26 |
+
from __future__ import annotations
|
| 27 |
+
|
| 28 |
+
import random
|
| 29 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 30 |
+
|
| 31 |
+
from models import MLOpsAction, MLOpsObservation, MLOpsState, ArtifactMeta
|
| 32 |
+
from artifact_generator import (
|
| 33 |
+
ArtifactGenerator, BUG_CATALOGUE, TASK_BUG_POOLS,
|
| 34 |
+
run_sanity_check,
|
| 35 |
+
)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# Per-task step budgets: harder tasks get a longer investigation window.
TASK_MAX_STEPS = {"easy": 20, "medium": 30, "hard": 40}

# Full task briefings shown to the agent in every observation.
# NOTE: these are runtime strings surfaced to the agent — edit with care.
TASK_DESCRIPTIONS = {
    "easy": (
        "TASK 1 — CONFIG ERROR DIAGNOSIS (Easy)\n\n"
        "A training run has failed or produced clearly wrong results. The issue is in "
        "the training configuration — a hyperparameter is set to an incorrect value that "
        "causes immediate, visible degradation in training metrics.\n\n"
        "Your job: investigate the training artifacts, identify which configuration "
        "parameter is wrong, and propose the correct fix.\n\n"
        "Strategy: Start by reading the training logs to observe symptom patterns, "
        "then check the config to find the misconfigured parameter. "
        "Run sanity checks (loss_trajectory, gradient_norms) to confirm your hypothesis "
        "before submitting.\n\n"
        "Actions available: read_config | read_logs | check_dataset_stats | "
        "inspect_preprocessing | read_eval_results | run_sanity_check | "
        "query_artifact | submit_diagnosis"
    ),
    "medium": (
        "TASK 2 — DATA LEAKAGE DETECTION (Medium)\n\n"
        "Training metrics look suspiciously good — validation accuracy is anomalously "
        "high from the first epoch, but test performance tells a different story. "
        "The issue is in the data preprocessing pipeline.\n\n"
        "Your job: identify the exact source of data leakage — whether it's a scaler "
        "fitted on the full dataset, overlapping train/val splits from a non-deterministic "
        "split, or an inverted split ratio — and propose the correct fix.\n\n"
        "Strategy: Anomalous val accuracy in the logs is your first signal. "
        "Inspect preprocessing code to find how splits are constructed. "
        "Run the data_leakage and feature_statistics sanity checks to confirm. "
        "The val/test metric gap in eval results is another key clue.\n\n"
        "Actions available: read_config | read_logs | check_dataset_stats | "
        "inspect_preprocessing | read_eval_results | run_sanity_check | "
        "query_artifact | submit_diagnosis"
    ),
    "hard": (
        "TASK 3 — SILENT EVALUATION BUG (Hard)\n\n"
        "Training completed normally. Validation metrics look reasonable. "
        "But test set performance is catastrophically below validation — "
        "and there are NO error logs, NO warnings, NO exceptions thrown.\n\n"
        "Your job: find the silent bug in the evaluation pipeline. It could be "
        "a label encoder mismatch between train and eval (different class orderings), "
        "a metric assignment swap (val/test results mislabeled), or a tokenizer "
        "version drift (training used v2, evaluation uses v1).\n\n"
        "Strategy: The val/test metric gap in eval_results is your only initial signal. "
        "Run metric_gap_analysis first to quantify the anomaly. Then systematically "
        "check label_consistency, encoder_version_match, and inspect the preprocessing "
        "code carefully — the bug produces no error output and will only be visible "
        "by comparing train vs eval pipeline definitions.\n\n"
        "WARNING: Missing this bug in a deployed model means silent wrong predictions "
        "in production. Penalty for wrong diagnosis is weighted 1.5×.\n\n"
        "Actions available: read_config | read_logs | check_dataset_stats | "
        "inspect_preprocessing | read_eval_results | run_sanity_check | "
        "query_artifact | submit_diagnosis"
    ),
}

# Artifact name -> (human-readable description, rough size hint).
# Used to build ArtifactMeta listings shown in every observation.
ARTIFACT_DESCRIPTIONS = {
    "config.yaml": ("Training configuration — hyperparameters, model, optimizer, scheduler", "~45 lines"),
    "train.log": ("Epoch-by-epoch training metrics — loss, accuracy, gradient norms", "~30–60 lines"),
    "dataset_stats.json": ("Dataset split sizes, class distribution, feature statistics", "~35 fields"),
    "preprocessing.py": ("Data preprocessing pipeline — splits, normalization, encoding", "~40–70 lines"),
    "eval_results.json": ("Final evaluation metrics — val and test loss/accuracy", "~15 fields"),
    "model_card.json": ("Model architecture summary, training config, preprocessing versions", "~20 fields"),
}
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
class MLOpsEnvironment:
|
| 105 |
+
"""OpenEnv-compatible MLOps Pipeline Debugging environment."""
|
| 106 |
+
|
| 107 |
+
def __init__(self, task_id: str = "easy"):
|
| 108 |
+
assert task_id in TASK_MAX_STEPS, f"task_id must be one of {list(TASK_MAX_STEPS)}"
|
| 109 |
+
self.task_id = task_id
|
| 110 |
+
self._reset_internal(seed=42)
|
| 111 |
+
|
| 112 |
+
    def _reset_internal(self, seed: int) -> None:
        """(Re)initialize one episode: plant a bug, generate artifacts, zero state.

        All randomness derives from ``seed`` via a private random.Random, so the
        same (task_id, seed) pair always yields the same episode.
        """
        rng = random.Random(seed)

        # Pick bug from this task's pool
        pool = TASK_BUG_POOLS[self.task_id]
        self.bug_type = rng.choice(pool)
        self.bug = BUG_CATALOGUE[self.bug_type]

        # Generate all artifacts
        gen = ArtifactGenerator(self.bug_type, seed)
        self._artifacts: Dict[str, str] = gen.generate_all()
        self._model_cfg = gen.model_cfg
        self._run_id = gen.run_id
        self._rng = rng
        self._seed = seed

        # Cache artifact metadata at reset time (avoids consuming RNG per step)
        self._artifact_meta: List[ArtifactMeta] = [
            ArtifactMeta(
                name=name,
                description=ARTIFACT_DESCRIPTIONS[name][0],
                size_hint=ARTIFACT_DESCRIPTIONS[name][1],
                # Cosmetic fake modification date; drawn from the seeded RNG so
                # it stays reproducible.
                last_modified=f"2024-03-{rng.randint(1,28):02d}",
            )
            for name in self._artifacts
        ]

        # Episode state
        self._step_count = 0
        self._max_steps = TASK_MAX_STEPS[self.task_id]
        self._done = False
        self._artifacts_read: List[str] = []          # distinct artifacts read so far
        self._last_read_filters: Dict[str, str] = {}  # artifact -> last filter used
        self._sanity_checks_run: List[str] = []       # distinct sanity checks run
        self._duplicate_queries = 0                   # count of penalized re-reads
        self._current_score = 0.0                     # final score once submitted
        self._messages: List[str] = []                # advisory messages for the agent
|
| 150 |
+
# ── OpenEnv API ───────────────────────────────────────────────────────────
|
| 151 |
+
|
| 152 |
+
def reset(self, seed: Optional[int] = None) -> MLOpsObservation:
|
| 153 |
+
import time
|
| 154 |
+
actual_seed = seed if seed is not None else int(time.time() * 1000) % 100000
|
| 155 |
+
self._reset_internal(actual_seed)
|
| 156 |
+
return self._build_obs(
|
| 157 |
+
{"status": "reset", "message": "New training run loaded. Begin investigation."},
|
| 158 |
+
)
|
| 159 |
+
|
| 160 |
+
    def step(self, action: MLOpsAction) -> Tuple[MLOpsObservation, float, bool, Dict[str, Any]]:
        """Apply one agent action.

        Returns an OpenEnv-style tuple ``(observation, reward, done, info)``.
        Stepping a finished episode is a no-op with zero reward; hitting the
        step budget ends the episode with the score accumulated so far.
        """
        if self._done:
            return self._build_obs({"status": "done", "message": "Episode over. Call reset()."}), 0.0, True, {}

        self._step_count += 1
        reward = 0.0
        info: Dict[str, Any] = {}
        result: Dict[str, Any] = {}

        # Step-budget exhaustion ends the episode before the action is applied.
        if self._step_count >= self._max_steps:
            self._done = True
            score = self._current_score
            result = {"status": "timeout", "message": f"Max steps ({self._max_steps}) reached.", "score": score}
            return self._build_obs(result), 0.0, True, {"score": score, "reason": "timeout"}

        atype = action.action_type

        # ── read_config ───────────────────────────────────────────────────
        if atype == "read_config":
            reward, result = self._handle_artifact_read("config.yaml", None)

        # ── read_logs ─────────────────────────────────────────────────────
        elif atype == "read_logs":
            reward, result = self._handle_artifact_read("train.log", action.log_filter)

        # ── check_dataset_stats ───────────────────────────────────────────
        elif atype == "check_dataset_stats":
            reward, result = self._handle_artifact_read("dataset_stats.json", None)

        # ── inspect_preprocessing ─────────────────────────────────────────
        elif atype == "inspect_preprocessing":
            reward, result = self._handle_artifact_read("preprocessing.py", None)

        # ── read_eval_results ─────────────────────────────────────────────
        elif atype == "read_eval_results":
            reward, result = self._handle_artifact_read("eval_results.json", None)

        # ── run_sanity_check ──────────────────────────────────────────────
        elif atype == "run_sanity_check":
            check = action.sanity_check_type
            if not check:
                result = {"status": "error", "message": "sanity_check_type is required."}
            else:
                check_result = run_sanity_check(check, self.bug_type, self._artifacts, self._rng)
                # Only the first run of each distinct check earns the bonus.
                if check not in self._sanity_checks_run:
                    self._sanity_checks_run.append(check)
                    reward += 0.01  # small reward for running new checks
                result = {"status": "ok", "sanity_check": check_result}

        # ── query_artifact ────────────────────────────────────────────────
        elif atype == "query_artifact":
            art = action.artifact_name
            field = action.field_path
            if not art or not field:
                result = {"status": "error", "message": "artifact_name and field_path are required."}
            elif art not in self._artifacts:
                result = {"status": "error", "message": f"Artifact '{art}' not found."}
            else:
                val = self._resolve_field(art, field)
                result = {"status": "ok", "artifact": art, "field": field, "value": val}

        # ── submit_diagnosis ──────────────────────────────────────────────
        elif atype == "submit_diagnosis":
            # Grading is terminal: the episode ends regardless of the score.
            reward, info, result = self._handle_submit(action)
            self._done = True

        obs = self._build_obs(result)
        return obs, reward, self._done, info
|
| 228 |
+
|
| 229 |
+
# ── Internal handlers ──────────────────────────────────────────────────────
|
| 230 |
+
|
| 231 |
+
def _handle_artifact_read(self, artifact: str, log_filter: Optional[str]) -> Tuple[float, Dict]:
|
| 232 |
+
is_duplicate = (
|
| 233 |
+
artifact in self._artifacts_read
|
| 234 |
+
and self._last_read_filters.get(artifact, "") == (log_filter or "")
|
| 235 |
+
)
|
| 236 |
+
|
| 237 |
+
content = self._artifacts[artifact]
|
| 238 |
+
|
| 239 |
+
# Apply log filter
|
| 240 |
+
if artifact == "train.log" and log_filter:
|
| 241 |
+
lines = content.split("\n")
|
| 242 |
+
if log_filter.startswith("epoch:"):
|
| 243 |
+
try:
|
| 244 |
+
parts = log_filter.split(":")[1].split("-")
|
| 245 |
+
start, end = int(parts[0]), int(parts[1]) if len(parts) > 1 else int(parts[0])
|
| 246 |
+
filtered = [l for l in lines if any(f"EPOCH {ep:03d}" in l
|
| 247 |
+
for ep in range(start, end+1)) or "[INFO ]" in l or "[ERROR" in l]
|
| 248 |
+
content = "\n".join(filtered) if filtered else "No log lines match this epoch range."
|
| 249 |
+
except Exception:
|
| 250 |
+
content = "\n".join(lines)
|
| 251 |
+
else:
|
| 252 |
+
kw = log_filter.lower()
|
| 253 |
+
filtered = [l for l in lines if kw in l.lower()]
|
| 254 |
+
content = "\n".join(filtered) if filtered else f"No log lines contain '{log_filter}'."
|
| 255 |
+
|
| 256 |
+
reward = 0.0
|
| 257 |
+
if artifact not in self._artifacts_read:
|
| 258 |
+
self._artifacts_read.append(artifact)
|
| 259 |
+
reward = 0.02 # first read reward
|
| 260 |
+
elif is_duplicate:
|
| 261 |
+
self._duplicate_queries += 1
|
| 262 |
+
reward = -0.02 # duplicate penalty
|
| 263 |
+
self._messages.append(f"⚠️ Duplicate read of {artifact} with same filter. Try a different filter or a new artifact.")
|
| 264 |
+
|
| 265 |
+
self._last_read_filters[artifact] = log_filter or ""
|
| 266 |
+
|
| 267 |
+
return reward, {
|
| 268 |
+
"status": "ok",
|
| 269 |
+
"artifact": artifact,
|
| 270 |
+
"content": content,
|
| 271 |
+
"note": "Use log_filter='keyword' or 'epoch:N-M' for targeted log queries.",
|
| 272 |
+
}
|
| 273 |
+
|
| 274 |
+
    def _handle_submit(self, action: MLOpsAction) -> Tuple[float, Dict, Dict]:
        """Grade a submitted diagnosis against the planted bug's ground truth.

        Returns ``(score, info, result)`` where ``score`` (clamped to [0, 1])
        is also used as the terminal reward. Scoring components:
          +0.15 category, +0.25 file, +0.30 field (majority-keyword match),
          up to +0.30 fix (fractional keyword overlap with the gold fix),
          -0.05 if fewer than 3 distinct artifacts were read first.
        Hard task: missed components below 0.70 are penalized 1.5×.
        """
        if len(self._artifacts_read) < 3:
            # Penalty for submitting without adequate investigation
            base_penalty = -0.05
            self._messages.append("⚠️ Submitted diagnosis after reading fewer than 3 artifacts.")
        else:
            base_penalty = 0.0

        score = base_penalty
        breakdown: Dict[str, Any] = {}

        # 1. failure_category (+0.15)
        if action.failure_category == self.bug.category:
            score += 0.15
            breakdown["failure_category"] = {"awarded": 0.15, "correct": True}
        else:
            breakdown["failure_category"] = {
                "awarded": 0.0, "correct": False,
                "expected": self.bug.category, "got": action.failure_category,
            }

        # 2. root_cause_file (+0.25)
        if action.root_cause_file and action.root_cause_file.lower() == self.bug.file.lower():
            score += 0.25
            breakdown["root_cause_file"] = {"awarded": 0.25, "correct": True}
        else:
            breakdown["root_cause_file"] = {
                "awarded": 0.0, "correct": False,
                "expected": self.bug.file, "got": action.root_cause_file,
            }

        # 3. root_cause_field (+0.30) — require majority of keywords to match
        # Dots are treated as word separators so "optimizer.lr" yields two keywords.
        field_keywords = [kw.lower() for kw in self.bug.field.replace(".", " ").split() if len(kw) > 1]
        submitted_field = (action.root_cause_field or "").lower()
        field_matches = sum(1 for kw in field_keywords if kw in submitted_field)
        field_threshold = max(1, len(field_keywords) // 2 + 1)  # majority
        field_correct = len(field_keywords) > 0 and field_matches >= field_threshold
        if field_correct:
            score += 0.30
            breakdown["root_cause_field"] = {"awarded": 0.30, "correct": True}
        else:
            breakdown["root_cause_field"] = {
                "awarded": 0.0, "correct": False,
                "expected": self.bug.field, "got": action.root_cause_field,
                "matched_keywords": field_matches, "required": field_threshold,
            }

        # 4. proposed_fix (+0.30) — keyword match against gold fix
        import re as _re
        _stop = {"to", "the", "a", "an", "of", "in", "on", "from", "use", "with", "and", "or", "for", "is", "at", "by"}
        # Strip punctuation from keywords so "(fitted" becomes "fitted"
        fix_keywords = {
            _re.sub(r'[^a-z0-9_.]', '', w)
            for w in self.bug.gold_fix.lower().split()
        } - _stop
        fix_keywords.discard("")  # remove empty strings
        submitted_fix = (action.proposed_fix or "").lower()
        fix_overlap = sum(1 for kw in fix_keywords if kw in submitted_fix)
        # Fractional credit, capped at the full 0.30 component weight.
        fix_score = min(0.30, 0.30 * (fix_overlap / max(1, len(fix_keywords))))
        score += fix_score
        breakdown["proposed_fix"] = {
            "awarded": round(fix_score, 4),
            "correct": fix_score >= 0.20,
            "keyword_overlap": fix_overlap,
            "total_keywords": len(fix_keywords),
        }

        # Hard task penalty multiplier — silent bugs are more costly
        if self.task_id == "hard" and score < 0.70:
            missed = 0.70 - min(score, 0.70)
            score -= missed * 0.5  # 1.5× penalty on missed components
            breakdown["hard_task_penalty_applied"] = True

        score = round(max(0.0, min(1.0, score)), 4)
        self._current_score = score

        # `info` carries the full ground truth + investigation trace for the
        # RL harness; `result` is the agent-visible summary.
        info = {
            "score": score,
            "breakdown": breakdown,
            "ground_truth": {
                "bug_type": self.bug_type,
                "category": self.bug.category,
                "file": self.bug.file,
                "field": self.bug.field,
                "gold_fix": self.bug.gold_fix,
            },
            "investigation": {
                "artifacts_read": self._artifacts_read,
                "sanity_checks_run": self._sanity_checks_run,
                "duplicate_queries": self._duplicate_queries,
                "steps_taken": self._step_count,
            },
        }
        result = {
            "status": "submitted",
            "score": score,
            "breakdown": breakdown,
            "message": f"Diagnosis submitted. Score: {score:.4f}/{1.0:.4f}",
        }
        return score, info, result
|
| 374 |
+
|
| 375 |
+
def _resolve_field(self, artifact: str, field_path: str) -> Any:
|
| 376 |
+
"""Resolve a dot-notation field path from a JSON artifact."""
|
| 377 |
+
import json as _json
|
| 378 |
+
content = self._artifacts[artifact]
|
| 379 |
+
if artifact.endswith(".json"):
|
| 380 |
+
try:
|
| 381 |
+
data = _json.loads(content)
|
| 382 |
+
parts = field_path.split(".")
|
| 383 |
+
val = data
|
| 384 |
+
for p in parts:
|
| 385 |
+
if isinstance(val, dict):
|
| 386 |
+
val = val.get(p, f"Field '{p}' not found")
|
| 387 |
+
else:
|
| 388 |
+
return f"Cannot traverse into non-dict at '{p}'"
|
| 389 |
+
return val
|
| 390 |
+
except Exception as e:
|
| 391 |
+
return f"Parse error: {e}"
|
| 392 |
+
elif artifact.endswith(".yaml"):
|
| 393 |
+
# Simple key search for YAML
|
| 394 |
+
for line in content.split("\n"):
|
| 395 |
+
target_key = field_path.split(".")[-1]
|
| 396 |
+
if f"{target_key}:" in line:
|
| 397 |
+
return line.strip()
|
| 398 |
+
return f"Field '{field_path}' not found in config"
|
| 399 |
+
else:
|
| 400 |
+
# For .py files, return lines containing the field name
|
| 401 |
+
target = field_path.split(".")[-1]
|
| 402 |
+
matches = [l.strip() for l in content.split("\n") if target in l]
|
| 403 |
+
return matches[:5] if matches else f"'{target}' not found in {artifact}"
|
| 404 |
+
|
| 405 |
+
    def _build_obs(self, last_result: Dict[str, Any]) -> MLOpsObservation:
        """Assemble the observation returned after every reset()/step().

        ``last_result`` is the action-specific result dict; everything else is
        derived from current episode state. Lists are copied so the observation
        is a snapshot, not a live view of mutable state.
        """
        return MLOpsObservation(
            task_id=self.task_id,
            task_description=TASK_DESCRIPTIONS[self.task_id],
            run_id=self._run_id,
            run_summary={
                "model": self._model_cfg["name"],
                "dataset": self._model_cfg["dataset"],
                "task": self._model_cfg["type"],
                # Only the easy task presents as an outright failure; the
                # medium/hard bugs complete "successfully" with anomalies.
                "status": "FAILED" if self.task_id == "easy" else "COMPLETED_WITH_ANOMALIES",
                "note": "Investigate artifacts to determine root cause.",
            },
            available_artifacts=list(self._artifact_meta),
            artifacts_read=list(self._artifacts_read),
            last_action_result=last_result,
            step_count=self._step_count,
            max_steps=self._max_steps,
            done=self._done,
            messages=list(self._messages),
        )
|
| 425 |
+
|
| 426 |
+
    @property
    def state(self) -> MLOpsState:
        """Full internal state, including the planted-bug ground truth.

        Intended for the RL harness and debugging only — exposing this to the
        agent would leak the answer.
        """
        return MLOpsState(
            task_id=self.task_id,
            seed=self._seed,
            step_count=self._step_count,
            max_steps=self._max_steps,
            episode_done=self._done,
            bug_type=self.bug_type,
            bug_category=self.bug.category,
            bug_file=self.bug.file,
            bug_field=self.bug.field,
            gold_fix=self.bug.gold_fix,
            artifacts=self._artifacts,
            artifacts_read=list(self._artifacts_read),
            sanity_checks_run=list(self._sanity_checks_run),
            duplicate_queries=self._duplicate_queries,
            current_score=self._current_score,
        )
|
| 445 |
+
|
| 446 |
+
|
| 447 |
+
# ─── Standalone grader ────────────────────────────────────────────────────────
|
| 448 |
+
|
| 449 |
+
def grade_task(task_id: str, seed: int, diagnosis: Dict[str, Any]) -> float:
    """Deterministic grader callable by OpenEnv validation framework.

    Bypasses the artifact-read penalty since the grader only evaluates
    diagnosis quality, not investigation thoroughness.

    Args:
        task_id: "easy" | "medium" | "hard".
        seed: Episode seed; the same (task_id, seed) yields the same bug.
        diagnosis: Keyword fields of a submit_diagnosis MLOpsAction
            (failure_category, root_cause_file, root_cause_field, proposed_fix, ...).

    Returns:
        The final score in [0.0, 1.0].
    """
    env = MLOpsEnvironment(task_id=task_id)
    env.reset(seed=seed)
    # Pre-populate artifact reads to avoid the < 3 artifacts penalty
    env._artifacts_read = list(env._artifacts.keys())
    action = MLOpsAction(action_type="submit_diagnosis", **diagnosis)
    _, reward, _, info = env.step(action)
    return info.get("score", 0.0)
|
server/models.py
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
MLOps Pipeline Debugger — Pydantic Models
|
| 3 |
+
|
| 4 |
+
The agent acts as an ML engineer investigating a broken training run.
|
| 5 |
+
It has access to training artifacts (logs, configs, dataset stats, preprocessing code)
|
| 6 |
+
and must diagnose the root cause through systematic investigation.
|
| 7 |
+
|
| 8 |
+
Action Space:
|
| 9 |
+
read_config → Get training configuration (hyperparams, model arch, optimizer)
|
| 10 |
+
read_logs → Get training logs (filterable by keyword/epoch range)
|
| 11 |
+
check_dataset_stats → Get dataset split sizes, class distribution, feature statistics
|
| 12 |
+
inspect_preprocessing → Read preprocessing pipeline code
|
| 13 |
+
read_eval_results → Get validation and test set evaluation metrics
|
| 14 |
+
run_sanity_check → Compute a specific diagnostic check (label overlap, class balance, etc.)
|
| 15 |
+
query_artifact → Fetch a specific field from any artifact
|
| 16 |
+
submit_diagnosis → Final answer — triggers grading
|
| 17 |
+
|
| 18 |
+
Observation Space:
|
| 19 |
+
task_id, task_description
|
| 20 |
+
available_artifacts — list of artifacts the agent can inspect
|
| 21 |
+
last_action_result — result of the most recent action
|
| 22 |
+
artifacts_read — which artifacts have been read so far (exploration tracking)
|
| 23 |
+
step_count, max_steps
|
| 24 |
+
done
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
from __future__ import annotations
|
| 28 |
+
from typing import Any, Dict, List, Literal, Optional
|
| 29 |
+
from pydantic import BaseModel, Field
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# ─── Action ──────────────────────────────────────────────────────────────────
|
| 33 |
+
|
| 34 |
+
class MLOpsAction(BaseModel):
    """
    One action the agent can take per step.

    action_type determines which fields are used:
        read_config           → (no extra fields)
        read_logs             → log_filter (optional keyword or "epoch:N-M")
        check_dataset_stats   → (no extra fields)
        inspect_preprocessing → (no extra fields)
        read_eval_results     → (no extra fields)
        run_sanity_check      → sanity_check_type (required)
        query_artifact        → artifact_name + field_path (required)
        submit_diagnosis      → all diagnosis fields (required)

    Fields irrelevant to the chosen action_type are ignored by the environment.
    """
    action_type: Literal[
        "read_config",
        "read_logs",
        "check_dataset_stats",
        "inspect_preprocessing",
        "read_eval_results",
        "run_sanity_check",
        "query_artifact",
        "submit_diagnosis",
    ] = Field(..., description="Which action to perform")

    # read_logs
    log_filter: Optional[str] = Field(
        None,
        description="Filter logs by keyword (e.g. 'nan', 'warning', 'error') or epoch range 'epoch:1-5'"
    )

    # run_sanity_check
    sanity_check_type: Optional[Literal[
        "label_consistency",      # Are train/eval label mappings identical?
        "data_leakage",           # Is there train/val sample overlap?
        "gradient_norms",         # Are gradient norms within normal range?
        "class_balance",          # Are classes balanced across splits?
        "feature_statistics",     # Do train/val feature distributions match?
        "encoder_version_match",  # Do all pipeline stages use the same encoder version?
        "loss_trajectory",        # Is the loss curve shape anomalous?
        "metric_gap_analysis",    # Is val vs test metric gap suspiciously large?
    ]] = Field(None, description="Type of sanity check to run")

    # query_artifact
    artifact_name: Optional[Literal[
        "config.yaml",
        "train.log",
        "dataset_stats.json",
        "preprocessing.py",
        "eval_results.json",
        "model_card.json",
    ]] = Field(None, description="Artifact to query a specific field from")
    field_path: Optional[str] = Field(
        None,
        description="Dot-notation field path, e.g. 'optimizer.learning_rate' or 'metrics.val_accuracy'"
    )

    # submit_diagnosis
    failure_category: Optional[Literal[
        "config_error",       # Wrong hyperparameter value
        "data_leakage",       # Train/val contamination
        "evaluation_bug",     # Eval pipeline uses wrong artifacts
        "preprocessing_bug",  # Data transformation applied incorrectly
        "label_mismatch",     # Label encoding inconsistency
        "architecture_bug",   # Model architecture misconfiguration
    ]] = Field(None, description="Category of the failure")
    root_cause_file: Optional[str] = Field(
        None, description="Which artifact file contains the root cause"
    )
    root_cause_field: Optional[str] = Field(
        None, description="Specific parameter, function, or variable that is wrong"
    )
    diagnosis: Optional[str] = Field(
        None, description="Natural language explanation of what went wrong and why"
    )
    proposed_fix: Optional[str] = Field(
        None, description="Concrete change that would fix the issue"
    )
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
# ─── Observation ─────────────────────────────────────────────────────────────
|
| 115 |
+
|
| 116 |
+
class ArtifactMeta(BaseModel):
    """Lightweight listing entry for one investigable artifact."""
    name: str           # artifact filename, e.g. "config.yaml"
    description: str    # one-line human summary of the artifact's contents
    size_hint: str      # e.g. "47 lines", "12 fields"
    last_modified: str  # date string, e.g. "2024-03-17" (cosmetic, generated)
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
class MLOpsObservation(BaseModel):
    """Everything the agent sees after each step / reset."""
    task_id: str           # "easy" | "medium" | "hard"
    task_description: str  # full task briefing text

    # Run summary — always visible
    run_id: str
    run_summary: Dict[str, Any] = Field(
        description="High-level run info: model, dataset, final loss, training status"
    )

    available_artifacts: List[ArtifactMeta]
    artifacts_read: List[str] = Field(
        default_factory=list,
        description="Names of artifacts the agent has already read"
    )

    # Result of the most recent action (content, sanity-check output, errors, …).
    last_action_result: Dict[str, Any] = Field(default_factory=dict)

    step_count: int = 0
    max_steps: int = 30
    done: bool = False
    # Advisory messages accumulated over the episode (e.g. duplicate-read warnings).
    messages: List[str] = Field(default_factory=list)
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
# ─── State ───────────────────────────────────────────────────────────────────
|
| 149 |
+
|
| 150 |
+
class MLOpsState(BaseModel):
    """Full internal state — for RL harness and debugging.

    Contains the planted-bug ground truth; never expose to the agent.
    """
    task_id: str
    seed: int
    step_count: int
    max_steps: int
    episode_done: bool

    # Planted bug ground truth
    bug_type: str      # key into the bug catalogue, e.g. "exploding_lr"
    bug_category: str  # grading target for failure_category
    bug_file: str      # grading target for root_cause_file
    bug_field: str     # grading target for root_cause_field
    gold_fix: str      # reference fix text matched against proposed_fix

    # All generated artifacts (full text)
    artifacts: Dict[str, str]

    # Agent's investigation history
    artifacts_read: List[str]
    sanity_checks_run: List[str]
    duplicate_queries: int

    # Final score once a diagnosis is submitted (0.0 until then).
    current_score: float
|
server/openenv.yaml
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: mlops-debug-env
|
| 2 |
+
version: "1.0.0"
|
| 3 |
+
description: >
|
| 4 |
+
MLOps Pipeline Debugger: an AI agent acts as a senior ML engineer
|
| 5 |
+
investigating a broken training run. The environment procedurally generates
|
| 6 |
+
realistic training artifacts (logs, configs, preprocessing code, eval results)
|
| 7 |
+
with one planted fault. The agent must systematically investigate and submit
|
| 8 |
+
a structured diagnosis. Three tasks: config error (easy) → data leakage (medium)
|
| 9 |
+
→ silent evaluation bug (hard). All graders are fully deterministic.
|
| 10 |
+
author: Mohit Goyal
|
| 11 |
+
license: MIT
|
| 12 |
+
tags: [openenv, rl, mlops, debugging, machine-learning, agents]
|
| 13 |
+
tasks:
|
| 14 |
+
- id: easy
|
| 15 |
+
name: Config Error Diagnosis
|
| 16 |
+
difficulty: easy
|
| 17 |
+
max_steps: 20
|
| 18 |
+
bug_pool: [exploding_lr, wrong_optimizer, batch_size_overflow]
|
| 19 |
+
reward_range: [0.0, 1.0]
|
| 20 |
+
- id: medium
|
| 21 |
+
name: Data Leakage Detection
|
| 22 |
+
difficulty: medium
|
| 23 |
+
max_steps: 30
|
| 24 |
+
bug_pool: [data_leakage_scaler, data_leakage_overlap, wrong_split_ratio]
|
| 25 |
+
reward_range: [0.0, 1.0]
|
| 26 |
+
- id: hard
|
| 27 |
+
name: Silent Evaluation Bug
|
| 28 |
+
difficulty: hard
|
| 29 |
+
max_steps: 40
|
| 30 |
+
bug_pool: [label_encoder_mismatch, silent_metric_swap, tokenizer_version_drift]
|
| 31 |
+
reward_range: [0.0, 1.0]
|
| 32 |
+
asymmetric_penalty: true
|
| 33 |
+
action_space:
|
| 34 |
+
type: discrete_structured
|
| 35 |
+
actions: [read_config, read_logs, check_dataset_stats, inspect_preprocessing,
|
| 36 |
+
read_eval_results, run_sanity_check, query_artifact, submit_diagnosis]
|
| 37 |
+
observation_space:
|
| 38 |
+
type: structured_text
|
| 39 |
+
fields: [task_id, run_summary, available_artifacts, artifacts_read,
|
| 40 |
+
last_action_result, step_count, max_steps, done, messages]
|
| 41 |
+
reward:
|
| 42 |
+
type: dense_and_terminal
|
| 43 |
+
per_step: "+0.02 new artifact read, -0.02 duplicate read, +0.01 new sanity check"
|
| 44 |
+
terminal: "0.15 category + 0.25 file + 0.30 field + 0.30 fix. Hard task 1.5x penalty."
|
| 45 |
+
api:
|
| 46 |
+
reset: POST /reset
|
| 47 |
+
step: POST /step
|
| 48 |
+
state: GET /state
|
| 49 |
+
health: GET /health
|
| 50 |
+
websocket: /ws
|
| 51 |
+
runtime:
|
| 52 |
+
port: 7860
|
| 53 |
+
workers: 1
|
| 54 |
+
framework: fastapi
|
| 55 |
+
python: "3.11"
|
server/openenv_state.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
from typing import Dict, List
|
| 5 |
+
|
| 6 |
+
from pydantic import BaseModel
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class OpenEnvState(BaseModel):
    """Mutable snapshot of a single environment run.

    Holds run identity, episode progress, and reward/artifact bookkeeping
    for the OpenEnv-compatible MLOps debugging environment. A single
    module-level instance is updated in place during a run.
    """

    run_id: str  # unique identifier of the current run ("" before first reset)
    task_id: str  # active task id ("" before first reset)
    seed: int  # seed for the episode (presumably drives deterministic setup — confirm against env)
    step_count: int  # steps taken so far in the current episode
    max_steps: int  # step budget for the episode
    scores: dict[str, float]  # per-difficulty scores, keyed "easy"/"medium"/"hard"
    end_score: float  # terminal score of the most recently finished episode
    rewards: list[float]  # per-step reward history
    artifacts_read: list[str]  # artifact names already read this episode
    timestamp: str  # ISO-8601 timestamp string
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
# Global current state (mutable during run).
# Uses an aware UTC timestamp: datetime.utcnow() is deprecated since
# Python 3.12 and returns a naive datetime, which is easy to misuse.
OPENENV_STATE: OpenEnvState = OpenEnvState(
    run_id="",
    task_id="",
    seed=0,
    step_count=0,
    max_steps=30,
    scores={"easy": 0.0, "medium": 0.0, "hard": 0.0},
    end_score=0.0,
    rewards=[],
    artifacts_read=[],
    timestamp=datetime.now(timezone.utc).isoformat(),
)
|
server/pyproject.toml
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "mlops-openenv"
|
| 3 |
+
version = "1.0.0"
|
| 4 |
+
description = "MLOps Pipeline Debugger - OpenEnv-compatible RL environment for ML training debugging"
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
requires-python = ">=3.11"
|
| 7 |
+
license = {text = "MIT"}
|
| 8 |
+
authors = [{name = "MLOps Team"}]
|
| 9 |
+
|
| 10 |
+
dependencies = [
|
| 11 |
+
"fastapi>=0.115.0",
|
| 12 |
+
"uvicorn[standard]>=0.30.0",
|
| 13 |
+
"pydantic>=2.9.0",
|
| 14 |
+
"httpx>=0.27.0",
|
| 15 |
+
"openai>=1.51.0",
|
| 16 |
+
"websockets>=13.0",
|
| 17 |
+
"numpy>=1.26.0",
|
| 18 |
+
"python-multipart>=0.0.12",
|
| 19 |
+
"dotenv>=1.0.0",
|
| 20 |
+
"openenv-core>=0.2.0",
|
| 21 |
+
]
|
| 22 |
+
|
| 23 |
+
[project.scripts]
# Entry points must reference an importable callable (module:attr), not a
# shell command line. Start the server with:
#   uvicorn app:app --host 0.0.0.0 --port 7860
server = "uvicorn.main:main"
|
| 25 |
+
|
| 26 |
+
[project.optional-dependencies]
|
| 27 |
+
dev = [
|
| 28 |
+
"pytest>=8.0.0",
|
| 29 |
+
"ruff>=0.6.0",
|
| 30 |
+
]
|
| 31 |
+
|
| 32 |
+
[build-system]
|
| 33 |
+
requires = ["setuptools>=61.0"]
|
| 34 |
+
build-backend = "setuptools.build_meta"
|
| 35 |
+
|
| 36 |
+
[tool.ruff]
|
| 37 |
+
line-length = 100
|
| 38 |
+
target-version = "py311"
|
| 39 |
+
|
| 40 |
+
[tool.pytest.ini_options]
|
| 41 |
+
testpaths = ["tests"]
|
| 42 |
+
python_files = ["test_*.py"]
|
server/requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi==0.115.0
|
| 2 |
+
uvicorn[standard]==0.30.6
|
| 3 |
+
pydantic==2.9.2
|
| 4 |
+
httpx==0.27.2
|
| 5 |
+
openai==1.51.0
|
| 6 |
+
websockets==13.1
|
| 7 |
+
numpy==1.26.4
|
| 8 |
+
python-multipart==0.0.12
openenv-core==0.2.0
|
server/uv.lock
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Generated by uv
|
| 2 |
+
# This is a stub lockfile - real lockfile would be much larger
|
| 3 |
+
|
| 4 |
+
[[package]]
|
| 5 |
+
name = "openenv-core"
|
| 6 |
+
version = "0.2.0"
|
| 7 |
+
source = { registry = "https://pypi.org/simple" }
|
| 8 |
+
|
| 9 |
+
[[package]]
|
| 10 |
+
name = "fastapi"
|
| 11 |
+
version = "0.115.0"
|
| 12 |
+
source = { registry = "https://pypi.org/simple" }
|
| 13 |
+
|
| 14 |
+
[[package]]
|
| 15 |
+
name = "uvicorn"
|
| 16 |
+
version = "0.30.6"
|
| 17 |
+
source = { registry = "https://pypi.org/simple" }
|
| 18 |
+
|
| 19 |
+
[[package]]
|
| 20 |
+
name = "pydantic"
|
| 21 |
+
version = "2.9.2"
|
| 22 |
+
source = { registry = "https://pypi.org/simple" }
|
| 23 |
+
|
| 24 |
+
[[package]]
|
| 25 |
+
name = "httpx"
|
| 26 |
+
version = "0.27.2"
|
| 27 |
+
source = { registry = "https://pypi.org/simple" }
|
| 28 |
+
|
| 29 |
+
[[package]]
|
| 30 |
+
name = "openai"
|
| 31 |
+
version = "1.51.0"
|
| 32 |
+
source = { registry = "https://pypi.org/simple" }
|
uv.lock
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Generated by uv
|
| 2 |
+
# This is a stub lockfile - real lockfile would be much larger
|
| 3 |
+
|
| 4 |
+
[[package]]
|
| 5 |
+
name = "openenv-core"
|
| 6 |
+
version = "0.2.0"
|
| 7 |
+
source = { registry = "https://pypi.org/simple" }
|
| 8 |
+
|
| 9 |
+
[[package]]
|
| 10 |
+
name = "fastapi"
|
| 11 |
+
version = "0.115.0"
|
| 12 |
+
source = { registry = "https://pypi.org/simple" }
|
| 13 |
+
|
| 14 |
+
[[package]]
|
| 15 |
+
name = "uvicorn"
|
| 16 |
+
version = "0.30.6"
|
| 17 |
+
source = { registry = "https://pypi.org/simple" }
|
| 18 |
+
|
| 19 |
+
[[package]]
|
| 20 |
+
name = "pydantic"
|
| 21 |
+
version = "2.9.2"
|
| 22 |
+
source = { registry = "https://pypi.org/simple" }
|
| 23 |
+
|
| 24 |
+
[[package]]
|
| 25 |
+
name = "httpx"
|
| 26 |
+
version = "0.27.2"
|
| 27 |
+
source = { registry = "https://pypi.org/simple" }
|
| 28 |
+
|
| 29 |
+
[[package]]
|
| 30 |
+
name = "openai"
|
| 31 |
+
version = "1.51.0"
|
| 32 |
+
source = { registry = "https://pypi.org/simple" }
|