---
# Environment manifest for the MLOps Pipeline Debugger (OpenEnv-style).
name: mlops-debug-env
version: "1.0.0"
description: >
  MLOps Pipeline Debugger: an AI agent acts as a senior ML engineer
  investigating a broken training run. The environment procedurally
  generates realistic training artifacts (logs, configs, preprocessing
  code, eval results) with one planted fault. The agent must
  systematically investigate and submit a structured diagnosis. Three
  tasks: config error (easy) → data leakage (medium) → silent
  evaluation bug (hard). All graders are fully deterministic.
author: Mohit Goyal
license: MIT
tags: [openenv, rl, mlops, debugging, machine-learning, agents]

# One entry per difficulty tier; bug_pool lists the candidate planted faults.
tasks:
  - id: easy
    name: Config Error Diagnosis
    difficulty: easy
    max_steps: 20
    bug_pool: [exploding_lr, wrong_optimizer, batch_size_overflow]
    reward_range: [0.0, 1.0]
  - id: medium
    name: Data Leakage Detection
    difficulty: medium
    max_steps: 30
    bug_pool: [data_leakage_scaler, data_leakage_overlap, wrong_split_ratio]
    reward_range: [0.0, 1.0]
  - id: hard
    name: Silent Evaluation Bug
    difficulty: hard
    max_steps: 40
    bug_pool: [label_encoder_mismatch, silent_metric_swap, tokenizer_version_drift]
    reward_range: [0.0, 1.0]
    # Hard task applies the 1.5x penalty described in reward.terminal.
    asymmetric_penalty: true

action_space:
  type: discrete_structured
  actions:
    - read_config
    - read_logs
    - check_dataset_stats
    - inspect_preprocessing
    - read_eval_results
    - run_sanity_check
    - query_artifact
    - submit_diagnosis

observation_space:
  type: structured_text
  fields:
    - task_id
    - run_summary
    - available_artifacts
    - artifacts_read
    - last_action_result
    - step_count
    - max_steps
    - done
    - messages

reward:
  type: dense_and_terminal
  per_step: "+0.02 new artifact read, -0.02 duplicate read, +0.01 new sanity check"
  terminal: "0.15 category + 0.25 file + 0.30 field + 0.30 fix. Hard task 1.5x penalty."

# HTTP/WebSocket endpoints exposed by the server.
api:
  reset: POST /reset
  step: POST /step
  state: GET /state
  health: GET /health
  websocket: /ws

runtime:
  port: 7860
  workers: 1
  framework: fastapi
  python: "3.11"