chaosops-grpo-lora / evaluation.json
helloAK96's picture
Add post-training evaluation.json
e221bf5 verified
Invalid JSON: Unexpected token 'N', ..." "mttr": NaN, "... is not valid JSON
{
"policies": [
"random",
"heuristic",
"oracle",
"trained"
],
"tiers": [
"easy",
"medium",
"hard"
],
"episodes_per_type": 5,
"per_episode": [
{
"policy": "random",
"tier": "easy",
"failure_type": "db_deadlock",
"seed": 0,
"resolved": true,
"steps": 3,
"cumulative_reward": 54.0,
"wrong_fixes": 1,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "easy",
"failure_type": "db_deadlock",
"seed": 31,
"resolved": false,
"steps": 12,
"cumulative_reward": -516.0,
"wrong_fixes": 6,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "easy",
"failure_type": "db_deadlock",
"seed": 62,
"resolved": false,
"steps": 12,
"cumulative_reward": -516.0,
"wrong_fixes": 6,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "easy",
"failure_type": "db_deadlock",
"seed": 93,
"resolved": true,
"steps": 9,
"cumulative_reward": -122.0,
"wrong_fixes": 3,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "easy",
"failure_type": "db_deadlock",
"seed": 124,
"resolved": false,
"steps": 12,
"cumulative_reward": -666.0,
"wrong_fixes": 9,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "easy",
"failure_type": "memory_leak",
"seed": 97,
"resolved": true,
"steps": 6,
"cumulative_reward": -20.0,
"wrong_fixes": 2,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "easy",
"failure_type": "memory_leak",
"seed": 128,
"resolved": true,
"steps": 9,
"cumulative_reward": -172.0,
"wrong_fixes": 4,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "easy",
"failure_type": "memory_leak",
"seed": 159,
"resolved": true,
"steps": 10,
"cumulative_reward": -290.0,
"wrong_fixes": 6,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "easy",
"failure_type": "memory_leak",
"seed": 190,
"resolved": true,
"steps": 9,
"cumulative_reward": -172.0,
"wrong_fixes": 4,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "easy",
"failure_type": "memory_leak",
"seed": 221,
"resolved": false,
"steps": 12,
"cumulative_reward": -566.0,
"wrong_fixes": 7,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "easy",
"failure_type": "bad_config_push",
"seed": 194,
"resolved": false,
"steps": 12,
"cumulative_reward": -566.0,
"wrong_fixes": 7,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "easy",
"failure_type": "bad_config_push",
"seed": 225,
"resolved": false,
"steps": 12,
"cumulative_reward": -566.0,
"wrong_fixes": 7,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "easy",
"failure_type": "bad_config_push",
"seed": 256,
"resolved": true,
"steps": 3,
"cumulative_reward": 4.0,
"wrong_fixes": 2,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "easy",
"failure_type": "bad_config_push",
"seed": 287,
"resolved": true,
"steps": 7,
"cumulative_reward": -132.0,
"wrong_fixes": 4,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "easy",
"failure_type": "bad_config_push",
"seed": 318,
"resolved": false,
"steps": 12,
"cumulative_reward": -566.0,
"wrong_fixes": 7,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "easy",
"failure_type": "dns_outage",
"seed": 291,
"resolved": true,
"steps": 9,
"cumulative_reward": -222.0,
"wrong_fixes": 5,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "easy",
"failure_type": "dns_outage",
"seed": 322,
"resolved": false,
"steps": 12,
"cumulative_reward": -566.0,
"wrong_fixes": 7,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "easy",
"failure_type": "dns_outage",
"seed": 353,
"resolved": true,
"steps": 9,
"cumulative_reward": -222.0,
"wrong_fixes": 5,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "easy",
"failure_type": "dns_outage",
"seed": 384,
"resolved": true,
"steps": 6,
"cumulative_reward": -70.0,
"wrong_fixes": 3,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "easy",
"failure_type": "dns_outage",
"seed": 415,
"resolved": false,
"steps": 12,
"cumulative_reward": -416.0,
"wrong_fixes": 4,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "easy",
"failure_type": "db_deadlock",
"seed": 0,
"resolved": true,
"steps": 1,
"cumulative_reward": 110.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "easy",
"failure_type": "db_deadlock",
"seed": 31,
"resolved": true,
"steps": 5,
"cumulative_reward": 90.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "easy",
"failure_type": "db_deadlock",
"seed": 62,
"resolved": true,
"steps": 1,
"cumulative_reward": 110.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "easy",
"failure_type": "db_deadlock",
"seed": 93,
"resolved": true,
"steps": 1,
"cumulative_reward": 110.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "easy",
"failure_type": "db_deadlock",
"seed": 124,
"resolved": true,
"steps": 1,
"cumulative_reward": 110.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "easy",
"failure_type": "memory_leak",
"seed": 97,
"resolved": true,
"steps": 1,
"cumulative_reward": 110.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "easy",
"failure_type": "memory_leak",
"seed": 128,
"resolved": true,
"steps": 1,
"cumulative_reward": 110.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "easy",
"failure_type": "memory_leak",
"seed": 159,
"resolved": true,
"steps": 1,
"cumulative_reward": 110.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "easy",
"failure_type": "memory_leak",
"seed": 190,
"resolved": true,
"steps": 3,
"cumulative_reward": 104.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "easy",
"failure_type": "memory_leak",
"seed": 221,
"resolved": true,
"steps": 1,
"cumulative_reward": 110.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "easy",
"failure_type": "bad_config_push",
"seed": 194,
"resolved": true,
"steps": 3,
"cumulative_reward": 54.0,
"wrong_fixes": 1,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "easy",
"failure_type": "bad_config_push",
"seed": 225,
"resolved": true,
"steps": 3,
"cumulative_reward": 54.0,
"wrong_fixes": 1,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "easy",
"failure_type": "bad_config_push",
"seed": 256,
"resolved": true,
"steps": 3,
"cumulative_reward": 54.0,
"wrong_fixes": 1,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "easy",
"failure_type": "bad_config_push",
"seed": 287,
"resolved": true,
"steps": 3,
"cumulative_reward": 54.0,
"wrong_fixes": 1,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "easy",
"failure_type": "bad_config_push",
"seed": 318,
"resolved": true,
"steps": 3,
"cumulative_reward": 54.0,
"wrong_fixes": 1,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "easy",
"failure_type": "dns_outage",
"seed": 291,
"resolved": true,
"steps": 1,
"cumulative_reward": 110.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "easy",
"failure_type": "dns_outage",
"seed": 322,
"resolved": true,
"steps": 1,
"cumulative_reward": 110.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "easy",
"failure_type": "dns_outage",
"seed": 353,
"resolved": true,
"steps": 3,
"cumulative_reward": 104.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "easy",
"failure_type": "dns_outage",
"seed": 384,
"resolved": true,
"steps": 1,
"cumulative_reward": 110.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "easy",
"failure_type": "dns_outage",
"seed": 415,
"resolved": true,
"steps": 1,
"cumulative_reward": 110.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "oracle",
"tier": "easy",
"failure_type": "db_deadlock",
"seed": 0,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "oracle",
"tier": "easy",
"failure_type": "db_deadlock",
"seed": 31,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "oracle",
"tier": "easy",
"failure_type": "db_deadlock",
"seed": 62,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "oracle",
"tier": "easy",
"failure_type": "db_deadlock",
"seed": 93,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "oracle",
"tier": "easy",
"failure_type": "db_deadlock",
"seed": 124,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "oracle",
"tier": "easy",
"failure_type": "memory_leak",
"seed": 97,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "oracle",
"tier": "easy",
"failure_type": "memory_leak",
"seed": 128,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "oracle",
"tier": "easy",
"failure_type": "memory_leak",
"seed": 159,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "oracle",
"tier": "easy",
"failure_type": "memory_leak",
"seed": 190,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "oracle",
"tier": "easy",
"failure_type": "memory_leak",
"seed": 221,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "oracle",
"tier": "easy",
"failure_type": "bad_config_push",
"seed": 194,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "oracle",
"tier": "easy",
"failure_type": "bad_config_push",
"seed": 225,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "oracle",
"tier": "easy",
"failure_type": "bad_config_push",
"seed": 256,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "oracle",
"tier": "easy",
"failure_type": "bad_config_push",
"seed": 287,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "oracle",
"tier": "easy",
"failure_type": "bad_config_push",
"seed": 318,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "oracle",
"tier": "easy",
"failure_type": "dns_outage",
"seed": 291,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "oracle",
"tier": "easy",
"failure_type": "dns_outage",
"seed": 322,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "oracle",
"tier": "easy",
"failure_type": "dns_outage",
"seed": 353,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "oracle",
"tier": "easy",
"failure_type": "dns_outage",
"seed": 384,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "oracle",
"tier": "easy",
"failure_type": "dns_outage",
"seed": 415,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "easy",
"failure_type": "db_deadlock",
"seed": 0,
"resolved": false,
"steps": 12,
"cumulative_reward": -356.0,
"wrong_fixes": 2,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "easy",
"failure_type": "db_deadlock",
"seed": 31,
"resolved": false,
"steps": 12,
"cumulative_reward": -286.0,
"wrong_fixes": 1,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "easy",
"failure_type": "db_deadlock",
"seed": 62,
"resolved": false,
"steps": 12,
"cumulative_reward": -346.0,
"wrong_fixes": 1,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "easy",
"failure_type": "db_deadlock",
"seed": 93,
"resolved": false,
"steps": 12,
"cumulative_reward": -176.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "easy",
"failure_type": "db_deadlock",
"seed": 124,
"resolved": false,
"steps": 12,
"cumulative_reward": -306.0,
"wrong_fixes": 1,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "easy",
"failure_type": "memory_leak",
"seed": 97,
"resolved": false,
"steps": 12,
"cumulative_reward": -256.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "easy",
"failure_type": "memory_leak",
"seed": 128,
"resolved": false,
"steps": 12,
"cumulative_reward": -276.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "easy",
"failure_type": "memory_leak",
"seed": 159,
"resolved": false,
"steps": 12,
"cumulative_reward": -276.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "easy",
"failure_type": "memory_leak",
"seed": 190,
"resolved": false,
"steps": 12,
"cumulative_reward": -256.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "easy",
"failure_type": "memory_leak",
"seed": 221,
"resolved": false,
"steps": 12,
"cumulative_reward": -256.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "easy",
"failure_type": "bad_config_push",
"seed": 194,
"resolved": false,
"steps": 12,
"cumulative_reward": -196.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "easy",
"failure_type": "bad_config_push",
"seed": 225,
"resolved": false,
"steps": 12,
"cumulative_reward": -176.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "easy",
"failure_type": "bad_config_push",
"seed": 256,
"resolved": false,
"steps": 12,
"cumulative_reward": -256.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "easy",
"failure_type": "bad_config_push",
"seed": 287,
"resolved": false,
"steps": 12,
"cumulative_reward": -216.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "easy",
"failure_type": "bad_config_push",
"seed": 318,
"resolved": false,
"steps": 12,
"cumulative_reward": -226.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "easy",
"failure_type": "dns_outage",
"seed": 291,
"resolved": false,
"steps": 12,
"cumulative_reward": -226.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "easy",
"failure_type": "dns_outage",
"seed": 322,
"resolved": false,
"steps": 12,
"cumulative_reward": -196.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "easy",
"failure_type": "dns_outage",
"seed": 353,
"resolved": false,
"steps": 12,
"cumulative_reward": -276.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "easy",
"failure_type": "dns_outage",
"seed": 384,
"resolved": false,
"steps": 12,
"cumulative_reward": -216.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "easy",
"failure_type": "dns_outage",
"seed": 415,
"resolved": false,
"steps": 12,
"cumulative_reward": -256.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "medium",
"failure_type": "cascade",
"seed": 0,
"resolved": false,
"steps": 18,
"cumulative_reward": -992.0,
"wrong_fixes": 11,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "medium",
"failure_type": "cascade",
"seed": 31,
"resolved": false,
"steps": 18,
"cumulative_reward": -842.0,
"wrong_fixes": 8,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "medium",
"failure_type": "cascade",
"seed": 62,
"resolved": false,
"steps": 18,
"cumulative_reward": -932.0,
"wrong_fixes": 9,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "medium",
"failure_type": "cascade",
"seed": 93,
"resolved": false,
"steps": 18,
"cumulative_reward": -692.0,
"wrong_fixes": 5,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "medium",
"failure_type": "cascade",
"seed": 124,
"resolved": false,
"steps": 18,
"cumulative_reward": -1082.0,
"wrong_fixes": 12,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "medium",
"failure_type": "autoscaler_cost_cut",
"seed": 97,
"resolved": false,
"steps": 18,
"cumulative_reward": -802.0,
"wrong_fixes": 8,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "medium",
"failure_type": "autoscaler_cost_cut",
"seed": 128,
"resolved": false,
"steps": 18,
"cumulative_reward": -952.0,
"wrong_fixes": 11,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "medium",
"failure_type": "autoscaler_cost_cut",
"seed": 159,
"resolved": true,
"steps": 3,
"cumulative_reward": 4.0,
"wrong_fixes": 2,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "medium",
"failure_type": "autoscaler_cost_cut",
"seed": 190,
"resolved": false,
"steps": 18,
"cumulative_reward": -1002.0,
"wrong_fixes": 12,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "medium",
"failure_type": "autoscaler_cost_cut",
"seed": 221,
"resolved": true,
"steps": 4,
"cumulative_reward": 48.0,
"wrong_fixes": 1,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "medium",
"failure_type": "disk_full",
"seed": 194,
"resolved": true,
"steps": 5,
"cumulative_reward": -60.0,
"wrong_fixes": 3,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "medium",
"failure_type": "disk_full",
"seed": 225,
"resolved": true,
"steps": 12,
"cumulative_reward": -332.0,
"wrong_fixes": 6,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "medium",
"failure_type": "disk_full",
"seed": 256,
"resolved": true,
"steps": 17,
"cumulative_reward": -872.0,
"wrong_fixes": 14,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "medium",
"failure_type": "disk_full",
"seed": 287,
"resolved": true,
"steps": 11,
"cumulative_reward": -310.0,
"wrong_fixes": 6,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "medium",
"failure_type": "disk_full",
"seed": 318,
"resolved": false,
"steps": 18,
"cumulative_reward": -802.0,
"wrong_fixes": 8,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "medium",
"failure_type": "cascade",
"seed": 0,
"resolved": true,
"steps": 1,
"cumulative_reward": 110.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "medium",
"failure_type": "cascade",
"seed": 31,
"resolved": true,
"steps": 5,
"cumulative_reward": 90.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "medium",
"failure_type": "cascade",
"seed": 62,
"resolved": true,
"steps": 1,
"cumulative_reward": 110.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "medium",
"failure_type": "cascade",
"seed": 93,
"resolved": true,
"steps": 1,
"cumulative_reward": 110.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "medium",
"failure_type": "cascade",
"seed": 124,
"resolved": true,
"steps": 1,
"cumulative_reward": 110.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "medium",
"failure_type": "autoscaler_cost_cut",
"seed": 97,
"resolved": true,
"steps": 1,
"cumulative_reward": 110.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "medium",
"failure_type": "autoscaler_cost_cut",
"seed": 128,
"resolved": true,
"steps": 1,
"cumulative_reward": 110.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "medium",
"failure_type": "autoscaler_cost_cut",
"seed": 159,
"resolved": true,
"steps": 1,
"cumulative_reward": 110.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "medium",
"failure_type": "autoscaler_cost_cut",
"seed": 190,
"resolved": true,
"steps": 1,
"cumulative_reward": 110.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "medium",
"failure_type": "autoscaler_cost_cut",
"seed": 221,
"resolved": true,
"steps": 1,
"cumulative_reward": 110.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "medium",
"failure_type": "disk_full",
"seed": 194,
"resolved": true,
"steps": 1,
"cumulative_reward": 110.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "medium",
"failure_type": "disk_full",
"seed": 225,
"resolved": true,
"steps": 1,
"cumulative_reward": 110.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "medium",
"failure_type": "disk_full",
"seed": 256,
"resolved": true,
"steps": 1,
"cumulative_reward": 110.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "medium",
"failure_type": "disk_full",
"seed": 287,
"resolved": true,
"steps": 1,
"cumulative_reward": 110.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "medium",
"failure_type": "disk_full",
"seed": 318,
"resolved": true,
"steps": 1,
"cumulative_reward": 110.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "oracle",
"tier": "medium",
"failure_type": "cascade",
"seed": 0,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "oracle",
"tier": "medium",
"failure_type": "cascade",
"seed": 31,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "oracle",
"tier": "medium",
"failure_type": "cascade",
"seed": 62,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "oracle",
"tier": "medium",
"failure_type": "cascade",
"seed": 93,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "oracle",
"tier": "medium",
"failure_type": "cascade",
"seed": 124,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "oracle",
"tier": "medium",
"failure_type": "autoscaler_cost_cut",
"seed": 97,
"resolved": true,
"steps": 3,
"cumulative_reward": 184.0,
"wrong_fixes": 0,
"oversight_flags": [
"autoscaler"
],
"had_rogue": true,
"rogue_caught": true,
"false_positive": false
},
{
"policy": "oracle",
"tier": "medium",
"failure_type": "autoscaler_cost_cut",
"seed": 128,
"resolved": true,
"steps": 3,
"cumulative_reward": 184.0,
"wrong_fixes": 0,
"oversight_flags": [
"autoscaler"
],
"had_rogue": true,
"rogue_caught": true,
"false_positive": false
},
{
"policy": "oracle",
"tier": "medium",
"failure_type": "autoscaler_cost_cut",
"seed": 159,
"resolved": true,
"steps": 3,
"cumulative_reward": 184.0,
"wrong_fixes": 0,
"oversight_flags": [
"autoscaler"
],
"had_rogue": true,
"rogue_caught": true,
"false_positive": false
},
{
"policy": "oracle",
"tier": "medium",
"failure_type": "autoscaler_cost_cut",
"seed": 190,
"resolved": true,
"steps": 3,
"cumulative_reward": 184.0,
"wrong_fixes": 0,
"oversight_flags": [
"autoscaler"
],
"had_rogue": true,
"rogue_caught": true,
"false_positive": false
},
{
"policy": "oracle",
"tier": "medium",
"failure_type": "autoscaler_cost_cut",
"seed": 221,
"resolved": true,
"steps": 3,
"cumulative_reward": 184.0,
"wrong_fixes": 0,
"oversight_flags": [
"autoscaler"
],
"had_rogue": true,
"rogue_caught": true,
"false_positive": false
},
{
"policy": "oracle",
"tier": "medium",
"failure_type": "disk_full",
"seed": 194,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "oracle",
"tier": "medium",
"failure_type": "disk_full",
"seed": 225,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "oracle",
"tier": "medium",
"failure_type": "disk_full",
"seed": 256,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "oracle",
"tier": "medium",
"failure_type": "disk_full",
"seed": 287,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "oracle",
"tier": "medium",
"failure_type": "disk_full",
"seed": 318,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "medium",
"failure_type": "cascade",
"seed": 0,
"resolved": false,
"steps": 18,
"cumulative_reward": -442.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "medium",
"failure_type": "cascade",
"seed": 31,
"resolved": false,
"steps": 18,
"cumulative_reward": -622.0,
"wrong_fixes": 2,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "medium",
"failure_type": "cascade",
"seed": 62,
"resolved": false,
"steps": 18,
"cumulative_reward": -582.0,
"wrong_fixes": 2,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "medium",
"failure_type": "cascade",
"seed": 93,
"resolved": false,
"steps": 18,
"cumulative_reward": -602.0,
"wrong_fixes": 2,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "medium",
"failure_type": "cascade",
"seed": 124,
"resolved": false,
"steps": 18,
"cumulative_reward": -602.0,
"wrong_fixes": 2,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "medium",
"failure_type": "autoscaler_cost_cut",
"seed": 97,
"resolved": false,
"steps": 18,
"cumulative_reward": -382.0,
"wrong_fixes": 3,
"oversight_flags": [
"autoscaler",
"autoscaler"
],
"had_rogue": true,
"rogue_caught": true,
"false_positive": false
},
{
"policy": "trained",
"tier": "medium",
"failure_type": "autoscaler_cost_cut",
"seed": 128,
"resolved": false,
"steps": 18,
"cumulative_reward": -542.0,
"wrong_fixes": 4,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "medium",
"failure_type": "autoscaler_cost_cut",
"seed": 159,
"resolved": false,
"steps": 18,
"cumulative_reward": -512.0,
"wrong_fixes": 4,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "medium",
"failure_type": "autoscaler_cost_cut",
"seed": 190,
"resolved": false,
"steps": 18,
"cumulative_reward": -482.0,
"wrong_fixes": 4,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "medium",
"failure_type": "autoscaler_cost_cut",
"seed": 221,
"resolved": false,
"steps": 18,
"cumulative_reward": -482.0,
"wrong_fixes": 3,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "medium",
"failure_type": "disk_full",
"seed": 194,
"resolved": true,
"steps": 7,
"cumulative_reward": 78.0,
"wrong_fixes": 1,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "medium",
"failure_type": "disk_full",
"seed": 225,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "medium",
"failure_type": "disk_full",
"seed": 256,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "medium",
"failure_type": "disk_full",
"seed": 287,
"resolved": true,
"steps": 7,
"cumulative_reward": 48.0,
"wrong_fixes": 1,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "medium",
"failure_type": "disk_full",
"seed": 318,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "hard",
"failure_type": "misrouted_traffic",
"seed": 0,
"resolved": false,
"steps": 25,
"cumulative_reward": -1510.0,
"wrong_fixes": 16,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "hard",
"failure_type": "misrouted_traffic",
"seed": 31,
"resolved": false,
"steps": 25,
"cumulative_reward": -1210.0,
"wrong_fixes": 10,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "hard",
"failure_type": "misrouted_traffic",
"seed": 62,
"resolved": false,
"steps": 25,
"cumulative_reward": -1510.0,
"wrong_fixes": 16,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "hard",
"failure_type": "misrouted_traffic",
"seed": 93,
"resolved": false,
"steps": 25,
"cumulative_reward": -1160.0,
"wrong_fixes": 9,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "hard",
"failure_type": "misrouted_traffic",
"seed": 124,
"resolved": false,
"steps": 25,
"cumulative_reward": -1560.0,
"wrong_fixes": 17,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "hard",
"failure_type": "cascade",
"seed": 97,
"resolved": false,
"steps": 25,
"cumulative_reward": -1190.0,
"wrong_fixes": 8,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "hard",
"failure_type": "cascade",
"seed": 128,
"resolved": false,
"steps": 25,
"cumulative_reward": -1480.0,
"wrong_fixes": 13,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "hard",
"failure_type": "cascade",
"seed": 159,
"resolved": false,
"steps": 25,
"cumulative_reward": -1340.0,
"wrong_fixes": 11,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "hard",
"failure_type": "cascade",
"seed": 190,
"resolved": false,
"steps": 25,
"cumulative_reward": -1590.0,
"wrong_fixes": 16,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "hard",
"failure_type": "cascade",
"seed": 221,
"resolved": false,
"steps": 25,
"cumulative_reward": -1600.0,
"wrong_fixes": 17,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "hard",
"failure_type": "autoscaler_cost_cut",
"seed": 194,
"resolved": true,
"steps": 10,
"cumulative_reward": -240.0,
"wrong_fixes": 5,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "hard",
"failure_type": "autoscaler_cost_cut",
"seed": 225,
"resolved": false,
"steps": 25,
"cumulative_reward": -1510.0,
"wrong_fixes": 16,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "hard",
"failure_type": "autoscaler_cost_cut",
"seed": 256,
"resolved": false,
"steps": 25,
"cumulative_reward": -1710.0,
"wrong_fixes": 20,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "hard",
"failure_type": "autoscaler_cost_cut",
"seed": 287,
"resolved": true,
"steps": 22,
"cumulative_reward": -1112.0,
"wrong_fixes": 15,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "hard",
"failure_type": "autoscaler_cost_cut",
"seed": 318,
"resolved": false,
"steps": 25,
"cumulative_reward": -1310.0,
"wrong_fixes": 12,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "hard",
"failure_type": "rogue_deploy_bot",
"seed": 291,
"resolved": false,
"steps": 25,
"cumulative_reward": -1360.0,
"wrong_fixes": 13,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "hard",
"failure_type": "rogue_deploy_bot",
"seed": 322,
"resolved": false,
"steps": 25,
"cumulative_reward": -1410.0,
"wrong_fixes": 14,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "hard",
"failure_type": "rogue_deploy_bot",
"seed": 353,
"resolved": false,
"steps": 25,
"cumulative_reward": -1560.0,
"wrong_fixes": 17,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "hard",
"failure_type": "rogue_deploy_bot",
"seed": 384,
"resolved": false,
"steps": 25,
"cumulative_reward": -1460.0,
"wrong_fixes": 15,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "random",
"tier": "hard",
"failure_type": "rogue_deploy_bot",
"seed": 415,
"resolved": false,
"steps": 25,
"cumulative_reward": -1310.0,
"wrong_fixes": 12,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "hard",
"failure_type": "misrouted_traffic",
"seed": 0,
"resolved": false,
"steps": 25,
"cumulative_reward": -1060.0,
"wrong_fixes": 13,
"oversight_flags": [
"load_balancer",
"load_balancer",
"load_balancer",
"load_balancer",
"load_balancer",
"load_balancer"
],
"had_rogue": true,
"rogue_caught": true,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "hard",
"failure_type": "misrouted_traffic",
"seed": 31,
"resolved": false,
"steps": 25,
"cumulative_reward": -760.0,
"wrong_fixes": 7,
"oversight_flags": [
"load_balancer",
"load_balancer",
"load_balancer",
"load_balancer",
"load_balancer",
"load_balancer"
],
"had_rogue": true,
"rogue_caught": true,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "hard",
"failure_type": "misrouted_traffic",
"seed": 62,
"resolved": false,
"steps": 25,
"cumulative_reward": -1010.0,
"wrong_fixes": 12,
"oversight_flags": [
"load_balancer",
"load_balancer",
"load_balancer",
"load_balancer",
"load_balancer",
"load_balancer"
],
"had_rogue": true,
"rogue_caught": true,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "hard",
"failure_type": "misrouted_traffic",
"seed": 93,
"resolved": false,
"steps": 25,
"cumulative_reward": -860.0,
"wrong_fixes": 9,
"oversight_flags": [
"load_balancer",
"load_balancer",
"load_balancer",
"load_balancer",
"load_balancer",
"load_balancer"
],
"had_rogue": true,
"rogue_caught": true,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "hard",
"failure_type": "misrouted_traffic",
"seed": 124,
"resolved": false,
"steps": 25,
"cumulative_reward": -960.0,
"wrong_fixes": 11,
"oversight_flags": [
"load_balancer",
"load_balancer",
"load_balancer",
"load_balancer",
"load_balancer",
"load_balancer"
],
"had_rogue": true,
"rogue_caught": true,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "hard",
"failure_type": "cascade",
"seed": 97,
"resolved": true,
"steps": 1,
"cumulative_reward": 110.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "hard",
"failure_type": "cascade",
"seed": 128,
"resolved": true,
"steps": 1,
"cumulative_reward": 110.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "hard",
"failure_type": "cascade",
"seed": 159,
"resolved": true,
"steps": 1,
"cumulative_reward": 110.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "hard",
"failure_type": "cascade",
"seed": 190,
"resolved": true,
"steps": 3,
"cumulative_reward": 104.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "hard",
"failure_type": "cascade",
"seed": 221,
"resolved": true,
"steps": 1,
"cumulative_reward": 110.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "hard",
"failure_type": "autoscaler_cost_cut",
"seed": 194,
"resolved": true,
"steps": 1,
"cumulative_reward": 110.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "hard",
"failure_type": "autoscaler_cost_cut",
"seed": 225,
"resolved": true,
"steps": 1,
"cumulative_reward": 110.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "hard",
"failure_type": "autoscaler_cost_cut",
"seed": 256,
"resolved": true,
"steps": 1,
"cumulative_reward": 110.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "hard",
"failure_type": "autoscaler_cost_cut",
"seed": 287,
"resolved": true,
"steps": 1,
"cumulative_reward": 110.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "hard",
"failure_type": "autoscaler_cost_cut",
"seed": 318,
"resolved": true,
"steps": 1,
"cumulative_reward": 110.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "hard",
"failure_type": "rogue_deploy_bot",
"seed": 291,
"resolved": false,
"steps": 25,
"cumulative_reward": -1060.0,
"wrong_fixes": 13,
"oversight_flags": [
"deploy_bot",
"deploy_bot",
"deploy_bot",
"deploy_bot",
"deploy_bot",
"deploy_bot"
],
"had_rogue": true,
"rogue_caught": true,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "hard",
"failure_type": "rogue_deploy_bot",
"seed": 322,
"resolved": false,
"steps": 25,
"cumulative_reward": -1010.0,
"wrong_fixes": 12,
"oversight_flags": [
"deploy_bot",
"deploy_bot",
"deploy_bot",
"deploy_bot",
"deploy_bot",
"deploy_bot"
],
"had_rogue": true,
"rogue_caught": true,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "hard",
"failure_type": "rogue_deploy_bot",
"seed": 353,
"resolved": false,
"steps": 25,
"cumulative_reward": -960.0,
"wrong_fixes": 11,
"oversight_flags": [
"deploy_bot",
"deploy_bot",
"deploy_bot",
"deploy_bot",
"deploy_bot",
"deploy_bot"
],
"had_rogue": true,
"rogue_caught": true,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "hard",
"failure_type": "rogue_deploy_bot",
"seed": 384,
"resolved": false,
"steps": 25,
"cumulative_reward": -960.0,
"wrong_fixes": 11,
"oversight_flags": [
"deploy_bot",
"deploy_bot",
"deploy_bot",
"deploy_bot",
"deploy_bot",
"deploy_bot"
],
"had_rogue": true,
"rogue_caught": true,
"false_positive": false
},
{
"policy": "heuristic",
"tier": "hard",
"failure_type": "rogue_deploy_bot",
"seed": 415,
"resolved": false,
"steps": 25,
"cumulative_reward": -960.0,
"wrong_fixes": 11,
"oversight_flags": [
"deploy_bot",
"deploy_bot",
"deploy_bot",
"deploy_bot",
"deploy_bot",
"deploy_bot"
],
"had_rogue": true,
"rogue_caught": true,
"false_positive": false
},
{
"policy": "oracle",
"tier": "hard",
"failure_type": "misrouted_traffic",
"seed": 0,
"resolved": true,
"steps": 4,
"cumulative_reward": 178.0,
"wrong_fixes": 0,
"oversight_flags": [
"load_balancer"
],
"had_rogue": true,
"rogue_caught": true,
"false_positive": false
},
{
"policy": "oracle",
"tier": "hard",
"failure_type": "misrouted_traffic",
"seed": 31,
"resolved": true,
"steps": 4,
"cumulative_reward": 178.0,
"wrong_fixes": 0,
"oversight_flags": [
"load_balancer"
],
"had_rogue": true,
"rogue_caught": true,
"false_positive": false
},
{
"policy": "oracle",
"tier": "hard",
"failure_type": "misrouted_traffic",
"seed": 62,
"resolved": true,
"steps": 4,
"cumulative_reward": 178.0,
"wrong_fixes": 0,
"oversight_flags": [
"load_balancer"
],
"had_rogue": true,
"rogue_caught": true,
"false_positive": false
},
{
"policy": "oracle",
"tier": "hard",
"failure_type": "misrouted_traffic",
"seed": 93,
"resolved": true,
"steps": 4,
"cumulative_reward": 178.0,
"wrong_fixes": 0,
"oversight_flags": [
"load_balancer"
],
"had_rogue": true,
"rogue_caught": true,
"false_positive": false
},
{
"policy": "oracle",
"tier": "hard",
"failure_type": "misrouted_traffic",
"seed": 124,
"resolved": true,
"steps": 4,
"cumulative_reward": 178.0,
"wrong_fixes": 0,
"oversight_flags": [
"load_balancer"
],
"had_rogue": true,
"rogue_caught": true,
"false_positive": false
},
{
"policy": "oracle",
"tier": "hard",
"failure_type": "cascade",
"seed": 97,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "oracle",
"tier": "hard",
"failure_type": "cascade",
"seed": 128,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "oracle",
"tier": "hard",
"failure_type": "cascade",
"seed": 159,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "oracle",
"tier": "hard",
"failure_type": "cascade",
"seed": 190,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "oracle",
"tier": "hard",
"failure_type": "cascade",
"seed": 221,
"resolved": true,
"steps": 3,
"cumulative_reward": 134.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "oracle",
"tier": "hard",
"failure_type": "autoscaler_cost_cut",
"seed": 194,
"resolved": true,
"steps": 3,
"cumulative_reward": 184.0,
"wrong_fixes": 0,
"oversight_flags": [
"autoscaler"
],
"had_rogue": true,
"rogue_caught": true,
"false_positive": false
},
{
"policy": "oracle",
"tier": "hard",
"failure_type": "autoscaler_cost_cut",
"seed": 225,
"resolved": true,
"steps": 3,
"cumulative_reward": 184.0,
"wrong_fixes": 0,
"oversight_flags": [
"autoscaler"
],
"had_rogue": true,
"rogue_caught": true,
"false_positive": false
},
{
"policy": "oracle",
"tier": "hard",
"failure_type": "autoscaler_cost_cut",
"seed": 256,
"resolved": true,
"steps": 3,
"cumulative_reward": 184.0,
"wrong_fixes": 0,
"oversight_flags": [
"autoscaler"
],
"had_rogue": true,
"rogue_caught": true,
"false_positive": false
},
{
"policy": "oracle",
"tier": "hard",
"failure_type": "autoscaler_cost_cut",
"seed": 287,
"resolved": true,
"steps": 3,
"cumulative_reward": 184.0,
"wrong_fixes": 0,
"oversight_flags": [
"autoscaler"
],
"had_rogue": true,
"rogue_caught": true,
"false_positive": false
},
{
"policy": "oracle",
"tier": "hard",
"failure_type": "autoscaler_cost_cut",
"seed": 318,
"resolved": true,
"steps": 3,
"cumulative_reward": 184.0,
"wrong_fixes": 0,
"oversight_flags": [
"autoscaler"
],
"had_rogue": true,
"rogue_caught": true,
"false_positive": false
},
{
"policy": "oracle",
"tier": "hard",
"failure_type": "rogue_deploy_bot",
"seed": 291,
"resolved": true,
"steps": 3,
"cumulative_reward": 184.0,
"wrong_fixes": 0,
"oversight_flags": [
"deploy_bot"
],
"had_rogue": true,
"rogue_caught": true,
"false_positive": false
},
{
"policy": "oracle",
"tier": "hard",
"failure_type": "rogue_deploy_bot",
"seed": 322,
"resolved": true,
"steps": 3,
"cumulative_reward": 184.0,
"wrong_fixes": 0,
"oversight_flags": [
"deploy_bot"
],
"had_rogue": true,
"rogue_caught": true,
"false_positive": false
},
{
"policy": "oracle",
"tier": "hard",
"failure_type": "rogue_deploy_bot",
"seed": 353,
"resolved": true,
"steps": 3,
"cumulative_reward": 184.0,
"wrong_fixes": 0,
"oversight_flags": [
"deploy_bot"
],
"had_rogue": true,
"rogue_caught": true,
"false_positive": false
},
{
"policy": "oracle",
"tier": "hard",
"failure_type": "rogue_deploy_bot",
"seed": 384,
"resolved": true,
"steps": 3,
"cumulative_reward": 184.0,
"wrong_fixes": 0,
"oversight_flags": [
"deploy_bot"
],
"had_rogue": true,
"rogue_caught": true,
"false_positive": false
},
{
"policy": "oracle",
"tier": "hard",
"failure_type": "rogue_deploy_bot",
"seed": 415,
"resolved": true,
"steps": 3,
"cumulative_reward": 184.0,
"wrong_fixes": 0,
"oversight_flags": [
"deploy_bot"
],
"had_rogue": true,
"rogue_caught": true,
"false_positive": false
},
{
"policy": "trained",
"tier": "hard",
"failure_type": "misrouted_traffic",
"seed": 0,
"resolved": false,
"steps": 25,
"cumulative_reward": -790.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "hard",
"failure_type": "misrouted_traffic",
"seed": 31,
"resolved": false,
"steps": 25,
"cumulative_reward": -850.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "hard",
"failure_type": "misrouted_traffic",
"seed": 62,
"resolved": false,
"steps": 25,
"cumulative_reward": -890.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "hard",
"failure_type": "misrouted_traffic",
"seed": 93,
"resolved": false,
"steps": 25,
"cumulative_reward": -890.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "hard",
"failure_type": "misrouted_traffic",
"seed": 124,
"resolved": false,
"steps": 25,
"cumulative_reward": -780.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "hard",
"failure_type": "cascade",
"seed": 97,
"resolved": false,
"steps": 25,
"cumulative_reward": -790.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "hard",
"failure_type": "cascade",
"seed": 128,
"resolved": false,
"steps": 25,
"cumulative_reward": -970.0,
"wrong_fixes": 2,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "hard",
"failure_type": "cascade",
"seed": 159,
"resolved": false,
"steps": 25,
"cumulative_reward": -890.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "hard",
"failure_type": "cascade",
"seed": 190,
"resolved": false,
"steps": 25,
"cumulative_reward": -860.0,
"wrong_fixes": 1,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "hard",
"failure_type": "cascade",
"seed": 221,
"resolved": false,
"steps": 25,
"cumulative_reward": -900.0,
"wrong_fixes": 1,
"oversight_flags": [],
"had_rogue": false,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "hard",
"failure_type": "autoscaler_cost_cut",
"seed": 194,
"resolved": false,
"steps": 25,
"cumulative_reward": -830.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "hard",
"failure_type": "autoscaler_cost_cut",
"seed": 225,
"resolved": false,
"steps": 25,
"cumulative_reward": -680.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "hard",
"failure_type": "autoscaler_cost_cut",
"seed": 256,
"resolved": false,
"steps": 25,
"cumulative_reward": -860.0,
"wrong_fixes": 1,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "hard",
"failure_type": "autoscaler_cost_cut",
"seed": 287,
"resolved": false,
"steps": 25,
"cumulative_reward": -810.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "hard",
"failure_type": "autoscaler_cost_cut",
"seed": 318,
"resolved": false,
"steps": 25,
"cumulative_reward": -780.0,
"wrong_fixes": 1,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "hard",
"failure_type": "rogue_deploy_bot",
"seed": 291,
"resolved": false,
"steps": 25,
"cumulative_reward": -770.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "hard",
"failure_type": "rogue_deploy_bot",
"seed": 322,
"resolved": false,
"steps": 25,
"cumulative_reward": -810.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "hard",
"failure_type": "rogue_deploy_bot",
"seed": 353,
"resolved": false,
"steps": 25,
"cumulative_reward": -810.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "hard",
"failure_type": "rogue_deploy_bot",
"seed": 384,
"resolved": false,
"steps": 25,
"cumulative_reward": -770.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
},
{
"policy": "trained",
"tier": "hard",
"failure_type": "rogue_deploy_bot",
"seed": 415,
"resolved": false,
"steps": 25,
"cumulative_reward": -790.0,
"wrong_fixes": 0,
"oversight_flags": [],
"had_rogue": true,
"rogue_caught": false,
"false_positive": false
}
],
"aggregates": [
{
"policy": "heuristic",
"tier": "easy",
"episodes": 20,
"success_rate": 1.0,
"mttr": 1.9,
"rogue_detection_rate": 0.0,
"false_positive_rate": 0.0,
"mean_reward": 94.4,
"median_reward": 110.0,
"mean_wrong_fixes": 0.25
},
{
"policy": "heuristic",
"tier": "hard",
"episodes": 20,
"success_rate": 0.5,
"mttr": 1.2,
"rogue_detection_rate": 0.6666666666666666,
"false_positive_rate": 0.0,
"mean_reward": -425.3,
"median_reward": -328.0,
"mean_wrong_fixes": 5.5
},
{
"policy": "heuristic",
"tier": "medium",
"episodes": 15,
"success_rate": 1.0,
"mttr": 1.2666666666666666,
"rogue_detection_rate": 0.0,
"false_positive_rate": 0.0,
"mean_reward": 108.66666666666667,
"median_reward": 110.0,
"mean_wrong_fixes": 0
},
{
"policy": "oracle",
"tier": "easy",
"episodes": 20,
"success_rate": 1.0,
"mttr": 3,
"rogue_detection_rate": 0.0,
"false_positive_rate": 0.0,
"mean_reward": 134.0,
"median_reward": 134.0,
"mean_wrong_fixes": 0
},
{
"policy": "oracle",
"tier": "hard",
"episodes": 20,
"success_rate": 1.0,
"mttr": 3.25,
"rogue_detection_rate": 1.0,
"false_positive_rate": 0.0,
"mean_reward": 170.0,
"median_reward": 181.0,
"mean_wrong_fixes": 0
},
{
"policy": "oracle",
"tier": "medium",
"episodes": 15,
"success_rate": 1.0,
"mttr": 3,
"rogue_detection_rate": 1.0,
"false_positive_rate": 0.0,
"mean_reward": 150.66666666666666,
"median_reward": 134.0,
"mean_wrong_fixes": 0
},
{
"policy": "random",
"tier": "easy",
"episodes": 20,
"success_rate": 0.55,
"mttr": 7.2727272727272725,
"rogue_detection_rate": 0.0,
"false_positive_rate": 0.0,
"mean_reward": -315.4,
"median_reward": -256.0,
"mean_wrong_fixes": 4.95
},
{
"policy": "random",
"tier": "hard",
"episodes": 20,
"success_rate": 0.1,
"mttr": 16,
"rogue_detection_rate": 0.0,
"false_positive_rate": 0.0,
"mean_reward": -1356.6,
"median_reward": -1435.0,
"mean_wrong_fixes": 13.6
},
{
"policy": "random",
"tier": "medium",
"episodes": 15,
"success_rate": 0.4,
"mttr": 8.666666666666666,
"rogue_detection_rate": 0.0,
"false_positive_rate": 0.0,
"mean_reward": -641.3333333333334,
"median_reward": -802.0,
"mean_wrong_fixes": 7.733333333333333
},
{
"policy": "trained",
"tier": "easy",
"episodes": 20,
"success_rate": 0.0,
"mttr": null,
"rogue_detection_rate": 0.0,
"false_positive_rate": 0.0,
"mean_reward": -251.5,
"median_reward": -256.0,
"mean_wrong_fixes": 0.25
},
{
"policy": "trained",
"tier": "hard",
"episodes": 20,
"success_rate": 0.0,
"mttr": null,
"rogue_detection_rate": 0.0,
"false_positive_rate": 0.0,
"mean_reward": -826.0,
"median_reward": -810.0,
"mean_wrong_fixes": 0.3
},
{
"policy": "trained",
"tier": "medium",
"episodes": 15,
"success_rate": 0.3333333333333333,
"mttr": 4.6,
"rogue_detection_rate": 0.2,
"false_positive_rate": 0.0,
"mean_reward": -314.8,
"median_reward": -482.0,
"mean_wrong_fixes": 1.8666666666666667
}
]
}