File size: 1,786 Bytes
37204eb
d5fc8a7
 
 
37204eb
 
 
 
 
 
d5fc8a7
 
 
 
 
37204eb
d5fc8a7
37204eb
d5fc8a7
 
 
 
 
 
 
 
 
 
37204eb
 
d5fc8a7
 
 
 
 
 
 
37204eb
 
 
d5fc8a7
 
 
 
 
 
 
37204eb
 
d5fc8a7
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# Manifest header: identity and descriptive metadata for this environment.
name: cloud-incident-response
# Quoted so the value is always loaded as a string.
version: '0.1.0'
# NOTE(review): presumably the port the app serves on; confirm with the consumer.
app_port: 7860
# Folded scalar: the lines below are joined with single spaces at load time.
description: >
  OpenEnv environment simulating real-world cloud SRE on-call incident
  response. Distinct from Kubernetes ops — focuses on cross-service
  cascading failures, network partitions, OOM kills, and CDN storms across
  distributed systems. An AI agent classifies alert severity, performs root
  cause analysis through log/metric/dependency queries, and executes
  remediation sequences to resolve production incidents end-to-end.
author: Elliot89
license: MIT
# Tag list, one entry per line.
tags:
  - openenv
  - sre
  - cloud
  - incident-response
  - devops
  - real-world
  - agentic

# Task list. Every entry has the same keys: id, name, difficulty, max_steps,
# score_range, and a folded (single-paragraph) description.
tasks:
  # Task 1: difficulty "easy", max_steps 3.
  - id: alert_classification
    name: 'Task 1: Alert Severity Classification'
    difficulty: easy
    max_steps: 3
    score_range: [0.0, 1.0]
    description: >
      Classify incoming alert severity (P1-P4) by querying logs and
      metrics across affected cloud services.

  # Task 2: difficulty "medium", max_steps 10.
  - id: root_cause_analysis
    name: 'Task 2: Root Cause Analysis'
    difficulty: medium
    max_steps: 10
    score_range: [0.0, 1.0]
    description: >
      Trace a live incident through logs, metrics, dependencies, and recent
      deploys to identify the exact root cause service and failure mode
      across a distributed system.

  # Task 3: difficulty "hard", max_steps 15.
  - id: remediation_planning
    name: 'Task 3: Incident Remediation'
    difficulty: hard
    max_steps: 15
    score_range: [0.0, 1.0]
    description: >
      Fully resolve a production incident end-to-end: diagnose the root
      cause, execute the correct remediation sequence, and submit a
      documented resolution summary.

# Endpoint table: each key maps to a quoted "<HTTP method> <path>" string.
endpoints:
  health: 'GET /health'
  reset: 'POST /reset'
  step: 'POST /step'
  state: 'GET /state'
  tasks: 'GET /tasks'
  grader: 'GET /grader'
  baseline: 'POST /baseline'