---
# Identity and packaging metadata for this environment.
name: cloud-incident-response
# Quoted so the version stays a string.
version: '0.1.0'
# NOTE(review): presumably the port the app server listens on — confirm
# against the deployment configuration.
app_port: 7860
# Folded scalar: line breaks below become single spaces in the loaded value.
description: >
  OpenEnv environment simulating real-world cloud SRE on-call incident
  response. Distinct from Kubernetes ops — focuses on cross-service
  cascading failures, network partitions, OOM kills, credential rotation
  failures, and CDN storms across distributed systems. An AI agent
  classifies alert severity, performs root cause analysis through
  log/metric/dependency queries, and executes remediation sequences to
  resolve production incidents end-to-end.
# One entry per author: display name plus GitHub handle.
authors:
  - name: 'Einstein'
    github: 'MrEinsteinE'
  - name: 'Sidra'
    github: 'sidraaiman'
license: MIT
# Free-form tags describing the environment.
tags:
  - openenv
  - sre
  - cloud
  - incident-response
  - devops
  - real-world
  - agentic

# Task catalogue, listed in increasing difficulty (easy, medium, hard).
# Every task declares an id, a display name, a difficulty label, max_steps,
# a score_range of [0.0, 1.0], and a folded free-text description.
tasks:
  # Task 1 (easy)
  - id: alert_classification
    name: 'Task 1: Alert Severity Classification'
    difficulty: easy
    max_steps: 3
    score_range: [0.0, 1.0]
    description: >
      Classify incoming alert severity (P1-P4) by querying logs and metrics
      across affected cloud services. Target baseline: 0.75-1.0 with 8B model.

  # Task 2 (medium)
  - id: root_cause_analysis
    name: 'Task 2: Root Cause Analysis'
    difficulty: medium
    max_steps: 10
    score_range: [0.0, 1.0]
    description: >
      Trace a live incident through logs, metrics, dependencies, and recent
      deploys to identify the exact root cause service and failure mode.
      Root cause is NOT in the alert. Target baseline: 0.35-0.60 with 8B
      model.

  # Task 3 (hard)
  - id: remediation_planning
    name: 'Task 3: Incident Remediation'
    difficulty: hard
    max_steps: 15
    score_range: [0.0, 1.0]
    description: >
      Fully resolve a production incident end-to-end: diagnose the root
      cause, execute the correct multi-step remediation sequence, and submit
      a documented resolution summary. Wrong actions penalized. Target
      baseline: 0.20-0.45 with 8B model.

# Routes keyed by name; each value is written as '<METHOD> <path>'.
# NOTE(review): how the consumer interprets these strings is not visible
# here — confirm before changing their format.
endpoints:
  health: 'GET /health'
  reset: 'POST /reset'
  step: 'POST /step'
  state: 'GET /state'
  tasks: 'GET /tasks'
  grader: 'GET /grader'
  baseline: 'POST /baseline'

# Project links: GitHub source repository and Hugging Face Space.
repo: 'https://github.com/MrEinsteinE/cloud-incident-response-openenv'
space: 'https://huggingface.co/spaces/Elliot89/cloud-incident-response'