File size: 1,293 Bytes
e270f30
 
 
 
 
 
 
 
3358379
e270f30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# Environment manifest: identity and summary metadata.
name: logtriage-env
# Quoted so the version is always read as a string, never as a number.
version: '1.0.0'
# Folded scalar: the wrapped lines below are joined with single spaces.
description: >
  An OpenEnv environment where an AI agent acts as an on-call SRE. The agent
  receives live system logs from a simulated microservice cluster and must
  diagnose, prioritize, and resolve incidents across 3 tasks of increasing
  difficulty.
author: Rohit Patil
space_url: 'https://ogrohit-logtriage-env.hf.space'
# Free-form labels attached to this environment.
tags: [openenv, sre, log-analysis, incident-response, reinforcement-learning]
# Task catalogue, listed from easy to hard. Each entry carries an id, a
# display name, a difficulty label, a max_steps value (presumably the
# per-episode step budget; confirm against the consumer), and a description.
# Descriptions use `>-` so long sentences can wrap without changing the value.
tasks:
  # Easy: a single crashing service.
  - id: single_crash
    name: Single Service Crash
    difficulty: easy
    max_steps: 8
    description: >-
      One service crashes with clear error logs.
      Classify, identify root cause, remediate.

  # Medium: a database slowdown that cascades upstream.
  - id: cascading_failure
    name: Cascading Failure
    difficulty: medium
    max_steps: 12
    description: >-
      Database slowdown causes upstream cascade.
      Find root cause, not just symptoms.

  # Hard: a slow degradation buried in noisy logs.
  - id: silent_degradation
    name: Silent Degradation with Noise
    difficulty: hard
    max_steps: 15
    description: >-
      Slow degradation hidden in 60% noise.
      Nuanced severity judgment required.
# Action space: a discrete set of SRE triage actions.
action_space:
  type: discrete
  # Quoted because the text contains ": ", which a plain scalar would
  # treat as the start of a nested mapping.
  description: 'SRE triage actions: classify, identify, escalate, remediate, resolve'
# Observation space: what the agent is shown at each step.
observation_space:
  type: structured
  description: 'Log batches + system state + incident metadata per step'
# Two-element reward range; presumably [minimum, maximum] (confirm against
# the consumer).
reward_range:
  - -0.5
  - 1.0