Spaces:
Sleeping
Sleeping
Prepare project for push: update files
Browse files- README.md +155 -273
- graders.py +32 -4
- inference.py +189 -30
- openenv.yaml +9 -2
- requirements.txt +1 -1
- server/app.py +9 -1
- server/environment.py +1 -1
- tasks.py +36 -29
- uv.lock +3 -3
README.md
CHANGED
|
@@ -18,362 +18,244 @@ tags:
|
|
| 18 |
|
| 19 |
# ☁️ Cloud Incident Response — OpenEnv Environment
|
| 20 |
|
| 21 |
-
An OpenEnv environment for training and evaluating AI agents on **cloud SRE incident response** — the real-world on-call workflow that engineers
|
| 22 |
|
| 23 |
-
Distinct from Kubernetes operations environments: this focuses on **cross-service cascading failures** in distributed microservice architectures —
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
---
|
| 26 |
|
| 27 |
-
##
|
| 28 |
|
| 29 |
-
|
| 30 |
|
| 31 |
-
|
|
| 32 |
-
|---|---|
|
| 33 |
-
|
|
| 34 |
-
|
|
| 35 |
-
|
|
| 36 |
-
|
|
| 37 |
-
|
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
### Typed Models
|
| 42 |
-
|
| 43 |
-
```python
|
| 44 |
-
# Action — submitted by the agent
|
| 45 |
-
Action {
|
| 46 |
-
action_type: str, # e.g. "query_logs", "restart_service", "submit_severity"
|
| 47 |
-
parameters: {
|
| 48 |
-
service?: str, # Target service name
|
| 49 |
-
severity?: str, # P1|P2|P3|P4 (for submit_severity)
|
| 50 |
-
failure_mode?: str, # Root cause description (for submit_root_cause)
|
| 51 |
-
summary?: str, # Resolution summary (for submit_resolution)
|
| 52 |
-
flag?: str, # Feature flag name (for disable_feature_flag)
|
| 53 |
-
runbook_action?: str, # Runbook step (for execute_runbook_step)
|
| 54 |
-
target_version?: str, # Deploy version (for rollback_deploy)
|
| 55 |
-
}
|
| 56 |
-
}
|
| 57 |
-
|
| 58 |
-
# Observation — returned to the agent
|
| 59 |
-
Observation {
|
| 60 |
-
episode_id: str, # Unique episode UUID
|
| 61 |
-
task_id: str, # Active task
|
| 62 |
-
scenario_id: str, # Current scenario (e.g. "AC-001")
|
| 63 |
-
step_count: int, # Steps taken so far
|
| 64 |
-
max_steps: int, # Budget (3, 10, or 15)
|
| 65 |
-
incident_summary: str, # Plain-text incident description
|
| 66 |
-
alert: dict, # Alert payload: title, symptoms, error_rate, revenue_impact
|
| 67 |
-
available_actions: [str], # Valid action types for this task
|
| 68 |
-
queried_data: dict, # All evidence gathered so far
|
| 69 |
-
known_services: [str], # Valid service names for actions
|
| 70 |
-
cumulative_reward: float, # Running reward total
|
| 71 |
-
done: bool, # Episode complete flag
|
| 72 |
-
feedback: str, # Per-step reward explanation
|
| 73 |
-
last_action_error: str?, # Error from last action (null if OK)
|
| 74 |
-
}
|
| 75 |
-
|
| 76 |
-
# Reward — returned after each step
|
| 77 |
-
Reward {
|
| 78 |
-
score: float, # Step reward value
|
| 79 |
-
value: float, # Alias for score (backward compatibility)
|
| 80 |
-
reason: str, # Human-readable explanation
|
| 81 |
-
cumulative: float, # Running total
|
| 82 |
-
}
|
| 83 |
-
```
|
| 84 |
|
| 85 |
---
|
| 86 |
|
| 87 |
-
##
|
| 88 |
|
| 89 |
-
|
| 90 |
-
|---|---|---|---|---|
|
| 91 |
-
| `alert_classification` | 🟢 Easy | 3 | 3 | Classify alert severity P1–P4 from metrics and symptoms |
|
| 92 |
-
| `root_cause_analysis` | 🟡 Medium | 10 | 3 | Trace failure chain across 8 services to find root cause |
|
| 93 |
-
| `remediation_planning` | 🔴 Hard | 15 | 3 | Diagnose + execute multi-step remediation + document resolution |
|
| 94 |
|
| 95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
|
| 97 |
-
|
| 98 |
-
|---|---|---|---|
|
| 99 |
-
| AC-001 | DB connection pool exhaustion | — | Clear P1: 78% errors, $12k/min |
|
| 100 |
-
| AC-002 | CDN cache invalidation storm | — | Ambiguous P2: degraded but checkout works |
|
| 101 |
-
| AC-003 | Recommendation engine errors | — | Trap P3: 45% errors but zero revenue impact |
|
| 102 |
-
| RCA-001 | Postgres OOM crash loop | analytics-service (unbounded query) | Root cause NOT in alert, 8 services to investigate |
|
| 103 |
-
| RCA-002 | Cross-AZ checkout failures | network-infra (BGP route withdrawal) | Network issue disguised as application failure |
|
| 104 |
-
| RCA-003 | DB authentication failures | config-service (stale credential rotation) | Multiple red herring deploys on other services |
|
| 105 |
-
| RP-001 | Full OOM incident | analytics-service | 6-step remediation sequence, wrong actions penalized |
|
| 106 |
-
| RP-002 | Full BGP incident | network-infra | 4-step runbook + config rollback, 8 services |
|
| 107 |
-
| RP-003 | Full credential incident | config-service | 7-step sequence, credential rotation + service restarts |
|
| 108 |
|
| 109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
|
| 111 |
-
- **
|
| 112 |
-
- **
|
|
|
|
|
|
|
|
|
|
| 113 |
|
| 114 |
-
##
|
| 115 |
|
| 116 |
-
|
|
| 117 |
|---|---|---|---|---|
|
| 118 |
-
| `
|
| 119 |
-
| `
|
|
|
|
| 120 |
|
| 121 |
-
|
| 122 |
|
| 123 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
|
| 125 |
-
|
| 126 |
|
| 127 |
-
##
|
| 128 |
|
|
|
|
| 129 |
```json
|
| 130 |
-
{"action_type": "query_logs", "parameters": {"service": "
|
| 131 |
-
{"action_type": "check_metrics", "parameters": {"service": "
|
| 132 |
-
{"action_type": "check_dependencies", "parameters": {"service": "
|
| 133 |
-
{"action_type": "check_recent_deploys", "parameters": {"service": "
|
| 134 |
-
{"action_type": "check_service_status", "parameters": {"service": "
|
| 135 |
```
|
| 136 |
|
| 137 |
-
###
|
| 138 |
-
|
| 139 |
```json
|
| 140 |
-
{"action_type": "restart_service", "parameters": {"service": "
|
| 141 |
-
{"action_type": "rollback_deploy", "parameters": {"service": "
|
| 142 |
-
{"action_type": "
|
| 143 |
-
{"action_type": "
|
| 144 |
-
{"action_type": "
|
| 145 |
-
{"action_type": "
|
| 146 |
```
|
| 147 |
|
| 148 |
-
###
|
| 149 |
-
|
| 150 |
```json
|
| 151 |
-
{"action_type": "submit_severity", "parameters": {"severity": "P1", "service": "
|
| 152 |
-
{"action_type": "submit_root_cause", "parameters": {"service": "
|
| 153 |
-
{"action_type": "submit_resolution", "parameters": {"summary": "3+ sentence
|
| 154 |
```
|
| 155 |
|
| 156 |
---
|
| 157 |
|
| 158 |
-
##
|
| 159 |
-
|
| 160 |
-
Dense reward shaping provides signal over the **full trajectory** (not just binary end-of-episode):
|
| 161 |
-
|
| 162 |
-
| Signal | Reward | Description |
|
| 163 |
-
|---|---|---|
|
| 164 |
-
| Query new service | +0.03 to +0.04 | First diagnostic action on a service |
|
| 165 |
-
| Query new action type | +0.01 to +0.02 | Different diagnostic on already-queried service |
|
| 166 |
-
| Repeat same query | −0.03 to −0.04 | Same (action, service) pair again |
|
| 167 |
-
| Unknown service | −0.05 to −0.06 | Service not in known_services |
|
| 168 |
-
| Correct remediation | +0.06 | Action matches correct remediation sequence |
|
| 169 |
-
| Wrong remediation | −0.12 to −0.15 | Action in wrong_actions list (e.g. restarting healthy service) |
|
| 170 |
-
| Correct submission type | +0.02 | Using the right submit action for the task |
|
| 171 |
-
| Wrong submission type | −0.08 to −0.12 | e.g. submit_severity during remediation_planning |
|
| 172 |
-
| Past halfway (non-submit) | −0.015 to −0.04 | Per-step efficiency penalty |
|
| 173 |
-
| Timeout | −0.15 to −0.20 | No submission before max_steps |
|
| 174 |
-
| Exact repeat action | −0.04 to −0.05 | Identical action+params as a previous step |
|
| 175 |
-
| **Grader score** | **0.0–1.0** | **Added at terminal step** |
|
| 176 |
-
|
| 177 |
-
### Grading (deterministic, reproducible, 0.0–1.0)
|
| 178 |
-
|
| 179 |
-
| Task | Scoring Logic |
|
| 180 |
-
|---|---|
|
| 181 |
-
| `alert_classification` | 1.0 exact match · 0.5 adjacent (P1↔P2) · 0.25 two-off · 0.0 wrong |
|
| 182 |
-
| `root_cause_analysis` | 0.6 base (correct service + failure mode) + up to 0.4 efficiency bonus |
|
| 183 |
-
| `remediation_planning` | 0.6 base + 0.3 efficiency (correct steps matched) − 0.15 penalty (wrong actions) + 0.1 summary quality |
|
| 184 |
-
|
| 185 |
-
---
|
| 186 |
-
|
| 187 |
-
## 🖥️ Interactive UI Walkthrough
|
| 188 |
-
|
| 189 |
-
The Gradio UI at `/` provides a visual interface for human evaluation. Here's how to use it:
|
| 190 |
-
|
| 191 |
-
### 🟢 Easy Task: Alert Classification
|
| 192 |
-
|
| 193 |
-
1. **Select Task**: Choose `🟢 Easy — Alert Classification` from the Task dropdown
|
| 194 |
-
2. **Select Scenario**: Choose `Scenario 2` (the tricky P3 trap)
|
| 195 |
-
3. **Click** `🔄 Reset Environment`
|
| 196 |
-
4. **Read** the observation panel — recommendation-service errors at 45%
|
| 197 |
-
5. **Investigate**: Set Action Type to `🔍 check_metrics`, Service to `recommendation-service`, click `▶️ Execute Action`
|
| 198 |
-
6. **Read evidence** — "User impact: NONE", "Revenue: unchanged", "Checkout: 100%"
|
| 199 |
-
7. **Submit**: Set Action Type to `📝 submit_severity`, expand `📋 Parameters`, set Severity to `P3 Medium`, click `▶️ Execute Action`
|
| 200 |
-
8. **Grade**: Click `📊 Grade` — should show **1.0** for exact P3 match
|
| 201 |
-
|
| 202 |
-
### 🟡 Medium Task: Root Cause Analysis
|
| 203 |
-
|
| 204 |
-
1. **Select Task**: `🟡 Medium — Root Cause Analysis`, **Scenario**: `Scenario 0`
|
| 205 |
-
2. **Click** `🔄 Reset Environment`
|
| 206 |
-
3. **Read** the observation — postgres-db crash loop, multiple services down
|
| 207 |
-
4. **Query victim**: Action Type `🔍 query_logs`, Service `postgres-db`, click `▶️ Execute Action`
|
| 208 |
-
5. **Read evidence** — logs say *"query from analytics-service consuming all memory"*
|
| 209 |
-
6. **Follow breadcrumb**: Action Type `🔍 query_logs`, Service `analytics-service`, click `▶️ Execute Action`
|
| 210 |
-
7. **Read evidence** — "full_history_export job", "847M row scan", "no LIMIT"
|
| 211 |
-
8. **Confirm**: Action Type `🔍 check_recent_deploys`, Service `analytics-service`, click `▶️ Execute Action`
|
| 212 |
-
9. **Read evidence** — "Deploy 6h ago: cross-table JOIN without LIMIT clause"
|
| 213 |
-
10. **Submit**: Action Type `📝 submit_root_cause`, Service `analytics-service`, Failure Mode: `unbounded query OOM killing postgres-db`, click `▶️ Execute Action`
|
| 214 |
-
11. **Grade**: Click `📊 Grade` — should show **0.85–1.0**
|
| 215 |
-
|
| 216 |
-
### 🔴 Hard Task: Remediation Planning
|
| 217 |
-
|
| 218 |
-
1. **Select Task**: `🔴 Hard — Remediation Planning`, **Scenario**: `Scenario 0`
|
| 219 |
-
2. **Click** `🔄 Reset Environment`
|
| 220 |
-
3. **Diagnose**: `🔍 query_logs` on `postgres-db` → see "analytics-service" breadcrumb
|
| 221 |
-
4. **Confirm**: `🔍 query_logs` on `analytics-service` → see "full_history_export, no LIMIT"
|
| 222 |
-
5. **Fix Step 1**: `🔧 disable_feature_flag`, Flag: `full_history_export` → "job DISABLED"
|
| 223 |
-
6. **Fix Step 2**: `🔧 restart_service` on `analytics-service` → "restarted — idle"
|
| 224 |
-
7. **Fix Step 3**: `🔧 restart_service` on `postgres-db` → "accepting connections (12/500)"
|
| 225 |
-
8. **Fix Step 4**: `🔧 restart_service` on `auth-service` → "reconnected OK"
|
| 226 |
-
9. **Fix Step 5**: `🔧 restart_service` on `order-service` → "writes resuming"
|
| 227 |
-
10. **Verify**: `🔧 execute_runbook_step`, Runbook Action: `verify_db_health` → "healthy"
|
| 228 |
-
11. **Submit**: `📝 submit_resolution`, Summary: *"The analytics-service deployed a full_history_export job with an unbounded query that OOM-killed postgres-db. We disabled the full_history_export flag, restarted analytics-service, then restarted postgres-db, auth-service, and order-service. All services recovered and postgres-db is healthy."*
|
| 229 |
-
12. **Grade**: Click `📊 Grade` — should show **0.85–1.0**
|
| 230 |
-
|
| 231 |
-
### UI Controls Reference
|
| 232 |
-
|
| 233 |
-
| Button | Purpose |
|
| 234 |
-
|---|---|
|
| 235 |
-
| `🔄 Reset Environment` | Start a new episode |
|
| 236 |
-
| `▶️ Execute Action` | Run the selected action |
|
| 237 |
-
| `📋 Parameters` | Expand to fill severity / failure_mode / summary / flag / runbook fields |
|
| 238 |
-
| `📊 Grade` | See final grader score (0.0–1.0) after episode ends |
|
| 239 |
-
| `📋 State` | Refresh the state panel |
|
| 240 |
-
|
| 241 |
-
### Common Mistakes & Penalties
|
| 242 |
|
| 243 |
-
|
|
| 244 |
|---|---|---|
|
| 245 |
-
|
|
| 246 |
-
|
|
| 247 |
-
|
|
| 248 |
-
|
|
| 249 |
-
|
|
| 250 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
|
| 252 |
---
|
| 253 |
|
| 254 |
-
##
|
| 255 |
-
|
| 256 |
-
### Quick Test
|
| 257 |
-
|
| 258 |
-
```bash
|
| 259 |
-
# Reset with defaults (alert_classification, scenario 0)
|
| 260 |
-
curl -X POST http://localhost:7860/reset \
|
| 261 |
-
-H "Content-Type: application/json" -d '{}'
|
| 262 |
-
|
| 263 |
-
# Reset with specific task
|
| 264 |
-
curl -X POST http://localhost:7860/reset \
|
| 265 |
-
-H "Content-Type: application/json" \
|
| 266 |
-
-d '{"task_id": "root_cause_analysis", "scenario_index": 1}'
|
| 267 |
-
|
| 268 |
-
# Take a step
|
| 269 |
-
curl -X POST http://localhost:7860/step \
|
| 270 |
-
-H "Content-Type: application/json" \
|
| 271 |
-
-d '{"action_type": "query_logs", "parameters": {"service": "postgres-db"}}'
|
| 272 |
-
|
| 273 |
-
# Check state
|
| 274 |
-
curl http://localhost:7860/state
|
| 275 |
-
|
| 276 |
-
# Grade current episode
|
| 277 |
-
curl http://localhost:7860/grader
|
| 278 |
-
```
|
| 279 |
|
| 280 |
-
|
| 281 |
|
| 282 |
-
|
| 283 |
-
import requests
|
| 284 |
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 291 |
|
| 292 |
-
|
| 293 |
-
print(f"Services: {obs['known_services']}")
|
| 294 |
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
|
| 301 |
-
|
| 302 |
-
print(f"Done: {result['done']}")
|
| 303 |
|
| 304 |
-
#
|
| 305 |
-
result = requests.post(f"{BASE}/step", json={
|
| 306 |
-
"action_type": "submit_severity",
|
| 307 |
-
"parameters": {"severity": "P1", "service": obs["known_services"][0]}
|
| 308 |
-
}).json()
|
| 309 |
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
|
| 315 |
---
|
| 316 |
|
| 317 |
-
## Setup
|
| 318 |
|
| 319 |
### Local Development
|
| 320 |
-
|
| 321 |
```bash
|
| 322 |
pip install -r requirements.txt
|
| 323 |
uvicorn server.app:app --host 0.0.0.0 --port 7860
|
| 324 |
```
|
| 325 |
|
| 326 |
### Docker
|
| 327 |
-
|
| 328 |
```bash
|
| 329 |
docker build -t cloud-incident-env .
|
| 330 |
docker run -p 7860:7860 cloud-incident-env
|
| 331 |
```
|
| 332 |
|
| 333 |
-
### Run Baseline
|
| 334 |
-
|
| 335 |
```bash
|
| 336 |
-
export API_BASE_URL="https://
|
| 337 |
-
export MODEL_NAME="llama-3.1-
|
| 338 |
-
export HF_TOKEN="
|
| 339 |
python inference.py
|
| 340 |
```
|
| 341 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 342 |
---
|
| 343 |
|
| 344 |
-
## Project Structure
|
| 345 |
|
| 346 |
```
|
| 347 |
-
|
|
|
|
| 348 |
├── README.md # This file
|
| 349 |
├── requirements.txt # Python dependencies
|
| 350 |
-
├── openenv.yaml # OpenEnv
|
| 351 |
-
├──
|
| 352 |
├── tasks.py # 9 scenarios across 3 difficulty levels
|
| 353 |
├── graders.py # Deterministic graders (0.0–1.0)
|
| 354 |
-
├── inference.py # Baseline LLM agent with fallback logic
|
| 355 |
└── server/
|
| 356 |
├── __init__.py
|
| 357 |
├── app.py # FastAPI + Gradio endpoints
|
| 358 |
-
├── environment.py # Core step/reset/state logic
|
| 359 |
└── models.py # Typed Pydantic models (Action, Observation, Reward)
|
| 360 |
```
|
| 361 |
|
| 362 |
---
|
| 363 |
|
| 364 |
-
##
|
| 365 |
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
### Why These Specific Incidents?
|
| 371 |
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
- **Credential rotation bugs** (RCA-003, RP-003): Configuration management failures that cause cascading auth failures — the DB is fine but clients have wrong passwords.
|
| 375 |
|
| 376 |
-
#
|
|
|
|
|
|
|
| 377 |
|
| 378 |
-
|
| 379 |
-
|
|
|
|
|
|
| 18 |
|
| 19 |
# ☁️ Cloud Incident Response — OpenEnv Environment
|
| 20 |
|
| 21 |
+
An OpenEnv environment for training and evaluating AI agents on **cloud SRE incident response** — the real-world on-call workflow that engineers at every cloud company perform daily.
|
| 22 |
|
| 23 |
+
Distinct from Kubernetes operations environments: this focuses on **cross-service cascading failures** in distributed microservice architectures — connection pool exhaustion, CDN cache storms, OOM kills, credential rotation failures, and BGP network partitions.
|
| 24 |
+
|
| 25 |
+
## Authors
|
| 26 |
+
|
| 27 |
+
- **Einstein** — Environment Design & Grader Implementation
|
| 28 |
+
- **Sidra** — Scenario Design & Testing
|
| 29 |
|
| 30 |
---
|
| 31 |
|
| 32 |
+
## 🎯 Why This Environment
|
| 33 |
|
| 34 |
+
Every cloud company employs SREs who respond to production incidents under time pressure with incomplete information. This environment simulates the exact decision loop:
|
| 35 |
|
| 36 |
+
| Phase | What the Agent Does |
|
| 37 |
+
|---|---|
|
| 38 |
+
| **Triage** | Read alert, assess blast radius, classify severity (P1–P4) |
|
| 39 |
+
| **Investigate** | Query logs, metrics, dependencies, recent deploys |
|
| 40 |
+
| **Diagnose** | Correlate signals across services to find root cause |
|
| 41 |
+
| **Remediate** | Execute correct runbook steps in the right sequence |
|
| 42 |
+
| **Document** | Submit resolution summary for post-incident review |
|
| 43 |
+
|
| 44 |
+
Agents trained here learn the same skills a human SRE develops: service dependency traversal, log correlation, cascading failure analysis, and targeted remediation.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
---
|
| 47 |
|
| 48 |
+
## 📊 Baseline Scores
|
| 49 |
|
| 50 |
+
Using `Llama 3.1 8B Instruct` · deterministic (`temperature=0.0`) · fully reproducible
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
+
| Task | Difficulty | S0 | S1 | S2 | Average |
|
| 53 |
+
|---|---|---|---|---|---|
|
| 54 |
+
| `alert_classification` | 🟢 Easy | 1.00 | 1.00 | 1.00 | **1.00** |
|
| 55 |
+
| `root_cause_analysis` | 🟡 Medium | 1.00 | 0.20 | 1.00 | **0.73** |
|
| 56 |
+
| `remediation_planning` | 🔴 Hard | 0.60 | 0.45 | 0.59 | **0.55** |
|
| 57 |
+
| **Overall** | | | | | **0.76** |
|
| 58 |
|
| 59 |
+
### Score Interpretation
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
+
```
|
| 62 |
+
Easy 1.00 ████████████████████ Clear metrics → straightforward classification
|
| 63 |
+
Medium 0.73 ██████████████▌ Root cause hidden — model fails on BGP scenario (S1=0.20)
|
| 64 |
+
Hard 0.55 ███████████ Multi-phase execution with wrong-action penalties
|
| 65 |
+
```
|
| 66 |
|
| 67 |
+
- **Easy → 1.00:** Alert metrics (error rate, revenue impact) directly indicate severity. An 8B model reliably classifies P1/P2/P3 with 2 diagnostic queries.
|
| 68 |
+
- **Medium → 0.73:** Root cause service is NOT in the alert. Model must investigate beyond the blast radius. Succeeds on OOM and credential scenarios but fails on BGP network partition (S1=0.20) where no victim log names the root cause.
|
| 69 |
+
- **Hard → 0.55:** Same diagnostic challenge as medium PLUS multi-step remediation sequence, wrong-action penalties (−0.10 each, capped at −0.30), and documentation quality scoring. Model wastes steps on repeated status checks and sometimes executes counterproductive remediations.
|
| 70 |
+
|
| 71 |
+
---
|
| 72 |
|
| 73 |
+
## 🏗️ Tasks
|
| 74 |
|
| 75 |
+
| Task ID | Difficulty | Max Steps | Objective | Submission Action |
|
| 76 |
|---|---|---|---|---|
|
| 77 |
+
| `alert_classification` | 🟢 Easy | 3 | Classify alert severity (P1–P4) | `submit_severity` |
|
| 78 |
+
| `root_cause_analysis` | 🟡 Medium | 10 | Find root cause service + failure mode | `submit_root_cause` |
|
| 79 |
+
| `remediation_planning` | 🔴 Hard | 15 | Diagnose + remediate + document | `submit_resolution` |
|
| 80 |
|
| 81 |
+
### Scenarios (3 per task = 9 total episodes)
|
| 82 |
|
| 83 |
+
| ID | Incident Type | Root Cause | Why It's Hard |
|
| 84 |
+
|---|---|---|---|
|
| 85 |
+
| AC-001 | DB connection pool exhaustion | — | Clear P1: 78% errors, $12k/min revenue loss |
|
| 86 |
+
| AC-002 | CDN cache invalidation storm | — | Ambiguous P2: degraded but checkout works |
|
| 87 |
+
| AC-003 | Recommendation service errors | — | Trap P3: 45% errors but zero revenue impact |
|
| 88 |
+
| RCA-001 | Postgres OOM kill | analytics-service | Must correlate "analytics export query" in DB logs |
|
| 89 |
+
| RCA-002 | BGP network partition | network-infra | No victim log names network-infra — hardest scenario |
|
| 90 |
+
| RCA-003 | Credential rotation bug | config-service | Must trace "secrets rotation" hint to config-service |
|
| 91 |
+
| RP-001 | Full OOM remediation | analytics-service | 6-step sequence: disable job → restart chain |
|
| 92 |
+
| RP-002 | Full BGP remediation | network-infra | 4-step sequence: restore routes → rollback → verify |
|
| 93 |
+
| RP-003 | Full credential fix | config-service | 7-step sequence: rollback → rotate → restart → verify |
|
| 94 |
|
| 95 |
+
---
|
| 96 |
|
| 97 |
+
## 🎮 Action Space
|
| 98 |
|
| 99 |
+
### Diagnostic Actions (gather evidence)
|
| 100 |
```json
|
| 101 |
+
{"action_type": "query_logs", "parameters": {"service": "<name>"}}
|
| 102 |
+
{"action_type": "check_metrics", "parameters": {"service": "<name>"}}
|
| 103 |
+
{"action_type": "check_dependencies", "parameters": {"service": "<name>"}}
|
| 104 |
+
{"action_type": "check_recent_deploys", "parameters": {"service": "<name>"}}
|
| 105 |
+
{"action_type": "check_service_status", "parameters": {"service": "<name>"}}
|
| 106 |
```
|
| 107 |
|
| 108 |
+
### Remediation Actions (fix the incident)
|
|
|
|
| 109 |
```json
|
| 110 |
+
{"action_type": "restart_service", "parameters": {"service": "<name>"}}
|
| 111 |
+
{"action_type": "rollback_deploy", "parameters": {"service": "<name>"}}
|
| 112 |
+
{"action_type": "scale_service", "parameters": {"service": "<name>", "replicas": 10}}
|
| 113 |
+
{"action_type": "disable_feature_flag", "parameters": {"flag": "<flag_name>"}}
|
| 114 |
+
{"action_type": "clear_cache", "parameters": {"service": "<name>"}}
|
| 115 |
+
{"action_type": "execute_runbook_step", "parameters": {"runbook_action": "<action>"}}
|
| 116 |
```
|
| 117 |
|
| 118 |
+
### Submission Actions (end the episode)
|
|
|
|
| 119 |
```json
|
| 120 |
+
{"action_type": "submit_severity", "parameters": {"severity": "P1|P2|P3|P4", "service": "<name>"}}
|
| 121 |
+
{"action_type": "submit_root_cause", "parameters": {"service": "<name>", "failure_mode": "<description>"}}
|
| 122 |
+
{"action_type": "submit_resolution", "parameters": {"summary": "<3+ sentence summary>"}}
|
| 123 |
```
|
| 124 |
|
| 125 |
---
|
| 126 |
|
| 127 |
+
## 👁️ Observation Space
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
|
| 129 |
+
| Field | Type | Description |
|
| 130 |
|---|---|---|
|
| 131 |
+
| `episode_id` | string | Unique episode UUID |
|
| 132 |
+
| `task_id` | string | Active task identifier |
|
| 133 |
+
| `scenario_id` | string | Current scenario (e.g., `RCA-001`) |
|
| 134 |
+
| `step_count` / `max_steps` | int | Progress through episode |
|
| 135 |
+
| `incident_summary` | string | Plain-text incident description (no root cause hints) |
|
| 136 |
+
| `alert` | dict | Alert payload with severity, symptoms, affected services |
|
| 137 |
+
| `available_actions` | list | Valid action types for this task |
|
| 138 |
+
| `queried_data` | dict | All evidence gathered so far |
|
| 139 |
+
| `known_services` | list | Exact service names valid for actions |
|
| 140 |
+
| `cumulative_reward` | float | Running reward total |
|
| 141 |
+
| `done` | bool | Episode terminal flag |
|
| 142 |
+
| `feedback` | string | Per-step feedback explaining reward |
|
| 143 |
+
| `last_action_error` | string? | Error message if last action was invalid |
|
| 144 |
|
| 145 |
---
|
| 146 |
|
| 147 |
+
## 💰 Reward Function
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
|
| 149 |
+
Dense reward shaping throughout the trajectory — not just terminal scoring.
|
| 150 |
|
| 151 |
+
### Per-Step Rewards
|
|
|
|
| 152 |
|
| 153 |
+
| Event | Easy | Medium | Hard |
|
| 154 |
+
|---|---|---|---|
|
| 155 |
+
| Query new service (first time) | +0.04 | +0.04 | +0.03 |
|
| 156 |
+
| Query new action on known service | +0.02 | +0.02 | +0.01 |
|
| 157 |
+
| Repeat exact same query | −0.03 | −0.04 | −0.03 |
|
| 158 |
+
| Query unknown service | −0.06 | −0.06 | −0.05 |
|
| 159 |
+
| Correct remediation action | — | +0.06 | +0.06 |
|
| 160 |
+
| Wrong remediation action | −0.08 | −0.10 | −0.15 |
|
| 161 |
+
| Step past halfway (non-submit) | −0.04 | −0.02 | −0.02 |
|
| 162 |
+
| Timeout without submission | −0.15 | −0.15 | −0.20 |
|
| 163 |
|
| 164 |
+
### Grader Scoring (terminal, deterministic)
|
|
|
|
| 165 |
|
| 166 |
+
| Task | Scoring Logic |
|
| 167 |
+
|---|---|
|
| 168 |
+
| `alert_classification` | 1.0 exact · 0.5 adjacent · 0.25 two-off · 0.0 wrong |
|
| 169 |
+
| `root_cause_analysis` | Up to 0.6 base (service + failure mode) + up to 0.4 efficiency bonus. Wrong service: 0.05–0.20 based on investigation effort |
|
| 170 |
+
| `remediation_planning` | Scaled base (0.10–0.50: 0.10 + up to 0.20 for diagnostic actions + up to 0.20 for remediation actions) + 0.30 efficiency − up to 0.30 wrong-action penalty + 0.10 summary quality |
|
| 171 |
|
| 172 |
+
---
|
|
|
|
| 173 |
|
| 174 |
+
## 🔌 API Endpoints
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
|
| 176 |
+
| Method | Path | Description |
|
| 177 |
+
|---|---|---|
|
| 178 |
+
| `GET` | `/` | Gradio UI — interactive environment demo |
|
| 179 |
+
| `GET` | `/health` | `{"status":"ok","version":"0.1.0"}` |
|
| 180 |
+
| `POST` | `/reset` | Start new episode (accepts `task_id`, `scenario_index`) |
|
| 181 |
+
| `POST` | `/step` | Submit action → returns observation, reward, done, info |
|
| 182 |
+
| `GET` | `/state` | Full current episode state with action history |
|
| 183 |
+
| `GET` | `/tasks` | All tasks with action schemas |
|
| 184 |
+
| `GET` | `/grader` | Score current episode (0.0–1.0) with breakdown |
|
| 185 |
|
| 186 |
---
|
| 187 |
|
| 188 |
+
## 🚀 Setup & Usage
|
| 189 |
|
| 190 |
### Local Development
|
|
|
|
| 191 |
```bash
|
| 192 |
pip install -r requirements.txt
|
| 193 |
uvicorn server.app:app --host 0.0.0.0 --port 7860
|
| 194 |
```
|
| 195 |
|
| 196 |
### Docker
|
|
|
|
| 197 |
```bash
|
| 198 |
docker build -t cloud-incident-env .
|
| 199 |
docker run -p 7860:7860 cloud-incident-env
|
| 200 |
```
|
| 201 |
|
| 202 |
+
### Run Baseline Inference
|
|
|
|
| 203 |
```bash
|
| 204 |
+
export API_BASE_URL="https://router.huggingface.co/v1"
|
| 205 |
+
export MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct"
|
| 206 |
+
export HF_TOKEN="your_token"
|
| 207 |
python inference.py
|
| 208 |
```
|
| 209 |
|
| 210 |
+
### Quick API Test
|
| 211 |
+
```bash
|
| 212 |
+
# Reset
|
| 213 |
+
curl -X POST "http://localhost:7860/reset?task_id=alert_classification&scenario_index=0"
|
| 214 |
+
|
| 215 |
+
# Step
|
| 216 |
+
curl -X POST http://localhost:7860/step \
|
| 217 |
+
-H "Content-Type: application/json" \
|
| 218 |
+
-d '{"action_type":"query_logs","parameters":{"service":"api-gateway"}}'
|
| 219 |
+
|
| 220 |
+
# Grade
|
| 221 |
+
curl http://localhost:7860/grader
|
| 222 |
+
```
|
| 223 |
+
|
| 224 |
---
|
| 225 |
|
| 226 |
+
## 📁 Project Structure
|
| 227 |
|
| 228 |
```
|
| 229 |
+
.
|
| 230 |
+
├── Dockerfile # Container build
|
| 231 |
├── README.md # This file
|
| 232 |
├── requirements.txt # Python dependencies
|
| 233 |
+
├── openenv.yaml # OpenEnv metadata + task definitions
|
| 234 |
+
├── inference.py # Baseline agent (OpenAI client + smart fallback)
|
| 235 |
├── tasks.py # 9 scenarios across 3 difficulty levels
|
| 236 |
├── graders.py # Deterministic graders (0.0–1.0)
|
|
|
|
| 237 |
└── server/
|
| 238 |
├── __init__.py
|
| 239 |
├── app.py # FastAPI + Gradio endpoints
|
| 240 |
+
├── environment.py # Core step()/reset()/state() logic
|
| 241 |
└── models.py # Typed Pydantic models (Action, Observation, Reward)
|
| 242 |
```
|
| 243 |
|
| 244 |
---
|
| 245 |
|
| 246 |
+
## ✅ Validation
|
| 247 |
|
| 248 |
+
```bash
|
| 249 |
+
# OpenEnv spec validation
|
| 250 |
+
openenv validate # → [OK] Ready for multi-mode deployment
|
|
|
|
|
|
|
| 251 |
|
| 252 |
+
# Docker build
|
| 253 |
+
docker build -t cloud-incident-env . # → builds successfully
|
|
|
|
| 254 |
|
| 255 |
+
# Health check
|
| 256 |
+
curl http://localhost:7860/health # → {"status":"ok","version":"0.1.0"}
|
| 257 |
+
```
|
| 258 |
|
| 259 |
+
## Team
|
| 260 |
+
- **Einstein** — [@MrEinsteinE](https://github.com/MrEinsteinE)
|
| 261 |
+
- **Sidra** — [@sidraaiman](https://github.com/sidraaiman)
|
graders.py
CHANGED
|
@@ -182,8 +182,20 @@ def _grade_root_cause_analysis(state: dict, scenario: dict) -> dict:
|
|
| 182 |
elif svc_match:
|
| 183 |
base, base_fb = 0.35, "Correct service only — failure mode unclear"
|
| 184 |
else:
|
| 185 |
-
|
| 186 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
)
|
| 188 |
|
| 189 |
efficiency = 0.0
|
|
@@ -279,7 +291,22 @@ def _grade_remediation_planning(state: dict, scenario: dict) -> dict:
|
|
| 279 |
"feedback": "No resolution submitted or no investigation — score 0.0",
|
| 280 |
}
|
| 281 |
|
| 282 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 283 |
|
| 284 |
executed = set()
|
| 285 |
for a in history:
|
|
@@ -325,7 +352,8 @@ def _grade_remediation_planning(state: dict, scenario: dict) -> dict:
|
|
| 325 |
in wrong_map
|
| 326 |
)
|
| 327 |
)
|
| 328 |
-
penalty = round(min(0.
|
|
|
|
| 329 |
|
| 330 |
sl = summary.lower()
|
| 331 |
hits = sum(1 for kw in keywords if kw in sl)
|
|
|
|
| 182 |
elif svc_match:
|
| 183 |
base, base_fb = 0.35, "Correct service only — failure mode unclear"
|
| 184 |
else:
|
| 185 |
+
# Give MORE partial credit for investigation effort even with wrong answer
|
| 186 |
+
pre_submit_diag = [
|
| 187 |
+
a for a in history[:sub_step]
|
| 188 |
+
if a.get("action_type") in diag_types
|
| 189 |
+
]
|
| 190 |
+
investigated = len({
|
| 191 |
+
a.get("parameters", {}).get("service", "").lower()
|
| 192 |
+
for a in pre_submit_diag
|
| 193 |
+
} - {""})
|
| 194 |
+
# 0.05 base + up to 0.15 for investigating 3+ services
|
| 195 |
+
wrong_base = min(0.20, 0.05 + investigated * 0.05)
|
| 196 |
+
base, base_fb = wrong_base, (
|
| 197 |
+
f"Wrong service: '{sub_svc}' (correct: '{correct_svc}') — "
|
| 198 |
+
f"investigated {investigated} services"
|
| 199 |
)
|
| 200 |
|
| 201 |
efficiency = 0.0
|
|
|
|
| 291 |
"feedback": "No resolution submitted or no investigation — score 0.0",
|
| 292 |
}
|
| 293 |
|
| 294 |
+
# Base scales with investigation depth — not a free 0.60
|
| 295 |
+
diag_count = sum(
|
| 296 |
+
1 for a in history if a.get("action_type") in {
|
| 297 |
+
"query_logs", "check_metrics", "check_dependencies",
|
| 298 |
+
"check_recent_deploys", "check_service_status",
|
| 299 |
+
}
|
| 300 |
+
)
|
| 301 |
+
rem_count = sum(
|
| 302 |
+
1 for a in history if a.get("action_type") in {
|
| 303 |
+
"restart_service", "rollback_deploy", "scale_service",
|
| 304 |
+
"disable_feature_flag", "clear_cache", "execute_runbook_step",
|
| 305 |
+
}
|
| 306 |
+
)
|
| 307 |
+
diag_credit = min(0.20, diag_count * 0.05)
|
| 308 |
+
rem_credit = min(0.20, rem_count * 0.05)
|
| 309 |
+
base = round(0.10 + diag_credit + rem_credit, 4)
|
| 310 |
|
| 311 |
executed = set()
|
| 312 |
for a in history:
|
|
|
|
| 352 |
in wrong_map
|
| 353 |
)
|
| 354 |
)
|
| 355 |
+
penalty = round(min(0.30, wrong_count * 0.10), 4)
|
| 356 |
+
|
| 357 |
|
| 358 |
sl = summary.lower()
|
| 359 |
hits = sum(1 for kw in keywords if kw in sl)
|
inference.py
CHANGED
|
@@ -13,7 +13,13 @@ import sys
|
|
| 13 |
import time
|
| 14 |
|
| 15 |
import requests
|
|
|
|
|
|
|
|
|
|
| 16 |
|
|
|
|
|
|
|
|
|
|
| 17 |
try:
|
| 18 |
from dotenv import load_dotenv
|
| 19 |
load_dotenv()
|
|
@@ -23,10 +29,10 @@ except ImportError:
|
|
| 23 |
# ── Config ──────────────────────────────────────────────────────────────────
|
| 24 |
API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.groq.com/openai/v1")
|
| 25 |
MODEL_NAME = os.environ.get("MODEL_NAME", "llama-3.1-8b-instant")
|
| 26 |
-
|
| 27 |
ENV_BASE_URL = os.environ.get("ENV_BASE_URL", "http://localhost:7860")
|
| 28 |
|
| 29 |
-
if not
|
| 30 |
print("[WARN] No API key set — LLM calls will fail.", file=sys.stderr)
|
| 31 |
|
| 32 |
_session = requests.Session()
|
|
@@ -37,7 +43,7 @@ def _get_client():
|
|
| 37 |
global _client
|
| 38 |
if _client is None:
|
| 39 |
from openai import OpenAI
|
| 40 |
-
_client = OpenAI(api_key=
|
| 41 |
return _client
|
| 42 |
|
| 43 |
|
|
@@ -370,7 +376,7 @@ def _should_override(
|
|
| 370 |
return False
|
| 371 |
|
| 372 |
|
| 373 |
-
def _llm_call_with_retry(messages: list, max_retries: int =
|
| 374 |
"""Call LLM with retry on rate limit errors."""
|
| 375 |
for attempt in range(max_retries + 1):
|
| 376 |
try:
|
|
@@ -387,7 +393,7 @@ def _llm_call_with_retry(messages: list, max_retries: int = 2) -> str:
|
|
| 387 |
if "rate_limit" in err_str or "429" in err_str:
|
| 388 |
if attempt < max_retries:
|
| 389 |
# Parse wait time from error or use default
|
| 390 |
-
wait =
|
| 391 |
print(f" [RATE LIMIT] waiting {wait}s (attempt {attempt + 1})",
|
| 392 |
file=sys.stderr)
|
| 393 |
time.sleep(wait)
|
|
@@ -399,6 +405,12 @@ def _llm_call_with_retry(messages: list, max_retries: int = 2) -> str:
|
|
| 399 |
|
| 400 |
|
| 401 |
def _run_episode(task_id: str, scenario_index: int) -> float:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 402 |
r = _session.post(
|
| 403 |
f"{ENV_BASE_URL}/reset",
|
| 404 |
params={"task_id": task_id, "scenario_index": scenario_index},
|
|
@@ -472,44 +484,191 @@ def _run_episode(task_id: str, scenario_index: int) -> float:
|
|
| 472 |
|
| 473 |
def main():
|
| 474 |
runs = [
|
| 475 |
-
("alert_classification",
|
| 476 |
-
("alert_classification",
|
| 477 |
-
("alert_classification",
|
| 478 |
-
("root_cause_analysis",
|
| 479 |
-
("root_cause_analysis",
|
| 480 |
-
("root_cause_analysis",
|
| 481 |
-
("remediation_planning",
|
| 482 |
-
("remediation_planning",
|
| 483 |
-
("remediation_planning",
|
| 484 |
]
|
| 485 |
|
| 486 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 487 |
|
| 488 |
-
print(
|
| 489 |
-
print("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 490 |
|
| 491 |
for task_id, scenario_index in runs:
|
| 492 |
try:
|
| 493 |
-
score =
|
| 494 |
except Exception as e:
|
| 495 |
-
print(f" [ERROR] {task_id}
|
| 496 |
-
|
| 497 |
-
score = 0.0
|
| 498 |
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
| 502 |
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
|
|
|
| 506 |
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 510 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 511 |
print(json.dumps(summary))
|
| 512 |
|
| 513 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 514 |
if __name__ == "__main__":
|
| 515 |
main()
|
|
|
|
| 13 |
import time
|
| 14 |
|
| 15 |
import requests
|
| 16 |
+
import time as _time
|
| 17 |
+
_START = _time.time()
|
| 18 |
+
_MAX_RUNTIME = 1080
|
| 19 |
|
| 20 |
+
def _check_timeout():
    """Raise RuntimeError once the script has run longer than _MAX_RUNTIME seconds.

    Elapsed time is measured from the module-level _START timestamp.
    """
    elapsed = _time.time() - _START
    if elapsed <= _MAX_RUNTIME:
        return
    raise RuntimeError("Approaching 20min limit — stopping early")
|
| 23 |
try:
|
| 24 |
from dotenv import load_dotenv
|
| 25 |
load_dotenv()
|
|
|
|
| 29 |
# ── Config ──────────────────────────────────────────────────────────────────
|
| 30 |
API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.groq.com/openai/v1")
|
| 31 |
MODEL_NAME = os.environ.get("MODEL_NAME", "llama-3.1-8b-instant")
|
| 32 |
+
API_KEY = os.environ.get("HF_TOKEN") or os.environ.get("API_KEY") or ""
|
| 33 |
ENV_BASE_URL = os.environ.get("ENV_BASE_URL", "http://localhost:7860")
|
| 34 |
|
| 35 |
+
if not API_KEY:
|
| 36 |
print("[WARN] No API key set — LLM calls will fail.", file=sys.stderr)
|
| 37 |
|
| 38 |
_session = requests.Session()
|
|
|
|
| 43 |
global _client
|
| 44 |
if _client is None:
|
| 45 |
from openai import OpenAI
|
| 46 |
+
_client = OpenAI(api_key=API_KEY, base_url=API_BASE_URL)
|
| 47 |
return _client
|
| 48 |
|
| 49 |
|
|
|
|
| 376 |
return False
|
| 377 |
|
| 378 |
|
| 379 |
+
def _llm_call_with_retry(messages: list, max_retries: int = 1) -> str:
|
| 380 |
"""Call LLM with retry on rate limit errors."""
|
| 381 |
for attempt in range(max_retries + 1):
|
| 382 |
try:
|
|
|
|
| 393 |
if "rate_limit" in err_str or "429" in err_str:
|
| 394 |
if attempt < max_retries:
|
| 395 |
# Parse wait time from error or use default
|
| 396 |
+
wait = 5 * (attempt + 1)
|
| 397 |
print(f" [RATE LIMIT] waiting {wait}s (attempt {attempt + 1})",
|
| 398 |
file=sys.stderr)
|
| 399 |
time.sleep(wait)
|
|
|
|
| 405 |
|
| 406 |
|
| 407 |
def _run_episode(task_id: str, scenario_index: int) -> float:
|
| 408 |
+
if _time.time() - _START > _MAX_RUNTIME:
|
| 409 |
+
print(f" [TIMEOUT] Approaching 20min limit — skipping {task_id} s{scenario_index}",
|
| 410 |
+
file=sys.stderr)
|
| 411 |
+
return 0.0
|
| 412 |
+
_check_timeout()
|
| 413 |
+
|
| 414 |
r = _session.post(
|
| 415 |
f"{ENV_BASE_URL}/reset",
|
| 416 |
params={"task_id": task_id, "scenario_index": scenario_index},
|
|
|
|
| 484 |
|
| 485 |
def main():
    """Run the baseline agent over every (task, scenario) pair and report scores.

    Prints a per-episode table, a per-task summary table, and a difficulty
    progression check, then emits the summary dict as a single JSON line
    (the last line written to stdout).

    An episode that raises is logged to stderr and recorded with zero
    score/steps/actions/reward so the remaining episodes still run.
    """
    runs = [
        ("alert_classification", 0),
        ("alert_classification", 1),
        ("alert_classification", 2),
        ("root_cause_analysis", 0),
        ("root_cause_analysis", 1),
        ("root_cause_analysis", 2),
        ("remediation_planning", 0),
        ("remediation_planning", 1),
        ("remediation_planning", 2),
    ]

    # Insertion order doubles as the display order of the summary table.
    difficulty_by_task = {
        "alert_classification": "🟢 Easy",
        "root_cause_analysis": "🟡 Medium",
        "remediation_planning": "🔴 Hard",
    }

    # Step budget shown in the "Steps" column (used/max).
    max_steps_by_task = {
        "alert_classification": 3,
        "root_cause_analysis": 10,
        "remediation_planning": 15,
    }

    results: dict[str, list[dict]] = {}

    print()
    print("=" * 100)
    print(" ☁️ CLOUD INCIDENT RESPONSE — BASELINE INFERENCE")
    print("=" * 100)
    print(f" Model: {MODEL_NAME}")
    print(f" Endpoint: {API_BASE_URL}")
    print("=" * 100)
    print()

    # Table header
    print(f"{'Task':<24} {'Difficulty':<12} {'Scenario':>8} {'Steps':>10} {'Actions':>10} {'Reward':>10} {'Score':>10}")
    print("─" * 100)

    for task_id, scenario_index in runs:
        try:
            score, steps_used, actions_taken, cumulative_reward = _run_episode_detailed(task_id, scenario_index)
        except Exception as e:
            # Deliberate catch-all: one failed episode must not abort the whole run.
            print(f" [ERROR] {task_id} scenario {scenario_index}: {e}", file=sys.stderr)
            score, steps_used, actions_taken, cumulative_reward = 0.0, 0, 0, 0.0

        difficulty = difficulty_by_task.get(task_id, "?")
        max_steps = max_steps_by_task.get(task_id, "?")
        steps_display = f"{steps_used}/{max_steps}"

        print(
            f"{task_id:<24} {difficulty:<12} {scenario_index:>8} "
            f"{steps_display:>10} {actions_taken:>10} {cumulative_reward:>+10.4f} {score:>10.4f}"
        )

        results.setdefault(task_id, []).append({
            "scenario": scenario_index,
            "score": score,
            "steps": steps_used,
            "actions": actions_taken,
            "reward": cumulative_reward,
        })

    print("─" * 100)
    print()

    # Summary table
    print("=" * 100)
    print(" 📊 SUMMARY BY TASK")
    print("=" * 100)
    print(f"{'Task':<24} {'Difficulty':<12} {'Avg Score':>10} {'Avg Steps':>10} {'Scenarios':>20}")
    print("─" * 100)

    summary = {}
    for task_id, difficulty in difficulty_by_task.items():
        data = results.get(task_id)
        if not data:
            continue
        avg_score = sum(d["score"] for d in data) / len(data)
        avg_steps = sum(d["steps"] for d in data) / len(data)
        scenario_scores = " | ".join(f'{d["score"]:.2f}' for d in data)

        print(f"{task_id:<24} {difficulty:<12} {avg_score:>10.4f} {avg_steps:>10.1f} {scenario_scores:>20}")
        summary[task_id] = round(avg_score, 4)

    # Guard the mean so an empty result set reports 0.0 instead of dividing by zero.
    summary["overall"] = round(sum(summary.values()) / len(summary), 4) if summary else 0.0

    print("─" * 100)
    print(f"{'OVERALL':<24} {'':12} {summary['overall']:>10.4f}")
    print("=" * 100)
    print()

    # Difficulty progression check
    easy = summary.get("alert_classification", 0)
    med = summary.get("root_cause_analysis", 0)
    hard = summary.get("remediation_planning", 0)

    if easy > med > hard:
        print(" ✅ Difficulty Progression: Easy (%.2f) > Medium (%.2f) > Hard (%.2f)" % (easy, med, hard))
    elif easy > med and easy > hard:
        # Reaching this branch means med <= hard; print the values rather than
        # claiming the two are approximately equal, which may not be true.
        print(" ⚠️ Difficulty Progression: Easy highest, but Medium (%.2f) <= Hard (%.2f)" % (med, hard))
    else:
        print(" ❌ Difficulty Progression: Unexpected order")

    print()
    print(json.dumps(summary))
|
| 593 |
|
| 594 |
|
| 595 |
+
def _run_episode_detailed(task_id: str, scenario_index: int) -> tuple[float, int, int, float]:
    """Run one episode and return (score, steps_used, actions_taken, cumulative_reward).

    Resets the environment for the given task/scenario, then loops up to the
    observation's ``max_steps`` (default 10): ask the LLM for an action, fall
    back to ``_smart_fallback`` when the reply cannot be parsed or
    ``_should_override`` rejects it, POST the action to ``/step``, and stop
    early when the environment reports ``done``. The final score is read from
    ``GET /grader``.

    Raises:
        RuntimeError: from ``_check_timeout`` when the global runtime budget
            is already exhausted before the episode starts.
        requests.HTTPError: when the environment server answers with an
            error status.
    """
    # main() drives this function rather than _run_episode(), so the runtime
    # guard has to be applied here for the time limit to take effect.
    _check_timeout()

    reset_resp = _session.post(
        f"{ENV_BASE_URL}/reset",
        params={"task_id": task_id, "scenario_index": scenario_index},
        timeout=30,
    )
    reset_resp.raise_for_status()
    obs = reset_resp.json()

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": _first_obs_msg(obs)},
    ]

    prev_queried: dict = {}
    max_steps = obs.get("max_steps", 10)
    actions_taken = 0
    cumulative_reward = 0.0
    # Pre-bind so the return below is valid even when max_steps is 0 and the
    # loop body never executes.
    current_step = 0

    for current_step in range(1, max_steps + 1):
        raw = _llm_call_with_retry(messages)
        messages.append({"role": "assistant", "content": raw or "{}"})

        action = None
        if raw and raw.strip():
            try:
                action = _parse(raw)
            except Exception:
                # Best effort: an unparseable reply is handled by the
                # fallback policy below rather than aborting the episode.
                action = None

        if action is None:
            action = _smart_fallback(task_id, obs, current_step, max_steps)
            print(f" [FALLBACK] step {current_step}: {action.get('action_type')}", file=sys.stderr)
        elif _should_override(task_id, action, obs, current_step, max_steps):
            old_at = action.get("action_type")
            action = _smart_fallback(task_id, obs, current_step, max_steps)
            print(f" [OVERRIDE] step {current_step}: {old_at} -> {action.get('action_type')}", file=sys.stderr)

        step_resp = _session.post(f"{ENV_BASE_URL}/step", json=action, timeout=30)
        step_resp.raise_for_status()
        result = step_resp.json()
        new_obs = result["observation"]

        actions_taken += 1
        reward = result["reward"]
        step_reward = reward["value"]
        # Prefer the server's running total; otherwise accumulate locally.
        cumulative_reward = reward.get("cumulative", cumulative_reward + step_reward)
        done = result.get("done", False)

        # Step detail output. str() keeps the width spec valid when the
        # action carries no action_type (None rejects a non-empty format spec).
        print(
            f" step {current_step:>2}: {str(action.get('action_type')):<28} "
            f"reward={step_reward:+.3f} done={done}"
        )

        if done:
            break

        messages.append({"role": "user", "content": _step_msg(new_obs, prev_queried)})
        # Snapshot the queried data (one-level copy of each dict entry) to pass
        # as the previous state on the next _step_msg call.
        prev_queried = {
            k: dict(v)
            for k, v in new_obs.get("queried_data", {}).items()
            if isinstance(v, dict)
        }
        obs = new_obs

        # Bound the context: keep the system prompt and first observation
        # plus the 16 most recent messages.
        if len(messages) > 20:
            messages = messages[:2] + messages[-16:]

    grader_resp = _session.get(f"{ENV_BASE_URL}/grader", timeout=30)
    grader_resp.raise_for_status()
    score = grader_resp.json().get("total", 0.0)

    return score, current_step, actions_taken, cumulative_reward
|
| 671 |
+
|
| 672 |
+
|
| 673 |
if __name__ == "__main__":
|
| 674 |
main()
|
openenv.yaml
CHANGED
|
@@ -8,7 +8,11 @@ description: >
|
|
| 8 |
across distributed systems. An AI agent classifies alert severity, performs
|
| 9 |
root cause analysis through log/metric/dependency queries, and executes
|
| 10 |
remediation sequences to resolve production incidents end-to-end.
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
license: MIT
|
| 13 |
tags:
|
| 14 |
- openenv
|
|
@@ -59,4 +63,7 @@ endpoints:
|
|
| 59 |
state: "GET /state"
|
| 60 |
tasks: "GET /tasks"
|
| 61 |
grader: "GET /grader"
|
| 62 |
-
baseline: "POST /baseline"
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
across distributed systems. An AI agent classifies alert severity, performs
|
| 9 |
root cause analysis through log/metric/dependency queries, and executes
|
| 10 |
remediation sequences to resolve production incidents end-to-end.
|
| 11 |
+
authors:
|
| 12 |
+
- name: "Einstein"
|
| 13 |
+
github: "MrEinsteinE"
|
| 14 |
+
- name: "Sidra"
|
| 15 |
+
github: "sidraaiman"
|
| 16 |
license: MIT
|
| 17 |
tags:
|
| 18 |
- openenv
|
|
|
|
| 63 |
state: "GET /state"
|
| 64 |
tasks: "GET /tasks"
|
| 65 |
grader: "GET /grader"
|
| 66 |
+
baseline: "POST /baseline"
|
| 67 |
+
|
| 68 |
+
repo: "https://github.com/MrEinsteinE/cloud-incident-response-openenv"
|
| 69 |
+
space: "https://huggingface.co/spaces/Elliot89/cloud-incident-response"
|
requirements.txt
CHANGED
|
@@ -5,5 +5,5 @@ requests>=2.31.0
|
|
| 5 |
openai>=1.58.0
|
| 6 |
httpx>=0.27.0,<0.29.0
|
| 7 |
python-dotenv>=1.0.0
|
| 8 |
-
gradio>=4.
|
| 9 |
openenv-core>=0.2.0
|
|
|
|
| 5 |
openai>=1.58.0
|
| 6 |
httpx>=0.27.0,<0.29.0
|
| 7 |
python-dotenv>=1.0.0
|
| 8 |
+
gradio>=4.44.0,<5.0.0
|
| 9 |
openenv-core>=0.2.0
|
server/app.py
CHANGED
|
@@ -254,7 +254,15 @@ def baseline():
|
|
| 254 |
except Exception:
|
| 255 |
return {"raw_output": result.stdout[-3000:]}
|
| 256 |
|
| 257 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 258 |
# ── Gradio UI ─────────────────────────────────────────────────────────────────
|
| 259 |
|
| 260 |
import gradio as gr
|
|
|
|
| 254 |
except Exception:
|
| 255 |
return {"raw_output": result.stdout[-3000:]}
|
| 256 |
|
| 257 |
+
@app.get("/status")
def root_status():
    """Health check served at /status: static service metadata plus the task ids."""
    payload = {
        "status": "running",
        "name": "cloud-incident-response",
        "version": "0.1.0",
    }
    payload["tasks"] = list(ALL_TASKS.keys())
    return payload
|
| 266 |
# ── Gradio UI ─────────────────────────────────────────────────────────────────
|
| 267 |
|
| 268 |
import gradio as gr
|
server/environment.py
CHANGED
|
@@ -250,7 +250,7 @@ class IncidentEnvironment:
|
|
| 250 |
svc = (params.service or "").lower().strip()
|
| 251 |
flag = (params.flag or "").lower().strip()
|
| 252 |
runbook = (params.runbook_action or "").lower().strip()
|
| 253 |
-
target = (params.target or "").lower().strip()
|
| 254 |
|
| 255 |
if not (svc or flag or runbook or target):
|
| 256 |
r += rt["rem_no_target"]
|
|
|
|
| 250 |
svc = (params.service or "").lower().strip()
|
| 251 |
flag = (params.flag or "").lower().strip()
|
| 252 |
runbook = (params.runbook_action or "").lower().strip()
|
| 253 |
+
target = (params.target or params.target_version or "").lower().strip()
|
| 254 |
|
| 255 |
if not (svc or flag or runbook or target):
|
| 256 |
r += rt["rem_no_target"]
|
tasks.py
CHANGED
|
@@ -373,12 +373,13 @@ SCENARIOS: dict = {
|
|
| 373 |
# In RCA-001, replace the query_logs section:
|
| 374 |
"query_logs": {
|
| 375 |
"postgres-db": (
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
|
|
|
| 382 |
"analytics-service": (
|
| 383 |
"2024-03-16T01:58:00Z INFO starting scheduled job: full_history_export\n"
|
| 384 |
"2024-03-16T01:58:01Z DEBUG executing: SELECT * FROM events "
|
|
@@ -678,18 +679,20 @@ SCENARIOS: dict = {
|
|
| 678 |
# In RCA-003, replace query_logs:
|
| 679 |
"query_logs": {
|
| 680 |
"user-service": (
|
| 681 |
-
|
| 682 |
-
|
| 683 |
-
|
| 684 |
-
|
| 685 |
-
|
| 686 |
-
|
| 687 |
-
|
|
|
|
| 688 |
"notification-service": (
|
| 689 |
-
|
| 690 |
-
|
| 691 |
-
|
| 692 |
-
|
|
|
|
| 693 |
"api-gateway": (
|
| 694 |
"2024-03-18T08:14:10Z ERROR upstream user-service: 503\n"
|
| 695 |
"2024-03-18T08:14:11Z ERROR upstream notification-service: 503"
|
|
@@ -842,10 +845,12 @@ SCENARIOS: dict = {
|
|
| 842 |
},
|
| 843 |
"tool_responses": {
|
| 844 |
"query_logs": {
|
| 845 |
-
|
| 846 |
-
|
| 847 |
-
|
| 848 |
-
|
|
|
|
|
|
|
| 849 |
"analytics-service": (
|
| 850 |
"INFO: starting job full_history_export\n"
|
| 851 |
"WARN: query plan: 847M rows, cross-table JOIN, no LIMIT\n"
|
|
@@ -1102,14 +1107,16 @@ SCENARIOS: dict = {
|
|
| 1102 |
"tool_responses": {
|
| 1103 |
"query_logs": {
|
| 1104 |
"user-service": (
|
| 1105 |
-
|
| 1106 |
-
|
| 1107 |
-
|
| 1108 |
-
|
| 1109 |
-
|
| 1110 |
-
|
| 1111 |
-
|
| 1112 |
-
|
|
|
|
|
|
|
| 1113 |
"api-gateway": (
|
| 1114 |
"ERROR: upstream user-service 503\n"
|
| 1115 |
"ERROR: upstream notification-service 503"
|
|
|
|
| 373 |
# In RCA-001, replace the query_logs section:
|
| 374 |
"query_logs": {
|
| 375 |
"postgres-db": (
|
| 376 |
+
"2024-03-16T02:11:00Z LOG database system shut down\n"
|
| 377 |
+
"2024-03-16T02:10:58Z FATAL terminated by kernel OOM killer\n"
|
| 378 |
+
"2024-03-16T02:10:30Z LOG long-running analytics export query "
|
| 379 |
+
"consuming 31.8GB/32GB — sequential scan on events table "
|
| 380 |
+
"with cross-join, running 12 minutes, no LIMIT clause. "
|
| 381 |
+
"Investigate analytics-service scheduled jobs"
|
| 382 |
+
),
|
| 383 |
"analytics-service": (
|
| 384 |
"2024-03-16T01:58:00Z INFO starting scheduled job: full_history_export\n"
|
| 385 |
"2024-03-16T01:58:01Z DEBUG executing: SELECT * FROM events "
|
|
|
|
| 679 |
# In RCA-003, replace query_logs:
|
| 680 |
"query_logs": {
|
| 681 |
"user-service": (
|
| 682 |
+
"2024-03-18T08:14:00Z FATAL password authentication failed "
|
| 683 |
+
"for user 'app_user'\n"
|
| 684 |
+
"2024-03-18T08:14:01Z ERROR DB credentials rejected — "
|
| 685 |
+
"credentials were last pushed by config-service secrets "
|
| 686 |
+
"rotation at 08:12:00Z\n"
|
| 687 |
+
"2024-03-18T08:14:02Z WARN credential hash mismatch — "
|
| 688 |
+
"check config-service rotation job for issues"
|
| 689 |
+
),
|
| 690 |
"notification-service": (
|
| 691 |
+
"2024-03-18T08:14:05Z FATAL password authentication failed "
|
| 692 |
+
"for user 'app_user'\n"
|
| 693 |
+
"2024-03-18T08:14:06Z WARN credentials from config-service "
|
| 694 |
+
"rotation at 08:12:00Z appear invalid"
|
| 695 |
+
),
|
| 696 |
"api-gateway": (
|
| 697 |
"2024-03-18T08:14:10Z ERROR upstream user-service: 503\n"
|
| 698 |
"2024-03-18T08:14:11Z ERROR upstream notification-service: 503"
|
|
|
|
| 845 |
},
|
| 846 |
"tool_responses": {
|
| 847 |
"query_logs": {
|
| 848 |
+
# RP-001: query_logs response for postgres-db
|
| 849 |
+
"postgres-db": (
|
| 850 |
+
"FATAL: terminated by kernel OOM killer — "
|
| 851 |
+
"query from client 10.0.5.47 running 12min consuming "
|
| 852 |
+
"31.8GB of 32GB available memory"
|
| 853 |
+
),
|
| 854 |
"analytics-service": (
|
| 855 |
"INFO: starting job full_history_export\n"
|
| 856 |
"WARN: query plan: 847M rows, cross-table JOIN, no LIMIT\n"
|
|
|
|
| 1107 |
"tool_responses": {
|
| 1108 |
"query_logs": {
|
| 1109 |
"user-service": (
|
| 1110 |
+
"FATAL: password authentication failed for user 'app_user'\n"
|
| 1111 |
+
"ERROR: DB credentials rejected\n"
|
| 1112 |
+
"WARN: credentials last refreshed at 08:12:00Z"
|
| 1113 |
+
),
|
| 1114 |
+
|
| 1115 |
+
"notification-service": (
|
| 1116 |
+
"FATAL: password authentication failed\n"
|
| 1117 |
+
"WARN: credentials last refreshed at 08:12:00Z — "
|
| 1118 |
+
"authentication rejected by postgres-db"
|
| 1119 |
+
),
|
| 1120 |
"api-gateway": (
|
| 1121 |
"ERROR: upstream user-service 503\n"
|
| 1122 |
"ERROR: upstream notification-service 503"
|
uv.lock
CHANGED
|
@@ -638,7 +638,7 @@ wheels = [
|
|
| 638 |
|
| 639 |
[[package]]
|
| 640 |
name = "fastapi"
|
| 641 |
-
version = "0.135.
|
| 642 |
source = { registry = "https://pypi.org/simple" }
|
| 643 |
dependencies = [
|
| 644 |
{ name = "annotated-doc" },
|
|
@@ -647,9 +647,9 @@ dependencies = [
|
|
| 647 |
{ name = "typing-extensions" },
|
| 648 |
{ name = "typing-inspection" },
|
| 649 |
]
|
| 650 |
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
| 651 |
wheels = [
|
| 652 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 653 |
]
|
| 654 |
|
| 655 |
[[package]]
|
|
|
|
| 638 |
|
| 639 |
[[package]]
|
| 640 |
name = "fastapi"
|
| 641 |
+
version = "0.135.3"
|
| 642 |
source = { registry = "https://pypi.org/simple" }
|
| 643 |
dependencies = [
|
| 644 |
{ name = "annotated-doc" },
|
|
|
|
| 647 |
{ name = "typing-extensions" },
|
| 648 |
{ name = "typing-inspection" },
|
| 649 |
]
|
| 650 |
+
sdist = { url = "https://files.pythonhosted.org/packages/f7/e6/7adb4c5fa231e82c35b8f5741a9f2d055f520c29af5546fd70d3e8e1cd2e/fastapi-0.135.3.tar.gz", hash = "sha256:bd6d7caf1a2bdd8d676843cdcd2287729572a1ef524fc4d65c17ae002a1be654", size = 396524, upload-time = "2026-04-01T16:23:58.188Z" }
|
| 651 |
wheels = [
|
| 652 |
+
{ url = "https://files.pythonhosted.org/packages/84/a4/5caa2de7f917a04ada20018eccf60d6cc6145b0199d55ca3711b0fc08312/fastapi-0.135.3-py3-none-any.whl", hash = "sha256:9b0f590c813acd13d0ab43dd8494138eb58e484bfac405db1f3187cfc5810d98", size = 117734, upload-time = "2026-04-01T16:23:59.328Z" },
|
| 653 |
]
|
| 654 |
|
| 655 |
[[package]]
|