Spaces:
Running
Running
| """ | |
| Task 2 — Cascading Failure (Medium) | |
| Scenario: user-db develops a slow query that exhausts the auth-service connection pool, | |
| which then causes the api-gateway to return timeouts to all users. | |
| Surface logs show gateway errors most loudly (symptom), but root cause is hidden (user-db). | |
| Agent must trace backward through the cascade chain — NOT treat symptoms as root cause. | |
| Ground truth: | |
| - severity: P1 | |
| - root_cause: user-db | |
| - remediation: kill-query:user-db OR restart:user-db | |
| - correct_teams: dba-team, sre-team | |
| - noise_ratio: 30% | |
| """ | |
| from __future__ import annotations | |
| import random | |
| from datetime import datetime | |
| from server.models import LogLine, ServiceStatus | |
| from server.log_generator import ( | |
| generate_log_batch, | |
| generate_healthy_system_state, | |
| _make_timestamp, | |
| ) | |
# ─── GROUND TRUTH ────────────────────────────────────────────────────────────
# Expected answers and episode parameters for the cascading-failure scenario.
GROUND_TRUTH = {
    "severity": "P1",
    "root_cause": "user-db",
    # Accepted remediation verbs; the module docstring spells a remediation as
    # "<prefix>:<service>", e.g. "kill-query:user-db".
    "remediation_prefixes": {"kill-query", "restart"},
    "remediation_service": "user-db",
    "correct_teams": {"dba-team", "sre-team"},
    # Equals the number of entries in STEP_SIGNALS (steps 0-11); presumably the
    # episode step budget — confirm against the consumer of this key.
    "max_steps": 12,
    # Forwarded to generate_log_batch() as its noise_ratio argument (30%).
    "noise_ratio": 0.30,
}
# ─── STEP-BY-STEP SIGNAL PLAN ────────────────────────────────────────────────
# Cascade chain: user-db slow query → auth-service pool exhausted → api-gateway timeouts
# Steps 0-1:  Gateway errors surface (symptoms only — most visible)
# Steps 2-3:  Auth-service DB pressure becomes visible
# Steps 4-5:  user-db slow queries exposed; circuit breaker opens
# Steps 6-7:  Full cascade — all 3 services degraded/down
# Steps 8-11: Escalating alerts; root cause becomes unmistakable
#
# One entry per step (index == step). Each entry is a list of
# (service, level, message) tuples handed to generate_log_batch() as the
# scenario signals for that step.
STEP_SIGNALS: list[list[tuple[str, str, str]]] = [
    # Step 0: Gateway errors are the first to appear (surface symptom)
    [
        ("api-gateway", "ERROR", "upstream timeout from auth-service: 5002ms"),
        ("api-gateway", "WARN", "error rate: 8.3% on /auth/* routes"),
    ],
    # Step 1: More gateway errors; first hints of auth-service pressure
    [
        ("api-gateway", "ERROR", "upstream timeout from auth-service: 30007ms"),
        ("api-gateway", "WARN", "error rate: 15.7% — auth-service latency climbing"),
    ],
    # Step 2: Auth-service connection pool pressure visible
    [
        ("auth-service", "WARN", "db connection pool at 42/50 — pressure building"),
        ("api-gateway", "ERROR", "upstream timeout from auth-service: 30005ms"),
        (
            "auth-service",
            "ERROR",
            "db query timeout: SELECT session WHERE user_id=? [5001ms]",
        ),
    ],
    # Step 3: Auth-service pool exhausted; requests start queueing
    [
        (
            "auth-service",
            "ERROR",
            "db connection pool EXHAUSTED (50/50) — blocking new requests",
        ),
        ("api-gateway", "ERROR", "auth-service unavailable: connection pool full"),
        ("auth-service", "WARN", "request queue depth: 127 — approaching overflow"),
    ],
    # Step 4: user-db slow query finally exposed
    [
        (
            "user-db",
            "WARN",
            "slow query detected: SELECT * FROM sessions WHERE user_id=? [2847ms]",
        ),
        ("auth-service", "ERROR", "db connection timeout after 5000ms — query hanging"),
        ("user-db", "ERROR", "lock wait timeout: session table — blocking reads"),
    ],
    # Step 5: auth-service opens its circuit breaker for user-db
    [
        ("user-db", "WARN", "slow query: 4500ms — circuit breaker approaching threshold"),
        (
            "auth-service",
            "ERROR",
            "circuit breaker OPEN for user-db: latency exceeded 5000ms",
        ),
        ("api-gateway", "ERROR", "all /auth/* requests failing — upstream unavailable"),
    ],
    # Step 6: Full cascade — all 3 services degraded
    [
        ("api-gateway", "ERROR", "error rate: 67.4% — multiple upstreams timing out"),
        ("auth-service", "ERROR", "health check FAILED: cannot reach user-db"),
        ("user-db", "ERROR", "connection pool saturated: 95/100 connections in use"),
    ],
    # Step 7: api-gateway now fully symptomatic
    [
        ("api-gateway", "FATAL", "SLA breach: /auth endpoint availability < 95%"),
        ("auth-service", "ERROR", "auth-service DOWN: 3/3 health checks failed"),
        ("user-db", "WARN", "slow query count: 847 in last 60s — severe degradation"),
    ],
    # Step 8: Database fully exposed as root cause
    [
        (
            "user-db",
            "ERROR",
            "CRITICAL: user-db query latency 8000ms+ — active sessions timing out",
        ),
        ("auth-service", "ERROR", "rejected: user-db connection pool exhausted"),
        ("api-gateway", "ERROR", "user-auth endpoint returning 503 — cascade failure"),
    ],
    # Step 9: Escalating
    [
        (
            "user-db",
            "FATAL",
            "user-db DOWN: connection pool 100/100 — no connections available",
        ),
        (
            "api-gateway",
            "ERROR",
            "error rate: 89.2% — auth-service and user-db both unreachable",
        ),
    ],
    # Step 10: Critical
    [
        (
            "api-gateway",
            "FATAL",
            "CRITICAL: auth-service DOWN for 90s — 100% of login attempts failing",
        ),
        (
            "user-db",
            "ERROR",
            "lock contention: session table fully locked — queries timing out",
        ),
    ],
    # Step 11: Maximum severity
    [
        ("user-db", "FATAL", "user-db unresponsive for 180s — database crisis"),
        (
            "api-gateway",
            "FATAL",
            "SLA_BREACH: auth availability 0% — complete user-auth outage",
        ),
    ],
]
def get_system_state(step: int, base_time: datetime) -> dict[str, ServiceStatus]:
    """Return the per-service health snapshot for ``step``.

    Starts from ``generate_healthy_system_state(base_time)`` and overwrites the
    entries of the services that have visibly degraded by this step. The
    failure propagates user-db → auth-service → api-gateway, but it becomes
    visible in the opposite order: only the gateway is flagged at first, and
    user-db is flagged last.

    Args:
        step: Episode step index. Any value <= 1 (negatives included) selects
            the first stage; any value >= 8 selects the final stage.
        base_time: Episode start time, forwarded to the helper functions.

    Returns:
        Mapping of service name to ``ServiceStatus``. Services not overwritten
        at this step keep whatever the healthy baseline reported.
    """
    # All overwritten entries share one timestamp: base_time offset by 30 per
    # step (presumably seconds — confirm against _make_timestamp).
    now = _make_timestamp(base_time, step * 30)
    state = generate_healthy_system_state(base_time)

    def mark(name: str, status: str, error_rate: float, latency_p99_ms: int) -> None:
        """Overwrite one service's entry in ``state`` with the given health figures."""
        state[name] = ServiceStatus(
            name=name,
            status=status,
            error_rate=error_rate,
            latency_p99_ms=latency_p99_ms,
            last_updated=now,
        )

    # Escalating degradation based on step
    if step <= 1:
        # Gateway just starting to see issues
        mark("api-gateway", "degraded", 0.083, 2500)
    elif step <= 3:
        # Auth-service pool pressure
        mark("api-gateway", "degraded", 0.157, 5000)
        mark("auth-service", "degraded", 0.15, 5000)
    elif step <= 5:
        # user-db slow queries exposed
        mark("api-gateway", "degraded", 0.45, 8000)
        mark("auth-service", "down", 0.85, 10000)
        mark("user-db", "degraded", 0.30, 4500)
    elif step <= 7:
        # Full cascade
        mark("api-gateway", "down", 0.89, 10000)
        mark("auth-service", "down", 0.95, 10000)
        mark("user-db", "down", 0.50, 8000)
    else:
        # Maximum severity
        mark("api-gateway", "down", 0.99, 10000)
        mark("auth-service", "down", 1.0, 10000)
        mark("user-db", "down", 0.75, 10000)
    return state
def get_step_data(
    step: int, base_time: datetime, rng: random.Random
) -> tuple[list[LogLine], dict[str, ServiceStatus]]:
    """Return ``(logs, system_state)`` for the given step.

    The scenario signals come from ``STEP_SIGNALS`` and escalate with the step
    index, so the incident gets louder the longer the agent has not acted.

    Args:
        step: Episode step index.
        base_time: Episode start time, forwarded to the log generator and to
            ``get_system_state``.
        rng: Random source handed to ``generate_log_batch``.

    Returns:
        A tuple of the log batch produced by ``generate_log_batch`` and the
        service-state mapping produced by ``get_system_state``.
    """
    # Clamp into [0, last]: steps beyond the plan keep replaying the final
    # (loudest) entry, and a negative step falls back to entry 0 instead of
    # silently indexing from the end of the list.
    signal_idx = max(0, min(step, len(STEP_SIGNALS) - 1))
    signals = STEP_SIGNALS[signal_idx]
    logs = generate_log_batch(
        scenario_signals=signals,
        step=step,
        base_time=base_time,
        noise_ratio=GROUND_TRUTH["noise_ratio"],
        batch_size=10,
        rng=rng,
    )
    system_state = get_system_state(step, base_time)
    return logs, system_state
def get_active_alerts(step: int) -> list[str]:
    """Return the alerts that are active at ``step``.

    Alerts are cumulative: once an alert's first step is reached it stays in
    the list for every later step. The result is ordered by the step at which
    each alert first fires.

    Args:
        step: Episode step index. A negative value yields an empty list.

    Returns:
        The alert messages whose first step is <= ``step``.
    """
    # (first step at which the alert fires, alert text), in firing order.
    alert_schedule = (
        (0, "api-gateway: elevated error rate on /auth/* routes"),
        (2, "auth-service: db connection pool pressure"),
        (4, "user-db: slow queries detected — latency 2000ms+"),
        (5, "auth-service: circuit breaker OPEN for user-db"),
        (6, "SLA_BREACH: /auth availability < 90%"),
        (8, "CRITICAL: user-db connection pool saturated"),
        (10, "CRITICAL: full auth cascade failure — P1 incident"),
    )
    return [message for first_step, message in alert_schedule if step >= first_step]