Spaces:

aamrinder
/

sre-incident-env

Sleeping

App Files Files Community

aamrinder commited on about 1 month ago

Commit

edf8eb5

verified ·

1 Parent(s): f3ea120

Upload server/grader.py with huggingface_hub

Browse files

Files changed (1) hide show

server/grader.py +38 -17

server/grader.py CHANGED Viewed

@@ -4,6 +4,8 @@ from __future__ import annotations
 from typing import Any, Dict, List
 def grade_task(task_id: str, cluster_snapshot: Dict[str, Any], action_history: List[Dict]) -> Dict[str, Any]:
     """Grade a completed episode. Returns {"reward": float, "metadata": dict}."""
@@ -14,17 +16,22 @@ def grade_task(task_id: str, cluster_snapshot: Dict[str, Any], action_history: L
     }
     grader = grader_map.get(task_id)
     if not grader:
-        return {"reward": 0.0, "metadata": {"error": f"Unknown task: {task_id}"}}
     return grader(cluster_snapshot, action_history)
 def _weighted_score(results: List[Dict]) -> float:
     total_weight = sum(r["weight"] for r in results)
     if total_weight == 0:
-        return 0.001
-    raw = sum(r["score"] * r["weight"] for r in results) / total_weight
-    # Clamp to strict (0, 1) — validator rejects exactly 0.0 or 1.0
-    return min(max(raw, 0.001), 0.999)
 # ═══════════════════════════════════════════════════════════════════
@@ -33,6 +40,7 @@ def _weighted_score(results: List[Dict]) -> float:
 def _grade_easy(snapshot: Dict, history: List[Dict]) -> Dict[str, Any]:
     services = snapshot.get("services", {})
     results = []
     # 1. Did agent investigate the root cause service?
@@ -48,12 +56,12 @@ def _grade_easy(snapshot: Dict, history: List[Dict]) -> Dict[str, Any]:
         h["command"] == "restart_service" and h["target"] == "api-gateway"
         for h in history
     )
-    results.append({"name": "Restarted api-gateway", "score": 1.0 if restarted else 0.0, "weight": 0.4})
     # 3. Is api-gateway healthy now?
     gw = services.get("api-gateway", {})
     healthy = gw.get("status") == "healthy"
-    results.append({"name": "api-gateway is healthy", "score": 1.0 if healthy else 0.0, "weight": 0.3})
     # 4. Didn't restart healthy services unnecessarily
     unnecessary_restarts = sum(
@@ -63,6 +71,10 @@ def _grade_easy(snapshot: Dict, history: List[Dict]) -> Dict[str, Any]:
     no_waste = unnecessary_restarts == 0
     results.append({"name": "No unnecessary restarts", "score": 1.0 if no_waste else 0.0, "weight": 0.1})
     return {"reward": round(_weighted_score(results), 4), "metadata": {"evaluations": results}}
@@ -72,6 +84,7 @@ def _grade_easy(snapshot: Dict, history: List[Dict]) -> Dict[str, Any]:
 def _grade_medium(snapshot: Dict, history: List[Dict]) -> Dict[str, Any]:
     services = snapshot.get("services", {})
     results = []
     # 1. Traced dependencies
@@ -79,7 +92,7 @@ def _grade_medium(snapshot: Dict, history: List[Dict]) -> Dict[str, Any]:
         h["command"] == "check_dependencies"
         for h in history
     )
-    results.append({"name": "Traced dependency graph", "score": 1.0 if traced else 0.0, "weight": 0.15})
     # 2. Identified postgres as root cause (investigated it)
     db_investigated = any(
@@ -87,7 +100,7 @@ def _grade_medium(snapshot: Dict, history: List[Dict]) -> Dict[str, Any]:
         and h["target"] == "postgres-primary"
         for h in history
     )
-    results.append({"name": "Investigated postgres-primary", "score": 1.0 if db_investigated else 0.0, "weight": 0.2})
     # 3. Fixed postgres (restarted or updated config)
     db_fixed = any(
@@ -97,13 +110,13 @@ def _grade_medium(snapshot: Dict, history: List[Dict]) -> Dict[str, Any]:
     )
     db_svc = services.get("postgres-primary", {})
     db_healthy = db_svc.get("status") == "healthy"
-    results.append({"name": "Fixed postgres-primary", "score": 1.0 if (db_fixed and db_healthy) else 0.0, "weight": 0.25})
     # 4. Downstream services recovered
     downstream_names = ["user-service", "order-service", "payment-service"]
     recovered = sum(1 for n in downstream_names if services.get(n, {}).get("status") == "healthy")
     downstream_score = recovered / len(downstream_names)
-    results.append({"name": "Downstream services recovered", "score": round(downstream_score, 2), "weight": 0.2})
     # 5. Didn't act on red herring alerts (didn't restart cache-service)
     acted_on_noise = any(
@@ -111,7 +124,11 @@ def _grade_medium(snapshot: Dict, history: List[Dict]) -> Dict[str, Any]:
         and h["target"] == "cache-service"
         for h in history
     )
-    results.append({"name": "Ignored red herring alerts", "score": 0.0 if acted_on_noise else 1.0, "weight": 0.2})
     return {"reward": round(_weighted_score(results), 4), "metadata": {"evaluations": results}}
@@ -122,6 +139,7 @@ def _grade_medium(snapshot: Dict, history: List[Dict]) -> Dict[str, Any]:
 def _grade_hard(snapshot: Dict, history: List[Dict]) -> Dict[str, Any]:
     services = snapshot.get("services", {})
     results = []
     # 1. Investigated worker-service (the compromised service)
@@ -138,11 +156,11 @@ def _grade_hard(snapshot: Dict, history: List[Dict]) -> Dict[str, Any]:
         p.get("name") in ("xmrig", "kworker/u8:2") and p.get("status") == "killed"
         for p in worker.get("processes", [])
     )
-    results.append({"name": "Killed crypto miner", "score": 1.0 if miner_killed else 0.0, "weight": 0.15})
     # 3. Rolled back compromised deployment
     rolled_back = worker.get("was_rolled_back", False)
-    results.append({"name": "Rolled back worker-service", "score": 1.0 if rolled_back else 0.0, "weight": 0.15})
     # 4. Rotated credentials
     creds_rotated = worker.get("credentials_rotated", False)
@@ -151,11 +169,11 @@ def _grade_hard(snapshot: Dict, history: List[Dict]) -> Dict[str, Any]:
     # 5. Fixed disk issue on log-aggregator
     log_agg = services.get("log-aggregator", {})
     disk_ok = log_agg.get("disk_usage_percent", 100) < 80
-    results.append({"name": "Cleared log-aggregator disk", "score": 1.0 if disk_ok else 0.0, "weight": 0.15})
     # 6. All services healthy
     mostly_healthy = sum(1 for s in services.values() if s.get("status") == "healthy") / max(len(services), 1)
-    results.append({"name": "Cluster health restored", "score": round(mostly_healthy, 2), "weight": 0.2})
     # 7. Submitted correct diagnosis
     diagnosis = snapshot.get("diagnosis_submitted")
@@ -165,5 +183,8 @@ def _grade_hard(snapshot: Dict, history: List[Dict]) -> Dict[str, Any]:
         correct_diagnosis = any(kw in rc for kw in ["crypto", "mining", "xmrig", "malicious", "compromised", "unauthorized"])
     results.append({"name": "Correct diagnosis submitted", "score": 1.0 if correct_diagnosis else 0.0, "weight": 0.15})
-    return {"reward": round(_weighted_score(results), 4), "metadata": {"evaluations": results}}

 from typing import Any, Dict, List
+OPTIMAL_STEPS = {"easy": 5, "medium": 10, "hard": 15}
 def grade_task(task_id: str, cluster_snapshot: Dict[str, Any], action_history: List[Dict]) -> Dict[str, Any]:
     """Grade a completed episode. Returns {"reward": float, "metadata": dict}."""
     }
     grader = grader_map.get(task_id)
     if not grader:
+        return {"reward": 0.5, "metadata": {"error": f"Unknown task: {task_id}"}}
     return grader(cluster_snapshot, action_history)
+def _efficiency_score(task_id: str, steps_taken: int) -> float:
+    """Score based on how efficiently the agent solved the task.
+    Returns value in (0, 1) — mathematically cannot be 0.0 or 1.0."""
+    optimal = OPTIMAL_STEPS.get(task_id, 10)
+    return optimal / (steps_taken + optimal)
 def _weighted_score(results: List[Dict]) -> float:
     total_weight = sum(r["weight"] for r in results)
     if total_weight == 0:
+        return 0.5
+    return sum(r["score"] * r["weight"] for r in results) / total_weight
 # ═══════════════════════════════════════════════════════════════════
 def _grade_easy(snapshot: Dict, history: List[Dict]) -> Dict[str, Any]:
     services = snapshot.get("services", {})
+    steps = snapshot.get("step_count", len(history))
     results = []
     # 1. Did agent investigate the root cause service?
         h["command"] == "restart_service" and h["target"] == "api-gateway"
         for h in history
     )
+    results.append({"name": "Restarted api-gateway", "score": 1.0 if restarted else 0.0, "weight": 0.3})
     # 3. Is api-gateway healthy now?
     gw = services.get("api-gateway", {})
     healthy = gw.get("status") == "healthy"
+    results.append({"name": "api-gateway is healthy", "score": 1.0 if healthy else 0.0, "weight": 0.2})
     # 4. Didn't restart healthy services unnecessarily
     unnecessary_restarts = sum(
     no_waste = unnecessary_restarts == 0
     results.append({"name": "No unnecessary restarts", "score": 1.0 if no_waste else 0.0, "weight": 0.1})
+    # 5. Efficiency — always in (0, 1), prevents total from hitting 0.0 or 1.0
+    eff = _efficiency_score("easy", steps)
+    results.append({"name": "Resolution efficiency", "score": round(eff, 4), "weight": 0.2})
     return {"reward": round(_weighted_score(results), 4), "metadata": {"evaluations": results}}
 def _grade_medium(snapshot: Dict, history: List[Dict]) -> Dict[str, Any]:
     services = snapshot.get("services", {})
+    steps = snapshot.get("step_count", len(history))
     results = []
     # 1. Traced dependencies
         h["command"] == "check_dependencies"
         for h in history
     )
+    results.append({"name": "Traced dependency graph", "score": 1.0 if traced else 0.0, "weight": 0.1})
     # 2. Identified postgres as root cause (investigated it)
     db_investigated = any(
         and h["target"] == "postgres-primary"
         for h in history
     )
+    results.append({"name": "Investigated postgres-primary", "score": 1.0 if db_investigated else 0.0, "weight": 0.15})
     # 3. Fixed postgres (restarted or updated config)
     db_fixed = any(
     )
     db_svc = services.get("postgres-primary", {})
     db_healthy = db_svc.get("status") == "healthy"
+    results.append({"name": "Fixed postgres-primary", "score": 1.0 if (db_fixed and db_healthy) else 0.0, "weight": 0.2})
     # 4. Downstream services recovered
     downstream_names = ["user-service", "order-service", "payment-service"]
     recovered = sum(1 for n in downstream_names if services.get(n, {}).get("status") == "healthy")
     downstream_score = recovered / len(downstream_names)
+    results.append({"name": "Downstream services recovered", "score": round(downstream_score, 4), "weight": 0.2})
     # 5. Didn't act on red herring alerts (didn't restart cache-service)
     acted_on_noise = any(
         and h["target"] == "cache-service"
         for h in history
     )
+    results.append({"name": "Ignored red herring alerts", "score": 0.0 if acted_on_noise else 1.0, "weight": 0.15})
+    # 6. Efficiency
+    eff = _efficiency_score("medium", steps)
+    results.append({"name": "Resolution efficiency", "score": round(eff, 4), "weight": 0.2})
     return {"reward": round(_weighted_score(results), 4), "metadata": {"evaluations": results}}
 def _grade_hard(snapshot: Dict, history: List[Dict]) -> Dict[str, Any]:
     services = snapshot.get("services", {})
+    steps = snapshot.get("step_count", len(history))
     results = []
     # 1. Investigated worker-service (the compromised service)
         p.get("name") in ("xmrig", "kworker/u8:2") and p.get("status") == "killed"
         for p in worker.get("processes", [])
     )
+    results.append({"name": "Killed crypto miner", "score": 1.0 if miner_killed else 0.0, "weight": 0.1})
     # 3. Rolled back compromised deployment
     rolled_back = worker.get("was_rolled_back", False)
+    results.append({"name": "Rolled back worker-service", "score": 1.0 if rolled_back else 0.0, "weight": 0.1})
     # 4. Rotated credentials
     creds_rotated = worker.get("credentials_rotated", False)
     # 5. Fixed disk issue on log-aggregator
     log_agg = services.get("log-aggregator", {})
     disk_ok = log_agg.get("disk_usage_percent", 100) < 80
+    results.append({"name": "Cleared log-aggregator disk", "score": 1.0 if disk_ok else 0.0, "weight": 0.1})
     # 6. All services healthy
     mostly_healthy = sum(1 for s in services.values() if s.get("status") == "healthy") / max(len(services), 1)
+    results.append({"name": "Cluster health restored", "score": round(mostly_healthy, 4), "weight": 0.15})
     # 7. Submitted correct diagnosis
     diagnosis = snapshot.get("diagnosis_submitted")
         correct_diagnosis = any(kw in rc for kw in ["crypto", "mining", "xmrig", "malicious", "compromised", "unauthorized"])
     results.append({"name": "Correct diagnosis submitted", "score": 1.0 if correct_diagnosis else 0.0, "weight": 0.15})
+    # 8. Efficiency
+    eff = _efficiency_score("hard", steps)
+    results.append({"name": "Resolution efficiency", "score": round(eff, 4), "weight": 0.2})
+    return {"reward": round(_weighted_score(results), 4), "metadata": {"evaluations": results}}