Spaces:

ZeroTsai0308
/

sre-agent

Sleeping

App Files Files Community

ZeroTsai0308 committed 15 days ago

Commit

60d4ff4

verified ·

1 Parent(s): 3691744

Add sre_agent/tools/infrastructure_tools.py

Browse files

Files changed (1) hide show

sre_agent/tools/infrastructure_tools.py +250 -0

sre_agent/tools/infrastructure_tools.py ADDED Viewed

	@@ -0,0 +1,250 @@

+"""
+Infrastructure Tools for SRE Agent
+Resource utilization monitoring and service health checking.
+"""
+import json
+import random
+from datetime import datetime, timedelta, timezone
+from smolagents import Tool
+class ResourceUtilizationTool(Tool):
+    """Check current resource utilization for services/hosts."""
+    name = "resource_utilization"
+    description = """Checks current resource utilization for a service or infrastructure component.
+    Returns real-time (simulated) metrics:
+    - CPU utilization per pod/instance
+    - Memory usage per pod/instance
+    - Disk I/O and usage
+    - Network throughput
+    - Pod/container status (running, pending, crash-looping)
+    Use this to get a quick snapshot of resource health for a service.
+    """
+    inputs = {
+        "service_name": {
+            "type": "string",
+            "description": "Service or component to check, e.g. 'payment-service', 'database-primary'.",
+        },
+        "resource_type": {
+            "type": "string",
+            "description": "Specific resource to check: 'all', 'cpu', 'memory', 'disk', 'network'. Default: 'all'.",
+            "nullable": True,
+        },
+    }
+    output_type = "string"
+    def forward(self, service_name: str, resource_type: str = "all") -> str:
+        print(f"[ResourceUtilization] Checking {resource_type} for '{service_name}'")
+        # Simulate pod-level resource data
+        pod_profiles = {
+            "payment-service": {"replicas": 5, "cpu_base": 75, "mem_base": 70, "status_mix": ["Running"] * 4 + ["CrashLoopBackOff"]},
+            "api-gateway": {"replicas": 3, "cpu_base": 45, "mem_base": 55, "status_mix": ["Running"] * 3},
+            "order-service": {"replicas": 3, "cpu_base": 35, "mem_base": 50, "status_mix": ["Running"] * 3},
+            "user-service": {"replicas": 2, "cpu_base": 20, "mem_base": 40, "status_mix": ["Running"] * 2},
+            "database-primary": {"replicas": 1, "cpu_base": 60, "mem_base": 85, "status_mix": ["Running"]},
+            "cache-redis": {"replicas": 3, "cpu_base": 15, "mem_base": 30, "status_mix": ["Running"] * 3},
+        }
+        profile = pod_profiles.get(service_name, {"replicas": 2, "cpu_base": 30, "mem_base": 40, "status_mix": ["Running"] * 2})
+        pods = []
+        for i in range(profile["replicas"]):
+            cpu = profile["cpu_base"] + random.uniform(-10, 20)
+            mem = profile["mem_base"] + random.uniform(-5, 15)
+            status = profile["status_mix"][i] if i < len(profile["status_mix"]) else "Running"
+            restarts = random.randint(5, 20) if status == "CrashLoopBackOff" else 0
+            pod = {
+                "name": f"{service_name}-{random.randint(1000, 9999)}-{''.join(random.choices('abcdef0123456789', k=5))}",
+                "status": status,
+                "restarts": restarts,
+                "age": f"{random.randint(1, 30)}d",
+            }
+            if resource_type in ("all", "cpu"):
+                pod["cpu"] = {
+                    "usage_pct": round(min(cpu, 100), 1),
+                    "request": "500m",
+                    "limit": "2000m",
+                    "throttled": cpu > 90,
+                }
+            if resource_type in ("all", "memory"):
+                pod["memory"] = {
+                    "usage_pct": round(min(mem, 100), 1),
+                    "usage_mb": round(mem * 40.96, 0),  # Assuming 4GB limit
+                    "request": "1Gi",
+                    "limit": "4Gi",
+                    "oom_risk": mem > 85,
+                }
+            if resource_type in ("all", "disk"):
+                pod["disk"] = {
+                    "usage_pct": round(random.uniform(20, 70), 1),
+                    "io_read_mbps": round(random.uniform(1, 50), 1),
+                    "io_write_mbps": round(random.uniform(1, 30), 1),
+                }
+            if resource_type in ("all", "network"):
+                pod["network"] = {
+                    "rx_mbps": round(random.uniform(10, 200), 1),
+                    "tx_mbps": round(random.uniform(5, 100), 1),
+                    "connections_active": random.randint(50, 500),
+                    "connections_max": 1000,
+                }
+            pods.append(pod)
+        # Aggregate stats
+        cpu_values = [p["cpu"]["usage_pct"] for p in pods if "cpu" in p]
+        mem_values = [p["memory"]["usage_pct"] for p in pods if "memory" in p]
+        unhealthy = [p for p in pods if p["status"] != "Running"]
+        result = {
+            "service": service_name,
+            "timestamp": datetime.utcnow().isoformat() + "Z",
+            "total_pods": len(pods),
+            "healthy_pods": len(pods) - len(unhealthy),
+            "unhealthy_pods": [{"name": p["name"], "status": p["status"], "restarts": p["restarts"]} for p in unhealthy],
+            "aggregate": {},
+            "pods": pods,
+            "alerts": [],
+        }
+        if cpu_values:
+            result["aggregate"]["cpu"] = {
+                "avg_pct": round(sum(cpu_values) / len(cpu_values), 1),
+                "max_pct": round(max(cpu_values), 1),
+                "throttled_pods": sum(1 for p in pods if p.get("cpu", {}).get("throttled")),
+            }
+        if mem_values:
+            result["aggregate"]["memory"] = {
+                "avg_pct": round(sum(mem_values) / len(mem_values), 1),
+                "max_pct": round(max(mem_values), 1),
+                "oom_risk_pods": sum(1 for p in pods if p.get("memory", {}).get("oom_risk")),
+            }
+        # Generate alerts
+        if unhealthy:
+            result["alerts"].append({"severity": "critical", "message": f"{len(unhealthy)} pod(s) unhealthy: {', '.join(p['name'] for p in unhealthy)}"})
+        if cpu_values and max(cpu_values) > 85:
+            result["alerts"].append({"severity": "warning", "message": f"High CPU detected: max {max(cpu_values):.1f}%"})
+        if mem_values and max(mem_values) > 85:
+            result["alerts"].append({"severity": "warning", "message": f"High memory detected: max {max(mem_values):.1f}%"})
+        print(f"[ResourceUtilization] {service_name}: {len(pods)} pods, {len(unhealthy)} unhealthy, {len(result['alerts'])} alerts")
+        return json.dumps(result, indent=2)
+class ServiceHealthCheckerTool(Tool):
+    """Quick health check for a service and its dependencies."""
+    name = "service_health_checker"
+    description = """Performs a comprehensive health check for a service.
+    Checks:
+    - Service endpoint health (/healthz, /readyz)
+    - Dependency connectivity (databases, caches, queues)
+    - Recent error rate and latency
+    - Deployment status
+    - Certificate validity
+    Returns a traffic-light health status (green/yellow/red) with details.
+    Use this for a quick overview of service health before deep investigation.
+    """
+    inputs = {
+        "service_name": {
+            "type": "string",
+            "description": "Service to check health for, e.g. 'payment-service'.",
+        },
+    }
+    output_type = "string"
+    def forward(self, service_name: str) -> str:
+        print(f"[HealthChecker] Running health check for '{service_name}'")
+        # Simulated health checks
+        health_profiles = {
+            "payment-service": {
+                "endpoint_health": {"status": "degraded", "response_time_ms": 2500, "last_check": "10s ago"},
+                "dependencies": [
+                    {"name": "payment-db", "status": "unhealthy", "latency_ms": 30000, "error": "Connection pool exhausted"},
+                    {"name": "payment-gateway-ext", "status": "healthy", "latency_ms": 150},
+                    {"name": "fraud-detection", "status": "healthy", "latency_ms": 45},
+                    {"name": "cache-redis", "status": "healthy", "latency_ms": 2},
+                ],
+                "recent_metrics": {"error_rate": 12.5, "p99_latency_ms": 2500, "rps": 450},
+                "deployment": {"version": "v2.5.0", "deployed_at": "2024-01-15T10:00:00Z", "rollback_available": "v2.4.1"},
+                "cert_expiry_days": 45,
+            },
+            "api-gateway": {
+                "endpoint_health": {"status": "healthy", "response_time_ms": 5, "last_check": "10s ago"},
+                "dependencies": [
+                    {"name": "payment-service", "status": "degraded", "latency_ms": 2500, "error": "High latency"},
+                    {"name": "user-service", "status": "healthy", "latency_ms": 15},
+                    {"name": "order-service", "status": "healthy", "latency_ms": 20},
+                    {"name": "auth-service", "status": "healthy", "latency_ms": 10},
+                ],
+                "recent_metrics": {"error_rate": 3.2, "p99_latency_ms": 180, "rps": 2000},
+                "deployment": {"version": "v3.1.0", "deployed_at": "2024-01-10T14:00:00Z", "rollback_available": "v3.0.9"},
+                "cert_expiry_days": 30,
+            },
+        }
+        profile = health_profiles.get(service_name, {
+            "endpoint_health": {"status": "healthy", "response_time_ms": random.randint(5, 50), "last_check": "10s ago"},
+            "dependencies": [
+                {"name": "database", "status": "healthy", "latency_ms": random.randint(1, 20)},
+                {"name": "cache", "status": "healthy", "latency_ms": random.randint(1, 5)},
+            ],
+            "recent_metrics": {"error_rate": round(random.uniform(0, 0.5), 2), "p99_latency_ms": random.randint(10, 100), "rps": random.randint(100, 1000)},
+            "deployment": {"version": "v1.0.0", "deployed_at": "2024-01-01T00:00:00Z", "rollback_available": "v0.9.9"},
+            "cert_expiry_days": random.randint(30, 365),
+        })
+        # Compute overall health
+        unhealthy_deps = [d for d in profile["dependencies"] if d["status"] != "healthy"]
+        error_rate = profile["recent_metrics"]["error_rate"]
+        endpoint_status = profile["endpoint_health"]["status"]
+        if endpoint_status == "unhealthy" or error_rate > 10 or len(unhealthy_deps) > 1:
+            overall = "RED"
+        elif endpoint_status == "degraded" or error_rate > 1 or len(unhealthy_deps) > 0:
+            overall = "YELLOW"
+        else:
+            overall = "GREEN"
+        result = {
+            "service": service_name,
+            "overall_health": overall,
+            "timestamp": datetime.utcnow().isoformat() + "Z",
+            "endpoint_health": profile["endpoint_health"],
+            "dependency_health": {
+                "total": len(profile["dependencies"]),
+                "healthy": len(profile["dependencies"]) - len(unhealthy_deps),
+                "unhealthy": len(unhealthy_deps),
+                "details": profile["dependencies"],
+            },
+            "recent_metrics": profile["recent_metrics"],
+            "deployment": profile["deployment"],
+            "cert_status": {
+                "days_until_expiry": profile["cert_expiry_days"],
+                "status": "OK" if profile["cert_expiry_days"] > 14 else "EXPIRING_SOON" if profile["cert_expiry_days"] > 0 else "EXPIRED",
+            },
+            "recommendations": [],
+        }
+        if overall == "RED":
+            result["recommendations"].append("URGENT: Service is in critical state — investigate immediately")
+        if unhealthy_deps:
+            for dep in unhealthy_deps:
+                result["recommendations"].append(f"Investigate dependency '{dep['name']}': {dep.get('error', 'unknown issue')}")
+        if error_rate > 5:
+            result["recommendations"].append(f"Error rate ({error_rate}%) is significantly elevated — check logs")
+        if profile["cert_expiry_days"] < 14:
+            result["recommendations"].append(f"TLS certificate expires in {profile['cert_expiry_days']} days — renew immediately")
+        print(f"[HealthChecker] {service_name}: {overall} — {len(unhealthy_deps)} unhealthy deps, error_rate={error_rate}%")
+        return json.dumps(result, indent=2)