ZeroTsai0308 committed on
Commit
60d4ff4
·
verified ·
1 Parent(s): 3691744

Add sre_agent/tools/infrastructure_tools.py

Browse files
sre_agent/tools/infrastructure_tools.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Infrastructure Tools for SRE Agent
3
+
4
+ Resource utilization monitoring and service health checking.
5
+ """
6
+
7
import json
import random
from datetime import datetime, timedelta, timezone

from smolagents import Tool
11
+
12
+
13
class ResourceUtilizationTool(Tool):
    """Check current resource utilization for services/hosts.

    Produces a simulated, Kubernetes-style snapshot (per-pod CPU/memory/disk/
    network plus aggregates and alerts) serialized as a JSON string.
    """

    name = "resource_utilization"
    description = """Checks current resource utilization for a service or infrastructure component.

    Returns real-time (simulated) metrics:
    - CPU utilization per pod/instance
    - Memory usage per pod/instance
    - Disk I/O and usage
    - Network throughput
    - Pod/container status (running, pending, crash-looping)

    Use this to get a quick snapshot of resource health for a service.
    """
    inputs = {
        "service_name": {
            "type": "string",
            "description": "Service or component to check, e.g. 'payment-service', 'database-primary'.",
        },
        "resource_type": {
            "type": "string",
            "description": "Specific resource to check: 'all', 'cpu', 'memory', 'disk', 'network'. Default: 'all'.",
            "nullable": True,
        },
    }
    output_type = "string"

    def forward(self, service_name: str, resource_type: str = "all") -> str:
        """Return a JSON snapshot of (simulated) resource usage for *service_name*.

        Args:
            service_name: Service or component to inspect. Unknown services get
                a generic 2-replica healthy profile.
            resource_type: One of 'all', 'cpu', 'memory', 'disk', 'network'.
                Declared nullable in ``inputs``, so ``None`` is treated as 'all';
                case and surrounding whitespace are ignored.

        Returns:
            JSON string with per-pod metrics, aggregate stats, and alerts.
        """
        # The input schema marks resource_type as nullable: an explicit None
        # (or a mixed-case value like "CPU") previously matched none of the
        # section checks and silently returned pods with no metrics at all.
        resource_type = (resource_type or "all").strip().lower()
        print(f"[ResourceUtilization] Checking {resource_type} for '{service_name}'")

        # Simulated per-service pod profiles; payment-service deliberately
        # includes one crash-looping pod to exercise alerting paths.
        pod_profiles = {
            "payment-service": {"replicas": 5, "cpu_base": 75, "mem_base": 70, "status_mix": ["Running"] * 4 + ["CrashLoopBackOff"]},
            "api-gateway": {"replicas": 3, "cpu_base": 45, "mem_base": 55, "status_mix": ["Running"] * 3},
            "order-service": {"replicas": 3, "cpu_base": 35, "mem_base": 50, "status_mix": ["Running"] * 3},
            "user-service": {"replicas": 2, "cpu_base": 20, "mem_base": 40, "status_mix": ["Running"] * 2},
            "database-primary": {"replicas": 1, "cpu_base": 60, "mem_base": 85, "status_mix": ["Running"]},
            "cache-redis": {"replicas": 3, "cpu_base": 15, "mem_base": 30, "status_mix": ["Running"] * 3},
        }

        # Fallback profile for services not in the table above.
        profile = pod_profiles.get(service_name, {"replicas": 2, "cpu_base": 30, "mem_base": 40, "status_mix": ["Running"] * 2})

        pods = []
        for i in range(profile["replicas"]):
            # Jitter around the baseline so repeated calls look "live".
            cpu = profile["cpu_base"] + random.uniform(-10, 20)
            mem = profile["mem_base"] + random.uniform(-5, 15)
            status = profile["status_mix"][i] if i < len(profile["status_mix"]) else "Running"
            # Only crash-looping pods accumulate restarts.
            restarts = random.randint(5, 20) if status == "CrashLoopBackOff" else 0

            pod = {
                # Mimic a k8s pod name: <service>-<replicaset>-<random suffix>.
                "name": f"{service_name}-{random.randint(1000, 9999)}-{''.join(random.choices('abcdef0123456789', k=5))}",
                "status": status,
                "restarts": restarts,
                "age": f"{random.randint(1, 30)}d",
            }

            # Attach only the requested metric sections.
            if resource_type in ("all", "cpu"):
                pod["cpu"] = {
                    "usage_pct": round(min(cpu, 100), 1),
                    "request": "500m",
                    "limit": "2000m",
                    "throttled": cpu > 90,
                }
            if resource_type in ("all", "memory"):
                pod["memory"] = {
                    "usage_pct": round(min(mem, 100), 1),
                    "usage_mb": round(mem * 40.96, 0),  # Assuming 4GB limit
                    "request": "1Gi",
                    "limit": "4Gi",
                    "oom_risk": mem > 85,
                }
            if resource_type in ("all", "disk"):
                pod["disk"] = {
                    "usage_pct": round(random.uniform(20, 70), 1),
                    "io_read_mbps": round(random.uniform(1, 50), 1),
                    "io_write_mbps": round(random.uniform(1, 30), 1),
                }
            if resource_type in ("all", "network"):
                pod["network"] = {
                    "rx_mbps": round(random.uniform(10, 200), 1),
                    "tx_mbps": round(random.uniform(5, 100), 1),
                    "connections_active": random.randint(50, 500),
                    "connections_max": 1000,
                }

            pods.append(pod)

        # Aggregate stats (empty lists when the section was not requested).
        cpu_values = [p["cpu"]["usage_pct"] for p in pods if "cpu" in p]
        mem_values = [p["memory"]["usage_pct"] for p in pods if "memory" in p]

        unhealthy = [p for p in pods if p["status"] != "Running"]

        result = {
            "service": service_name,
            # datetime.utcnow() is deprecated since Python 3.12; derive the
            # same naive-UTC ISO string ending in "Z" from an aware timestamp.
            "timestamp": datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z",
            "total_pods": len(pods),
            "healthy_pods": len(pods) - len(unhealthy),
            "unhealthy_pods": [{"name": p["name"], "status": p["status"], "restarts": p["restarts"]} for p in unhealthy],
            "aggregate": {},
            "pods": pods,
            "alerts": [],
        }

        if cpu_values:
            result["aggregate"]["cpu"] = {
                "avg_pct": round(sum(cpu_values) / len(cpu_values), 1),
                "max_pct": round(max(cpu_values), 1),
                "throttled_pods": sum(1 for p in pods if p.get("cpu", {}).get("throttled")),
            }
        if mem_values:
            result["aggregate"]["memory"] = {
                "avg_pct": round(sum(mem_values) / len(mem_values), 1),
                "max_pct": round(max(mem_values), 1),
                "oom_risk_pods": sum(1 for p in pods if p.get("memory", {}).get("oom_risk")),
            }

        # Generate alerts: unhealthy pods are critical; high CPU/memory warn.
        if unhealthy:
            result["alerts"].append({"severity": "critical", "message": f"{len(unhealthy)} pod(s) unhealthy: {', '.join(p['name'] for p in unhealthy)}"})
        if cpu_values and max(cpu_values) > 85:
            result["alerts"].append({"severity": "warning", "message": f"High CPU detected: max {max(cpu_values):.1f}%"})
        if mem_values and max(mem_values) > 85:
            result["alerts"].append({"severity": "warning", "message": f"High memory detected: max {max(mem_values):.1f}%"})

        print(f"[ResourceUtilization] {service_name}: {len(pods)} pods, {len(unhealthy)} unhealthy, {len(result['alerts'])} alerts")
        return json.dumps(result, indent=2)
140
+
141
+
142
class ServiceHealthCheckerTool(Tool):
    """Quick health check for a service and its dependencies.

    Aggregates (simulated) endpoint, dependency, metric, deployment, and TLS
    certificate signals into a GREEN/YELLOW/RED verdict plus recommendations,
    serialized as a JSON string.
    """

    name = "service_health_checker"
    description = """Performs a comprehensive health check for a service.

    Checks:
    - Service endpoint health (/healthz, /readyz)
    - Dependency connectivity (databases, caches, queues)
    - Recent error rate and latency
    - Deployment status
    - Certificate validity

    Returns a traffic-light health status (green/yellow/red) with details.
    Use this for a quick overview of service health before deep investigation.
    """
    inputs = {
        "service_name": {
            "type": "string",
            "description": "Service to check health for, e.g. 'payment-service'.",
        },
    }
    output_type = "string"

    def forward(self, service_name: str) -> str:
        """Return a JSON health report for *service_name*.

        Args:
            service_name: Service to check. Known services ('payment-service',
                'api-gateway') use canned degraded/healthy scenarios; anything
                else gets a randomized healthy profile.

        Returns:
            JSON string with overall_health (GREEN/YELLOW/RED), endpoint,
            dependency, metric, deployment, cert details, and recommendations.
        """
        print(f"[HealthChecker] Running health check for '{service_name}'")

        # Canned scenarios: payment-service is degraded (exhausted DB pool),
        # api-gateway is healthy but sees payment-service latency downstream.
        health_profiles = {
            "payment-service": {
                "endpoint_health": {"status": "degraded", "response_time_ms": 2500, "last_check": "10s ago"},
                "dependencies": [
                    {"name": "payment-db", "status": "unhealthy", "latency_ms": 30000, "error": "Connection pool exhausted"},
                    {"name": "payment-gateway-ext", "status": "healthy", "latency_ms": 150},
                    {"name": "fraud-detection", "status": "healthy", "latency_ms": 45},
                    {"name": "cache-redis", "status": "healthy", "latency_ms": 2},
                ],
                "recent_metrics": {"error_rate": 12.5, "p99_latency_ms": 2500, "rps": 450},
                "deployment": {"version": "v2.5.0", "deployed_at": "2024-01-15T10:00:00Z", "rollback_available": "v2.4.1"},
                "cert_expiry_days": 45,
            },
            "api-gateway": {
                "endpoint_health": {"status": "healthy", "response_time_ms": 5, "last_check": "10s ago"},
                "dependencies": [
                    {"name": "payment-service", "status": "degraded", "latency_ms": 2500, "error": "High latency"},
                    {"name": "user-service", "status": "healthy", "latency_ms": 15},
                    {"name": "order-service", "status": "healthy", "latency_ms": 20},
                    {"name": "auth-service", "status": "healthy", "latency_ms": 10},
                ],
                "recent_metrics": {"error_rate": 3.2, "p99_latency_ms": 180, "rps": 2000},
                "deployment": {"version": "v3.1.0", "deployed_at": "2024-01-10T14:00:00Z", "rollback_available": "v3.0.9"},
                "cert_expiry_days": 30,
            },
        }

        # Randomized healthy fallback for services without a canned scenario.
        profile = health_profiles.get(service_name, {
            "endpoint_health": {"status": "healthy", "response_time_ms": random.randint(5, 50), "last_check": "10s ago"},
            "dependencies": [
                {"name": "database", "status": "healthy", "latency_ms": random.randint(1, 20)},
                {"name": "cache", "status": "healthy", "latency_ms": random.randint(1, 5)},
            ],
            "recent_metrics": {"error_rate": round(random.uniform(0, 0.5), 2), "p99_latency_ms": random.randint(10, 100), "rps": random.randint(100, 1000)},
            "deployment": {"version": "v1.0.0", "deployed_at": "2024-01-01T00:00:00Z", "rollback_available": "v0.9.9"},
            "cert_expiry_days": random.randint(30, 365),
        })

        # Compute overall traffic-light health from endpoint status, error
        # rate, and the number of unhealthy dependencies.
        unhealthy_deps = [d for d in profile["dependencies"] if d["status"] != "healthy"]
        error_rate = profile["recent_metrics"]["error_rate"]
        endpoint_status = profile["endpoint_health"]["status"]

        if endpoint_status == "unhealthy" or error_rate > 10 or len(unhealthy_deps) > 1:
            overall = "RED"
        elif endpoint_status == "degraded" or error_rate > 1 or len(unhealthy_deps) > 0:
            overall = "YELLOW"
        else:
            overall = "GREEN"

        result = {
            "service": service_name,
            "overall_health": overall,
            # datetime.utcnow() is deprecated since Python 3.12; derive the
            # same naive-UTC ISO string ending in "Z" from an aware timestamp.
            "timestamp": datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z",
            "endpoint_health": profile["endpoint_health"],
            "dependency_health": {
                "total": len(profile["dependencies"]),
                "healthy": len(profile["dependencies"]) - len(unhealthy_deps),
                "unhealthy": len(unhealthy_deps),
                "details": profile["dependencies"],
            },
            "recent_metrics": profile["recent_metrics"],
            "deployment": profile["deployment"],
            "cert_status": {
                "days_until_expiry": profile["cert_expiry_days"],
                "status": "OK" if profile["cert_expiry_days"] > 14 else "EXPIRING_SOON" if profile["cert_expiry_days"] > 0 else "EXPIRED",
            },
            "recommendations": [],
        }

        if overall == "RED":
            result["recommendations"].append("URGENT: Service is in critical state — investigate immediately")
        if unhealthy_deps:
            for dep in unhealthy_deps:
                result["recommendations"].append(f"Investigate dependency '{dep['name']}': {dep.get('error', 'unknown issue')}")
        if error_rate > 5:
            result["recommendations"].append(f"Error rate ({error_rate}%) is significantly elevated — check logs")
        # <= 14 keeps the recommendation aligned with cert_status: at exactly
        # 14 days the status is EXPIRING_SOON, so a renewal nudge must appear
        # too (the original `< 14` skipped it on that boundary).
        if profile["cert_expiry_days"] <= 14:
            result["recommendations"].append(f"TLS certificate expires in {profile['cert_expiry_days']} days — renew immediately")

        print(f"[HealthChecker] {service_name}: {overall} — {len(unhealthy_deps)} unhealthy deps, error_rate={error_rate}%")
        return json.dumps(result, indent=2)