ZeroTsai0308 committed on
Commit
350aeeb
·
verified ·
1 Parent(s): c2e0132

Add sre_agent/tools/log_analysis_tools.py

Browse files
Files changed (1) hide show
  1. sre_agent/tools/log_analysis_tools.py +509 -0
sre_agent/tools/log_analysis_tools.py ADDED
@@ -0,0 +1,509 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Log Analysis Tools for SRE Agent
3
+
4
+ Implements structured log parsing, anomaly detection, and pattern extraction.
5
+ Uses techniques from:
6
+ - TrioXpert two-stage keyword+semantic log filtering (arxiv:2506.10043)
7
+ - LogAI library patterns (Salesforce, arxiv:2301.13415)
8
+ """
9
+
10
+ import json
11
+ import re
12
+ from datetime import datetime
13
+ from typing import Optional
14
+ from smolagents import Tool
15
+
16
+
17
class LogParserTool(Tool):
    """Parse and filter logs for errors, patterns, and structured extraction.

    ``forward`` narrows raw log text by service name, time window and a regex,
    then returns a JSON string with severity / service / error-type
    distributions, detected error bursts and a sample of parsed entries.
    Passing ``"auto"`` as the log content makes the tool analyse simulated logs.
    """
    name = "log_parser"
    description = """Parses raw log content and extracts structured information.

Capabilities:
- Filter logs by severity level (ERROR, WARN, INFO, DEBUG)
- Search for specific patterns (regex supported)
- Extract timestamps, service names, error codes
- Compute error frequency and distribution
- Identify error bursts (clustered errors in short time windows)

Returns structured JSON with matched entries, frequency analysis, and temporal distribution.
Use this to understand what's happening in logs during an incident.
"""
    inputs = {
        "log_content": {
            "type": "string",
            "description": "Raw log text (multi-line). Or 'auto' for simulated log data.",
        },
        "filter_pattern": {
            "type": "string",
            "description": "Regex pattern to filter log lines. E.g. 'ERROR|CRITICAL', 'timeout|OOM', 'status=[45]\\d{2}'. Default: 'ERROR|CRITICAL|FATAL'.",
            "nullable": True,
        },
        "service_name": {
            "type": "string",
            "description": "Service name to focus on (optional). If provided, only logs from this service are analyzed.",
            "nullable": True,
        },
        "time_window_minutes": {
            "type": "integer",
            "description": "Analyze logs within the last N minutes. Default: 60.",
            "nullable": True,
        },
    }
    output_type = "string"

    def _generate_sample_logs(self, service_name: Optional[str] = None) -> str:
        """Generate simulated log data with various error patterns.

        Produces 200 randomly timed lines spread over the last hour plus a
        15-line ERROR burst 5-10 minutes ago, sorted by timestamp.

        Args:
            service_name: If given, every line (and the burst) is attributed to
                this service; otherwise services are drawn from a built-in list
                and the burst is attributed to ``payment-service``.

        Returns:
            Newline-joined log lines of the form
            ``<timestamp>Z [LEVEL] [service] [req-N] message``.
        """
        import random
        from datetime import timedelta, timezone

        services = [service_name] if service_name else [
            "api-gateway",
            "payment-service",
            "user-service",
            "order-service",
            "inventory-service",
        ]
        # Repeated entries act as sampling weights for random.choice.
        levels = ["INFO", "INFO", "INFO", "INFO", "WARN", "WARN", "ERROR", "ERROR", "CRITICAL"]

        error_messages = [
            "Connection timeout to database after 30000ms",
            "OOM: Java heap space exceeded (max 4096MB)",
            "Circuit breaker OPEN for downstream service",
            "TLS handshake failed: certificate expired",
            "Rate limit exceeded: 429 Too Many Requests",
            "Disk space critically low: /data 95% used",
            "Pod evicted due to memory pressure",
            "DNS resolution failed for service-mesh.internal",
            "Health check failed: /healthz returned 503",
            "Deadlock detected in connection pool",
            "gRPC call failed: UNAVAILABLE - transport closing",
            "Kafka consumer lag exceeding threshold: 50000 messages",
            "Redis connection refused: max clients reached",
            "Request body too large: 52428800 bytes exceeds 10485760 limit",
            "Authentication token expired for service account",
        ]

        # One factory per INFO message so each placeholder gets a value of the
        # right kind (a shared positional argument list fed integers into the
        # percentage field and floats into replica/version fields).
        info_factories = [
            lambda: f"Request processed successfully in {random.randint(5, 200)}ms",
            lambda: "Health check passed: all dependencies healthy",
            lambda: f"Cache hit ratio: {random.random():.1%}",
            lambda: f"Scaling replicas from {random.randint(1, 10)} to {random.randint(1, 10)}",
            lambda: (
                "Deployment rollout complete: "
                f"v{random.randint(1, 3)}.{random.randint(0, 9)}.{random.randint(0, 9)}"
            ),
            lambda: f"Batch job completed: processed {random.randint(100, 10000)} records",
            lambda: (
                "Connection pool stats: "
                f"active={random.randint(0, 50)}, idle={random.randint(0, 50)}, "
                f"waiting={random.randint(0, 10)}"
            ),
        ]

        now = datetime.now(timezone.utc)

        def render(seconds_ago: int, level: str, service: str, msg: str) -> str:
            # Millisecond part is random; a request ID is added for tracing.
            ts = now - timedelta(seconds=seconds_ago)
            stamp = ts.strftime("%Y-%m-%dT%H:%M:%S.") + f"{random.randint(0, 999):03d}Z"
            req_id = f"req-{random.randint(10000, 99999)}"
            return f"{stamp} [{level}] [{service}] [{req_id}] {msg}"

        lines = []

        # Generate 200 log lines over 60 minutes
        for _ in range(200):
            level = random.choice(levels)
            if level in ("ERROR", "CRITICAL"):
                msg = random.choice(error_messages)
            elif level == "WARN":
                if random.random() > 0.5:
                    msg = random.choice(error_messages[:5])
                else:
                    msg = f"Slow response: {random.randint(500, 5000)}ms"
            else:
                msg = random.choice(info_factories)()
            lines.append(render(random.randint(0, 3600), level, random.choice(services), msg))

        # Inject an error burst (simulate incident)
        burst_service = service_name or "payment-service"
        for _ in range(15):
            lines.append(
                render(
                    random.randint(300, 600),
                    "ERROR",
                    burst_service,
                    random.choice(error_messages[:3]),
                )
            )

        lines.sort()  # ISO timestamps lead each line, so this sorts by time
        return "\n".join(lines)

    def _line_timestamp(self, line: str) -> Optional[datetime]:
        """Return the first ``YYYY-MM-DDTHH:MM:SS`` timestamp in *line*, or None."""
        found = re.search(r'(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})', line)
        if not found:
            return None
        try:
            return datetime.fromisoformat(found.group(1))
        except ValueError:
            # Looks like a timestamp but is not a valid date/time.
            return None

    def _restrict_to_time_window(self, lines: list, minutes: int) -> list:
        """Keep lines stamped within *minutes* of the newest timestamp in *lines*.

        The window is anchored on the newest log timestamp rather than the wall
        clock so that historical logs are not discarded wholesale. Lines without
        a parseable timestamp are kept. A non-positive window disables filtering.
        """
        from datetime import timedelta

        if minutes <= 0:
            return lines
        stamped = [(line, self._line_timestamp(line)) for line in lines]
        known = [ts for _, ts in stamped if ts is not None]
        if not known:
            return lines
        cutoff = max(known) - timedelta(minutes=minutes)
        return [line for line, ts in stamped if ts is None or ts >= cutoff]

    def _detect_bursts(self, timestamps: list) -> list:
        """Find clusters of 5+ timestamps that fall within a 5-minute window.

        Windows are anchored on the first timestamp not already consumed by a
        previous burst; 10 or more events make the burst ``critical``.
        """
        window_seconds = 300  # 5 minutes
        ordered = sorted(timestamps)
        bursts = []
        i = 0
        while i < len(ordered):
            j = i + 1
            while j < len(ordered) and (ordered[j] - ordered[i]).total_seconds() < window_seconds:
                j += 1
            window_count = j - i
            if window_count >= 5:  # 5+ errors in 5 minutes = burst
                bursts.append({
                    "start": ordered[i].isoformat(),
                    "end": ordered[j - 1].isoformat(),
                    "count": window_count,
                    "severity": "critical" if window_count >= 10 else "warning",
                })
                i = j
            else:
                i += 1
        return bursts

    def forward(
        self,
        log_content: str,
        filter_pattern: str = "ERROR|CRITICAL|FATAL",
        service_name: str = "",
        time_window_minutes: int = 60,
    ) -> str:
        """Filter and analyse log text, returning a JSON report.

        Args:
            log_content: Raw multi-line log text, or ``"auto"`` for simulated data.
            filter_pattern: Case-insensitive regex a line must match to be
                analysed. ``None``/empty falls back to ``ERROR|CRITICAL|FATAL``.
            service_name: Optional case-insensitive substring; only lines
                containing it are analysed.
            time_window_minutes: Keep only lines within this many minutes of the
                newest timestamp in the input. ``None`` means 60; a value <= 0
                disables the window.

        Returns:
            JSON string with line counts, distributions, error bursts, up to 20
            sample entries and a summary.

        Raises:
            re.error: If ``filter_pattern`` is not a valid regular expression.
        """
        from collections import Counter

        # Nullable tool inputs can arrive as an explicit None.
        filter_pattern = filter_pattern or "ERROR|CRITICAL|FATAL"
        service_name = service_name or ""
        if time_window_minutes is None:
            time_window_minutes = 60

        # Compile once up front: an invalid pattern fails before any work is done.
        line_filter = re.compile(filter_pattern, re.IGNORECASE)

        if log_content.strip().lower() == "auto":
            log_content = self._generate_sample_logs(service_name if service_name else None)
            print("[LogParser] Generated simulated log data")

        # splitlines() copes with CRLF input and yields no lines for empty text.
        lines = log_content.strip().splitlines()
        total_lines = len(lines)
        print(f"[LogParser] Processing {total_lines} log lines with pattern '{filter_pattern}'")

        # Filter by service if specified
        if service_name:
            needle = service_name.lower()
            lines = [l for l in lines if needle in l.lower()]
            print(f"[LogParser] Filtered to {len(lines)} lines for service '{service_name}'")
        lines_after_service_filter = len(lines)

        # Apply the documented time window
        lines = self._restrict_to_time_window(lines, time_window_minutes)
        if len(lines) != lines_after_service_filter:
            print(f"[LogParser] Kept {len(lines)} lines within the last {time_window_minutes} minutes")

        # Match pattern
        matched = [line for line in lines if line_filter.search(line)]

        ts_pattern = re.compile(r'(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})')
        level_pattern = re.compile(r'\[(ERROR|WARN|CRITICAL|FATAL|INFO|DEBUG)\]')
        service_pattern = re.compile(r'\[([a-zA-Z][\w-]+)\]')
        level_tokens = ("ERROR", "WARN", "CRITICAL", "FATAL", "INFO", "DEBUG")

        # (lower-cased keyword, category); the first keyword found in a line wins.
        error_keywords = (
            ("timeout", "TIMEOUT"),
            ("oom", "OUT_OF_MEMORY"),
            ("circuit breaker", "CIRCUIT_BREAKER"),
            ("tls", "TLS_ERROR"),
            ("rate limit", "RATE_LIMIT"),
            ("disk", "DISK_SPACE"),
            ("evicted", "POD_EVICTION"),
            ("dns", "DNS_FAILURE"),
            ("health check", "HEALTH_CHECK"),
            ("deadlock", "DEADLOCK"),
            ("grpc", "GRPC_ERROR"),
            ("kafka", "KAFKA_LAG"),
            ("redis", "REDIS_ERROR"),
            ("too large", "PAYLOAD_SIZE"),
            ("token expired", "AUTH_ERROR"),
        )

        # Parse structured fields from matched lines
        parsed_entries = []
        level_counts = Counter()
        service_counts = Counter()
        error_types = Counter()
        timestamps = []

        for line in matched:
            entry = {"raw": line}

            ts_match = ts_pattern.search(line)
            if ts_match:
                entry["timestamp"] = ts_match.group(1)
                try:
                    timestamps.append(datetime.fromisoformat(ts_match.group(1)))
                except ValueError:
                    # Keep the raw text in the entry but skip it for burst detection.
                    pass

            level_match = level_pattern.search(line)
            if level_match:
                entry["level"] = level_match.group(1)
                level_counts[entry["level"]] += 1

            # The first bracketed token that is neither a level nor a request ID
            # is taken to be the service name.
            candidates = [
                token for token in service_pattern.findall(line)
                if token not in level_tokens and not token.startswith("req-")
            ]
            if candidates:
                entry["service"] = candidates[0]
                service_counts[candidates[0]] += 1

            # Categorize error type
            lowered = line.lower()
            for keyword, error_type in error_keywords:
                if keyword in lowered:
                    entry["error_type"] = error_type
                    error_types[error_type] += 1
                    break

            parsed_entries.append(entry)

        # Detect error bursts (clusters of errors within 5-minute windows)
        bursts = self._detect_bursts(timestamps)

        result = {
            "total_lines_processed": total_lines,
            "lines_after_service_filter": lines_after_service_filter,
            "lines_after_time_filter": len(lines),
            "time_window_minutes": time_window_minutes,
            "matched_lines": len(matched),
            "filter_pattern": filter_pattern,
            "severity_distribution": dict(level_counts),
            "service_distribution": dict(service_counts.most_common()),
            "error_type_distribution": dict(error_types.most_common()),
            "error_bursts": bursts,
            "sample_entries": parsed_entries[:20],  # Top 20 for context
            "summary": {
                "most_affected_service": max(service_counts, key=service_counts.get) if service_counts else "unknown",
                "most_common_error": max(error_types, key=error_types.get) if error_types else "unknown",
                "has_error_burst": len(bursts) > 0,
                "error_rate_per_line": round(len(matched) / max(total_lines, 1), 4),
            },
        }

        print(f"[LogParser] Found {len(matched)} matching lines, {len(bursts)} error bursts")
        return json.dumps(result, indent=2)
273
+
274
+
275
class LogAnomalyDetectorTool(Tool):
    """Detect anomalous log patterns using frequency analysis.

    Lines are reduced to templates (variable parts replaced by placeholders);
    the current window is then compared with a baseline for new templates,
    per-template frequency changes and overall volume change.
    """
    name = "log_anomaly_detector"
    description = """Detects anomalous log patterns by analyzing log frequency, new/rare log templates,
and sudden changes in log volume.

Use this to find:
- Sudden spikes in error logs
- New error messages that haven't appeared before
- Unusual log volume patterns (too many or too few logs)
- Log template drift (new types of messages appearing)

Complements the log_parser tool — use log_parser for filtering, this tool for pattern-level anomaly detection.
"""
    inputs = {
        "log_content": {
            "type": "string",
            "description": "Raw log text (multi-line) or 'auto' for simulated data.",
        },
        "baseline_content": {
            "type": "string",
            "description": "Optional: baseline/normal log content for comparison. If not provided, uses first half of data as baseline.",
            "nullable": True,
        },
    }
    output_type = "string"

    def _extract_template(self, line: str) -> str:
        """Extract a log template by replacing variable parts with placeholders.

        Order matters: timestamps, UUIDs, request IDs and IP addresses are
        replaced before the generic number rule, otherwise their digits would
        be consumed as plain numbers first.
        """
        # Timestamps (fractional seconds and trailing Z are both optional)
        result = re.sub(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?Z?', '<TS>', line)
        # UUIDs / request IDs
        result = re.sub(
            r'[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}',
            '<UUID>',
            result,
            flags=re.IGNORECASE,
        )
        result = re.sub(r'req-\d+', '<REQ_ID>', result)
        # IP addresses
        result = re.sub(r'\b\d{1,3}(?:\.\d{1,3}){3}\b', '<IP>', result)
        # Numbers, including ones with a unit suffix such as "500ms"; digits
        # that are part of an identifier ("v2", "http2") are left alone.
        result = re.sub(r'(?<!\w)\d+(?:\.\d+)?', '<NUM>', result)
        return result.strip()

    def _generate_demo_logs(self) -> str:
        """Build simulated logs: a normal first half-hour, then an anomalous one."""
        import random

        normal_templates = [
            "2024-01-15T10:{:02d}:{:02d}.000Z [INFO] [api-gateway] [req-{}] Request processed in {}ms",
            "2024-01-15T10:{:02d}:{:02d}.000Z [INFO] [user-service] [req-{}] Cache hit for user {}",
            "2024-01-15T10:{:02d}:{:02d}.000Z [INFO] [order-service] [req-{}] Order {} created",
            "2024-01-15T10:{:02d}:{:02d}.000Z [WARN] [api-gateway] [req-{}] Slow response: {}ms",
        ]
        anomaly_templates = [
            "2024-01-15T10:{:02d}:{:02d}.000Z [ERROR] [payment-service] [req-{}] CRITICAL: Database connection pool exhausted",
            "2024-01-15T10:{:02d}:{:02d}.000Z [ERROR] [payment-service] [req-{}] Transaction deadlock detected on table payments",
            "2024-01-15T10:{:02d}:{:02d}.000Z [CRITICAL] [payment-service] [req-{}] Cascading failure: all replicas unhealthy",
        ]

        def stamp(first_minute: int, last_minute: int) -> tuple:
            # (minute, second, request id) shared by every template
            return (
                random.randint(first_minute, last_minute),
                random.randint(0, 59),
                random.randint(10000, 99999),
            )

        lines = []
        # Normal baseline (first 30 min)
        for _ in range(100):
            lines.append(random.choice(normal_templates).format(*stamp(0, 29), random.randint(1, 500)))
        # Anomalous period (last 30 min)
        for _ in range(60):
            lines.append(random.choice(normal_templates).format(*stamp(30, 59), random.randint(1, 500)))
        for _ in range(40):
            lines.append(random.choice(anomaly_templates).format(*stamp(30, 59)))

        lines.sort()
        return "\n".join(lines)

    def forward(self, log_content: str, baseline_content: str = "") -> str:
        """Compare current logs against a baseline and report anomalies as JSON.

        Args:
            log_content: Raw multi-line log text, or ``"auto"`` for simulated data.
            baseline_content: Optional baseline log text. When empty or ``None``
                the first half of ``log_content`` is the baseline and the second
                half the current window.

        Returns:
            JSON string with template counts, new templates, frequency changes
            (ratio > 2.0 or < 0.5), volume analysis and an overall verdict.
        """
        from collections import Counter

        if log_content.strip().lower() == "auto":
            # Generate logs with some anomalous patterns
            log_content = self._generate_demo_logs()

        # Blank lines carry no template and would only skew the counts.
        lines = [l for l in log_content.splitlines() if l.strip()]

        # Split into baseline and current if no baseline provided.
        # The nullable input may arrive as an explicit None.
        baseline_content = baseline_content or ""
        if baseline_content.strip():
            baseline_lines = [l for l in baseline_content.splitlines() if l.strip()]
            current_lines = lines
        else:
            mid = len(lines) // 2
            baseline_lines = lines[:mid]
            current_lines = lines[mid:]

        print(f"[LogAnomalyDetector] Analyzing {len(current_lines)} current lines against {len(baseline_lines)} baseline lines")

        # Extract templates
        baseline_templates = Counter(self._extract_template(l) for l in baseline_lines)
        current_templates = Counter(self._extract_template(l) for l in current_lines)

        # Find new templates (not in baseline)
        new_templates = {k: v for k, v in current_templates.items() if k not in baseline_templates}

        # Find templates with significant frequency change
        ranked_changes = []
        for template, current_count in current_templates.items():
            baseline_count = baseline_templates.get(template, 0)
            if baseline_count <= 0:
                continue
            change_ratio = current_count / baseline_count
            if change_ratio > 2.0 or change_ratio < 0.5:
                # Rank by how far the ratio is from 1 in either direction so a
                # 10x drop outranks a 2x rise (plain abs() favoured increases).
                magnitude = max(change_ratio, 1 / change_ratio)
                ranked_changes.append((magnitude, {
                    "template": template[:150],
                    "baseline_count": baseline_count,
                    "current_count": current_count,
                    "change_ratio": round(change_ratio, 2),
                    "direction": "increase" if change_ratio > 1 else "decrease",
                }))
        ranked_changes.sort(key=lambda pair: pair[0], reverse=True)
        frequency_changes = [change for _, change in ranked_changes]

        # Volume analysis
        window_minutes = 30  # both windows are assumed to span 30 minutes
        baseline_volume_per_min = len(baseline_lines) / window_minutes
        current_volume_per_min = len(current_lines) / window_minutes
        # Floor the denominator so an empty baseline does not divide by zero.
        volume_change = current_volume_per_min / max(baseline_volume_per_min, 0.01)
        anomalous_volume = volume_change > 2.0 or volume_change < 0.5

        if len(new_templates) > 5 or volume_change > 5:
            severity = "critical"
        elif new_templates or frequency_changes:
            severity = "warning"
        else:
            severity = "ok"

        top_new = sorted(new_templates.items(), key=lambda x: x[1], reverse=True)[:10]

        result = {
            "baseline_lines": len(baseline_lines),
            "current_lines": len(current_lines),
            "baseline_unique_templates": len(baseline_templates),
            "current_unique_templates": len(current_templates),
            "new_templates": {
                "count": len(new_templates),
                "templates": [{"template": k[:150], "count": v} for k, v in top_new],
            },
            "frequency_changes": frequency_changes[:10],
            "volume_analysis": {
                "baseline_volume_per_min": round(baseline_volume_per_min, 2),
                "current_volume_per_min": round(current_volume_per_min, 2),
                "volume_change_ratio": round(volume_change, 2),
                "anomalous_volume": anomalous_volume,
            },
            "verdict": {
                "has_new_error_patterns": len(new_templates) > 0,
                "has_frequency_anomalies": len(frequency_changes) > 0,
                "has_volume_anomaly": anomalous_volume,
                "severity": severity,
            },
        }

        print(f"[LogAnomalyDetector] Found {len(new_templates)} new templates, {len(frequency_changes)} frequency changes")
        return json.dumps(result, indent=2)
424
+
425
+
426
class LogPatternExtractorTool(Tool):
    """Extract common patterns and keywords from logs for RCA.

    Counts HTTP-status-like codes, exception phrases, bracketed service names
    and keywords from error lines, and derives a few plain-text insights.
    """
    name = "log_pattern_extractor"
    description = """Extracts common patterns, error codes, service names, and key phrases from log data.

Use this to:
- Identify the most frequent error messages
- Extract HTTP status codes, error codes, exception types
- Find common service/component names in error logs
- Build a keyword summary for root cause investigation

Good as a preprocessing step before feeding results to the RCA correlator.
"""
    inputs = {
        "log_content": {
            "type": "string",
            "description": "Raw log text (multi-line) or 'auto' for simulated data.",
        },
        "top_k": {
            "type": "integer",
            "description": "Number of top patterns to return. Default: 10.",
            "nullable": True,
        },
    }
    output_type = "string"

    def forward(self, log_content: str, top_k: int = 10) -> str:
        """Summarise recurring patterns in log text as a JSON string.

        Args:
            log_content: Raw multi-line log text, or ``"auto"`` to analyse logs
                produced by ``LogParserTool._generate_sample_logs``.
            top_k: Maximum entries per category (keywords get ``2 * top_k``).
                ``None`` means 10; negative values are treated as 0.

        Returns:
            JSON string with line counts, the top status codes, exception
            phrases, services and error keywords, plus ``key_insights``.
        """
        from collections import Counter

        # The input is nullable, and a negative value would slice from the
        # wrong end of the rankings.
        top_k = 10 if top_k is None else max(int(top_k), 0)

        if log_content.strip().lower() == "auto":
            log_content = LogParserTool()._generate_sample_logs()

        lines = log_content.strip().split("\n")
        print(f"[LogPatternExtractor] Extracting patterns from {len(lines)} lines")

        # Extract HTTP status codes (any standalone 3-digit number in 200-599)
        status_codes = Counter(re.findall(r'\b[2345]\d{2}\b', log_content))

        # Extract exception/error types: the phrase following an error keyword
        exceptions = Counter()
        exception_re = r'(?:Exception|Error|Failure|Fault|Timeout|OOM|Deadlock|CRITICAL)\b[:\s]*([\w\s]+?)(?:\.|,|\n|$)'
        for phrase in re.findall(exception_re, log_content, re.IGNORECASE):
            cleaned = phrase.strip()[:50]
            if cleaned:
                exceptions[cleaned] += 1

        # Extract service/component names
        service_re = re.compile(r'\[([a-zA-Z][\w-]+(?:-service|-api|-worker|-gateway|-proxy))\]')
        services = Counter(service_re.findall(log_content))

        # Extract key phrases (single-word keywords from error lines)
        error_lines = [l for l in lines if re.search(r'ERROR|CRITICAL|FATAL', l, re.IGNORECASE)]
        # Filter common words and log-level names
        stopwords = {'the', 'and', 'for', 'from', 'with', 'this', 'that', 'was', 'are', 'not', 'but', 'has', 'had', 'have', 'been', 'info', 'error', 'warn', 'critical', 'fatal', 'debug'}
        word_freq = Counter(
            word
            for line in error_lines
            for word in re.findall(r'\b[a-zA-Z]{3,}\b', line.lower())
            if word not in stopwords
        )

        # Services counted on error lines only, so the "most affected" insight
        # really reflects mentions in error logs.
        error_services = Counter(
            svc for line in error_lines for svc in service_re.findall(line)
        )

        result = {
            "total_lines": len(lines),
            "error_lines": len(error_lines),
            "status_codes": dict(status_codes.most_common(top_k)),
            "exception_types": dict(exceptions.most_common(top_k)),
            "services_mentioned": dict(services.most_common(top_k)),
            "top_error_keywords": dict(word_freq.most_common(top_k * 2)),
            "key_insights": [],
        }

        # Generate insights
        if status_codes:
            top_code, code_hits = status_codes.most_common(1)[0]
            result["key_insights"].append(f"Most common HTTP status: {top_code} ({code_hits} occurrences)")
        if exceptions:
            top_exc, exc_hits = exceptions.most_common(1)[0]
            result["key_insights"].append(f"Most common error type: {top_exc} ({exc_hits} occurrences)")
        if error_services:
            top_svc, svc_hits = error_services.most_common(1)[0]
            result["key_insights"].append(f"Most affected service: {top_svc} ({svc_hits} mentions in error logs)")

        print(f"[LogPatternExtractor] Extracted {len(status_codes)} status codes, {len(exceptions)} exception types, {len(services)} services")
        return json.dumps(result, indent=2)