Spaces:

ZeroTsai0308
/

sre-agent

Sleeping

App Files Files Community

ZeroTsai0308 committed on Apr 24

Commit

350aeeb

verified ·

1 Parent(s): c2e0132

Add sre_agent/tools/log_analysis_tools.py

Browse files

Files changed (1) hide show

sre_agent/tools/log_analysis_tools.py +509 -0

sre_agent/tools/log_analysis_tools.py ADDED Viewed

	@@ -0,0 +1,509 @@

+"""
+Log Analysis Tools for SRE Agent
+Implements structured log parsing, anomaly detection, and pattern extraction.
+Uses techniques from:
+- TrioXpert two-stage keyword+semantic log filtering (arxiv:2506.10043)
+- LogAI library patterns (Salesforce, arxiv:2301.13415)
+"""
+import json
+import re
+from datetime import datetime
+from typing import Optional
+from smolagents import Tool
class LogParserTool(Tool):
    """Parse and filter logs for errors, patterns, and structured extraction.

    The tool takes raw multi-line log text (or the literal string ``auto`` to
    use generated sample data), optionally narrows it to one service and to a
    trailing time window, keeps the lines matching a regex, and reports
    severity / service / error-type distributions plus detected error bursts
    as a JSON string.
    """

    name = "log_parser"
    description = """Parses raw log content and extracts structured information.
    Capabilities:
    - Filter logs by severity level (ERROR, WARN, INFO, DEBUG)
    - Search for specific patterns (regex supported)
    - Extract timestamps, service names, error codes
    - Compute error frequency and distribution
    - Identify error bursts (clustered errors in short time windows)
    Returns structured JSON with matched entries, frequency analysis, and temporal distribution.
    Use this to understand what's happening in logs during an incident.
    """
    inputs = {
        "log_content": {
            "type": "string",
            "description": "Raw log text (multi-line). Or 'auto' for simulated log data.",
        },
        "filter_pattern": {
            "type": "string",
            "description": "Regex pattern to filter log lines. E.g. 'ERROR|CRITICAL', 'timeout|OOM', 'status=[45]\\d{2}'. Default: 'ERROR|CRITICAL|FATAL'.",
            "nullable": True,
        },
        "service_name": {
            "type": "string",
            "description": "Service name to focus on (optional). If provided, only logs from this service are analyzed.",
            "nullable": True,
        },
        "time_window_minutes": {
            "type": "integer",
            "description": "Analyze logs within the last N minutes. Default: 60.",
            "nullable": True,
        },
    }
    output_type = "string"

    def _generate_sample_logs(self, service_name: Optional[str] = None) -> str:
        """Generate simulated log lines covering roughly the last hour.

        Produces 200 randomly distributed lines plus a 15-line ERROR burst
        placed 5-10 minutes before "now", so burst detection has something
        to find.

        Args:
            service_name: When given, every generated line (and the burst)
                is attributed to this service; otherwise a fixed set of five
                service names is used and the burst hits ``payment-service``.

        Returns:
            The generated lines joined by newlines, sorted by timestamp.
        """
        import random
        from datetime import timedelta, timezone

        services = [service_name] if service_name else [
            "api-gateway",
            "payment-service",
            "user-service",
            "order-service",
            "inventory-service",
        ]
        # Repeated entries weight the random choice towards INFO.
        levels = ["INFO", "INFO", "INFO", "INFO", "WARN", "WARN", "ERROR", "ERROR", "CRITICAL"]
        error_messages = [
            "Connection timeout to database after 30000ms",
            "OOM: Java heap space exceeded (max 4096MB)",
            "Circuit breaker OPEN for downstream service",
            "TLS handshake failed: certificate expired",
            "Rate limit exceeded: 429 Too Many Requests",
            "Disk space critically low: /data 95% used",
            "Pod evicted due to memory pressure",
            "DNS resolution failed for service-mesh.internal",
            "Health check failed: /healthz returned 503",
            "Deadlock detected in connection pool",
            "gRPC call failed: UNAVAILABLE - transport closing",
            "Kafka consumer lag exceeding threshold: 50000 messages",
            "Redis connection refused: max clients reached",
            "Request body too large: 52428800 bytes exceeds 10485760 limit",
            "Authentication token expired for service account",
        ]
        # Each builder supplies arguments that fit its own template. Feeding
        # one shared positional argument list to every template would put an
        # integer into the percentage slot and a float into version numbers.
        info_builders = [
            lambda: f"Request processed successfully in {random.randint(5, 200)}ms",
            lambda: "Health check passed: all dependencies healthy",
            lambda: f"Cache hit ratio: {random.random():.1%}",
            lambda: f"Scaling replicas from {random.randint(1, 10)} to {random.randint(1, 10)}",
            lambda: (
                "Deployment rollout complete: "
                f"v{random.randint(1, 5)}.{random.randint(1, 3)}.{random.randint(0, 9)}"
            ),
            lambda: f"Batch job completed: processed {random.randint(100, 10000)} records",
            lambda: (
                f"Connection pool stats: active={random.randint(1, 50)}, "
                f"idle={random.randint(0, 50)}, waiting={random.randint(0, 20)}"
            ),
        ]

        now = datetime.now(timezone.utc)

        def stamp(seconds_ago: int) -> str:
            """Format a UTC timestamp ``seconds_ago`` before now with random millis."""
            moment = now - timedelta(seconds=seconds_ago)
            return moment.strftime("%Y-%m-%dT%H:%M:%S.") + f"{random.randint(0, 999):03d}Z"

        def request_id() -> str:
            """Return a random request id used for tracing."""
            return f"req-{random.randint(10000, 99999)}"

        lines = []
        # 200 background lines spread over the last 60 minutes.
        for _ in range(200):
            level = random.choice(levels)
            if level in ("ERROR", "CRITICAL"):
                msg = random.choice(error_messages)
            elif level == "WARN":
                if random.random() > 0.5:
                    msg = random.choice(error_messages[:5])
                else:
                    msg = f"Slow response: {random.randint(500, 5000)}ms"
            else:
                msg = random.choice(info_builders)()
            service = random.choice(services)
            lines.append(
                f"{stamp(random.randint(0, 3600))} [{level}] [{service}] [{request_id()}] {msg}"
            )

        # Inject an error burst (simulated incident) 5-10 minutes ago.
        burst_service = service_name or "payment-service"
        for _ in range(15):
            msg = random.choice(error_messages[:3])
            lines.append(
                f"{stamp(random.randint(300, 600))} [ERROR] [{burst_service}] [{request_id()}] {msg}"
            )

        # ISO-8601 timestamps lead each line, so a plain sort is chronological.
        lines.sort()
        return "\n".join(lines)

    def _find_bursts(self, timestamps, window_seconds: int = 300, min_events: int = 5) -> list:
        """Group timestamps into bursts of clustered events.

        Starting from each not-yet-consumed timestamp, counts how many events
        fall strictly within ``window_seconds`` of it. If at least
        ``min_events`` do, they are reported as one burst and consumed.

        Args:
            timestamps: ``datetime`` objects in any order.
            window_seconds: Width of the clustering window.
            min_events: Minimum events in a window to count as a burst.

        Returns:
            A list of dicts with ``start``, ``end``, ``count`` and
            ``severity`` ("critical" for 10+ events, otherwise "warning").
        """
        ordered = sorted(timestamps)
        bursts = []
        start = 0
        while start < len(ordered):
            end = start + 1
            while (
                end < len(ordered)
                and (ordered[end] - ordered[start]).total_seconds() < window_seconds
            ):
                end += 1
            count = end - start
            if count >= min_events:
                bursts.append({
                    "start": ordered[start].isoformat(),
                    "end": ordered[end - 1].isoformat(),
                    "count": count,
                    "severity": "critical" if count >= 10 else "warning",
                })
                start = end
            else:
                start += 1
        return bursts

    def forward(
        self,
        log_content: str,
        filter_pattern: str = "ERROR|CRITICAL|FATAL",
        service_name: str = "",
        time_window_minutes: int = 60,
    ) -> str:
        """Filter and summarize log lines.

        Args:
            log_content: Raw multi-line log text, or ``auto`` (any case) to
                analyze generated sample data.
            filter_pattern: Case-insensitive regex selecting the lines to
                analyze. ``None`` or empty falls back to the default. A
                pattern that is not valid regex is matched as literal text
                instead of raising.
            service_name: If non-empty, keep only lines containing this text
                (case-insensitive substring match).
            time_window_minutes: Keep only lines whose timestamp lies within
                this many minutes of the newest timestamp in the input. The
                window is anchored on the log itself rather than the wall
                clock so historical logs can be analyzed. Lines without a
                parseable timestamp are kept. ``None`` means 60; a value
                <= 0 disables the window.

        Returns:
            A JSON string with line counts, severity / service / error-type
            distributions, detected bursts, up to 20 sample entries and a
            short summary.
        """
        # Imported locally, matching the file's function-scope import style.
        from collections import Counter
        from datetime import timedelta

        default_pattern = "ERROR|CRITICAL|FATAL"
        # Nullable inputs may arrive as None rather than their defaults.
        log_content = log_content or ""
        filter_pattern = filter_pattern or default_pattern
        service_name = (service_name or "").strip()
        if time_window_minutes is None:
            time_window_minutes = 60

        if log_content.strip().lower() == "auto":
            log_content = self._generate_sample_logs(service_name or None)
            print("[LogParser] Generated simulated log data")

        try:
            matcher = re.compile(filter_pattern, re.IGNORECASE)
        except re.error as exc:
            # A malformed pattern should not abort the analysis.
            print(f"[LogParser] Invalid regex '{filter_pattern}' ({exc}); matching it as literal text")
            matcher = re.compile(re.escape(filter_pattern), re.IGNORECASE)

        ts_pattern = re.compile(r'(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})')
        level_pattern = re.compile(r'\[(ERROR|WARN|CRITICAL|FATAL|INFO|DEBUG)\]')
        service_pattern = re.compile(r'\[([a-zA-Z][\w-]+)\]')
        non_service_tokens = {"ERROR", "WARN", "CRITICAL", "FATAL", "INFO", "DEBUG"}
        # (lower-cased keyword, category); order matters, first hit wins.
        error_keywords = [
            ("timeout", "TIMEOUT"),
            ("oom", "OUT_OF_MEMORY"),
            ("circuit breaker", "CIRCUIT_BREAKER"),
            ("tls", "TLS_ERROR"),
            ("rate limit", "RATE_LIMIT"),
            ("disk", "DISK_SPACE"),
            ("evicted", "POD_EVICTION"),
            ("dns", "DNS_FAILURE"),
            ("health check", "HEALTH_CHECK"),
            ("deadlock", "DEADLOCK"),
            ("grpc", "GRPC_ERROR"),
            ("kafka", "KAFKA_LAG"),
            ("redis", "REDIS_ERROR"),
            ("too large", "PAYLOAD_SIZE"),
            ("token expired", "AUTH_ERROR"),
        ]

        def read_timestamp(line):
            """Return ``(raw_text, parsed_datetime)``; either may be None."""
            found = ts_pattern.search(line)
            if not found:
                return None, None
            raw = found.group(1)
            try:
                return raw, datetime.fromisoformat(raw)
            except ValueError:
                # Looks like a timestamp but is not a real date/time.
                return raw, None

        # One record per non-blank line: (text, raw timestamp, parsed timestamp).
        records = [
            (text, *read_timestamp(text))
            for text in log_content.splitlines()
            if text.strip()
        ]
        total_lines = len(records)
        print(f"[LogParser] Processing {total_lines} log lines with pattern '{filter_pattern}'")

        # The window anchor is the newest timestamp anywhere in the input.
        known_times = [when for _, _, when in records if when is not None]

        # Filter by service if specified
        if service_name:
            needle = service_name.lower()
            records = [rec for rec in records if needle in rec[0].lower()]
            print(f"[LogParser] Filtered to {len(records)} lines for service '{service_name}'")
        lines_after_service_filter = len(records)

        # Restrict to the trailing time window
        if time_window_minutes > 0 and known_times:
            cutoff = max(known_times) - timedelta(minutes=time_window_minutes)
            records = [rec for rec in records if rec[2] is None or rec[2] >= cutoff]
        lines_in_time_window = len(records)

        matched = [rec for rec in records if matcher.search(rec[0])]

        # Parse structured fields from matched lines
        parsed_entries = []
        level_counts = Counter()
        service_counts = Counter()
        error_types = Counter()
        burst_times = []
        for text, raw_ts, when in matched:
            entry = {"raw": text}
            if raw_ts is not None:
                entry["timestamp"] = raw_ts
            if when is not None:
                burst_times.append(when)

            level_match = level_pattern.search(text)
            if level_match:
                entry["level"] = level_match.group(1)
                level_counts[entry["level"]] += 1

            # Bracketed tokens that are neither a level nor a request id are
            # treated as service names; the first one wins.
            candidates = [
                token
                for token in service_pattern.findall(text)
                if token not in non_service_tokens and not token.startswith("req-")
            ]
            if candidates:
                entry["service"] = candidates[0]
                service_counts[candidates[0]] += 1

            lowered = text.lower()
            for keyword, error_type in error_keywords:
                if keyword in lowered:
                    entry["error_type"] = error_type
                    error_types[error_type] += 1
                    break
            parsed_entries.append(entry)

        # Detect error bursts (5+ matched lines within a 5-minute window)
        bursts = self._find_bursts(burst_times, window_seconds=300, min_events=5)

        result = {
            "total_lines_processed": total_lines,
            "lines_after_service_filter": lines_after_service_filter,
            "time_window_minutes": time_window_minutes,
            "lines_in_time_window": lines_in_time_window,
            "matched_lines": len(matched),
            "filter_pattern": filter_pattern,
            "severity_distribution": dict(level_counts),
            "service_distribution": dict(service_counts.most_common()),
            "error_type_distribution": dict(error_types.most_common()),
            "error_bursts": bursts,
            "sample_entries": parsed_entries[:20],  # First 20 for context
            "summary": {
                "most_affected_service": service_counts.most_common(1)[0][0] if service_counts else "unknown",
                "most_common_error": error_types.most_common(1)[0][0] if error_types else "unknown",
                "has_error_burst": len(bursts) > 0,
                "error_rate_per_line": round(len(matched) / max(total_lines, 1), 4),
            },
        }
        print(f"[LogParser] Found {len(matched)} matching lines, {len(bursts)} error bursts")
        return json.dumps(result, indent=2)
class LogAnomalyDetectorTool(Tool):
    """Detect anomalous log patterns using frequency analysis.

    Lines are reduced to templates by masking their variable parts, then the
    "current" lines are compared with a baseline: templates unseen in the
    baseline, templates whose share of the traffic changed by more than 2x,
    and a change in overall log volume are reported as a JSON string.
    """

    name = "log_anomaly_detector"
    description = """Detects anomalous log patterns by analyzing log frequency, new/rare log templates,
    and sudden changes in log volume.
    Use this to find:
    - Sudden spikes in error logs
    - New error messages that haven't appeared before
    - Unusual log volume patterns (too many or too few logs)
    - Log template drift (new types of messages appearing)
    Complements the log_parser tool — use log_parser for filtering, this tool for pattern-level anomaly detection.
    """
    inputs = {
        "log_content": {
            "type": "string",
            "description": "Raw log text (multi-line) or 'auto' for simulated data.",
        },
        "baseline_content": {
            "type": "string",
            "description": "Optional: baseline/normal log content for comparison. If not provided, uses first half of data as baseline.",
            "nullable": True,
        },
    }
    output_type = "string"

    def _extract_template(self, line: str) -> str:
        """Extract a log template by replacing variable parts with placeholders.

        Substitution order matters: the more specific shapes (timestamps,
        UUIDs, request ids, IPv4 addresses) are masked before the generic
        number rule, otherwise the number rule would consume their digits
        and the specific placeholders could never appear.

        Args:
            line: A single raw log line.

        Returns:
            The line with ``<TS>``, ``<UUID>``, ``<REQ_ID>``, ``<IP>`` and
            ``<NUM>`` in place of the variable parts, stripped of
            surrounding whitespace.
        """
        # Timestamps; fractional seconds and a zone suffix are optional.
        result = re.sub(
            r'\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(?:[.,]\d+)?(?:Z|[+-]\d{2}:?\d{2})?',
            '<TS>',
            line,
        )
        # UUIDs (either letter case) and request IDs
        result = re.sub(
            r'[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}',
            '<UUID>',
            result,
            flags=re.IGNORECASE,
        )
        result = re.sub(r'req-\d+', '<REQ_ID>', result)
        # IPv4 addresses, before the generic number rule can split them up
        result = re.sub(r'\b\d{1,3}(?:\.\d{1,3}){3}\b', '<IP>', result)
        # Numbers. Only a leading word boundary is required so that values
        # with a unit suffix ("123ms", "4096MB") are masked too; otherwise
        # every distinct latency would look like a brand-new template.
        result = re.sub(r'\b\d+(?:\.\d+)?', '<NUM>', result)
        return result.strip()

    def _generate_anomalous_sample(self) -> str:
        """Generate 200 simulated lines: a normal half hour, then an incident.

        Minutes 0-29 contain 100 normal lines; minutes 30-59 contain 60
        normal lines plus 40 payment-service failures. Lines are returned
        sorted, so splitting the result in half separates the two periods.
        """
        import random

        normal_templates = [
            "2024-01-15T10:{:02d}:{:02d}.000Z [INFO] [api-gateway] [req-{}] Request processed in {}ms",
            "2024-01-15T10:{:02d}:{:02d}.000Z [INFO] [user-service] [req-{}] Cache hit for user {}",
            "2024-01-15T10:{:02d}:{:02d}.000Z [INFO] [order-service] [req-{}] Order {} created",
            "2024-01-15T10:{:02d}:{:02d}.000Z [WARN] [api-gateway] [req-{}] Slow response: {}ms",
        ]
        anomaly_templates = [
            "2024-01-15T10:{:02d}:{:02d}.000Z [ERROR] [payment-service] [req-{}] CRITICAL: Database connection pool exhausted",
            "2024-01-15T10:{:02d}:{:02d}.000Z [ERROR] [payment-service] [req-{}] Transaction deadlock detected on table payments",
            "2024-01-15T10:{:02d}:{:02d}.000Z [CRITICAL] [payment-service] [req-{}] Cascading failure: all replicas unhealthy",
        ]

        def render(template: str, first_minute: int, last_minute: int, with_value: bool) -> str:
            """Fill one template with a random time, request id and value."""
            args = [
                random.randint(first_minute, last_minute),
                random.randint(0, 59),
                random.randint(10000, 99999),
            ]
            if with_value:
                args.append(random.randint(1, 500))
            return template.format(*args)

        lines = []
        # Normal baseline (first 30 min)
        lines.extend(render(random.choice(normal_templates), 0, 29, True) for _ in range(100))
        # Anomalous period (last 30 min)
        lines.extend(render(random.choice(normal_templates), 30, 59, True) for _ in range(60))
        lines.extend(render(random.choice(anomaly_templates), 30, 59, False) for _ in range(40))
        lines.sort()
        return "\n".join(lines)

    def _span_minutes(self, lines) -> Optional[float]:
        """Return the minutes between the earliest and latest timestamp.

        Returns ``None`` when fewer than two lines carry a parseable
        timestamp. The result is floored at one minute so that very short
        spans do not produce extreme per-minute rates.
        """
        pattern = re.compile(r'\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}')
        stamps = []
        for line in lines:
            found = pattern.search(line)
            if not found:
                continue
            try:
                stamps.append(datetime.fromisoformat(found.group(0)))
            except ValueError:
                continue
        if len(stamps) < 2:
            return None
        return max((max(stamps) - min(stamps)).total_seconds() / 60.0, 1.0)

    def forward(self, log_content: str, baseline_content: str = "") -> str:
        """Compare current log lines with a baseline and report anomalies.

        Args:
            log_content: Raw multi-line log text, or ``auto`` (any case) for
                generated sample data.
            baseline_content: Optional log text representing normal
                behavior. When empty or ``None``, the first half of
                ``log_content`` is the baseline and the second half is the
                current period.

        Returns:
            A JSON string with template counts, new templates, frequency
            changes, a volume analysis, a verdict and any warnings about
            insufficient data.
        """
        # Imported locally, matching the file's function-scope import style.
        from collections import Counter

        log_content = log_content or ""
        if log_content.strip().lower() == "auto":
            log_content = self._generate_anomalous_sample()

        # Blank lines carry no information and would skew the counts.
        lines = [text for text in log_content.splitlines() if text.strip()]
        supplied_baseline = [
            text for text in (baseline_content or "").splitlines() if text.strip()
        ]

        # Split into baseline and current if no baseline provided
        if supplied_baseline:
            baseline_lines = supplied_baseline
            current_lines = lines
        else:
            mid = len(lines) // 2
            baseline_lines = lines[:mid]
            current_lines = lines[mid:]
        print(f"[LogAnomalyDetector] Analyzing {len(current_lines)} current lines against {len(baseline_lines)} baseline lines")

        warnings = []
        if not baseline_lines:
            warnings.append("No baseline lines available; every template will appear new.")
        if not current_lines:
            warnings.append("No current lines to analyze.")

        # Extract templates
        baseline_templates = Counter(self._extract_template(text) for text in baseline_lines)
        current_templates = Counter(self._extract_template(text) for text in current_lines)

        # Find new templates (not in baseline)
        new_templates = {
            template: count
            for template, count in current_templates.items()
            if template not in baseline_templates
        }

        # Find templates whose share of the traffic changed significantly.
        # Shares (count / total lines) are compared rather than raw counts,
        # so a baseline of a different size does not flag every template.
        ranked_changes = []
        for template, current_count in current_templates.items():
            baseline_count = baseline_templates.get(template, 0)
            if baseline_count == 0:
                continue
            current_share = current_count / len(current_lines)
            baseline_share = baseline_count / len(baseline_lines)
            change_ratio = current_share / baseline_share
            if change_ratio > 2.0 or change_ratio < 0.5:
                # Rank by size of change in either direction: 0.1x is as
                # notable as 10x.
                magnitude = max(change_ratio, 1.0 / change_ratio)
                ranked_changes.append((magnitude, {
                    "template": template[:150],
                    "baseline_count": baseline_count,
                    "current_count": current_count,
                    "change_ratio": round(change_ratio, 2),
                    "direction": "increase" if change_ratio > 1 else "decrease",
                }))
        ranked_changes.sort(key=lambda pair: pair[0], reverse=True)
        frequency_changes = [change for _, change in ranked_changes]

        # Volume analysis: use the time spans found in the logs when both
        # sides have them, otherwise assume 30-minute windows.
        baseline_span = self._span_minutes(baseline_lines)
        current_span = self._span_minutes(current_lines)
        if baseline_span is None or current_span is None:
            baseline_span = current_span = 30.0
            window_source = "assumed_30_min"
        else:
            window_source = "timestamps"
        baseline_volume_per_min = len(baseline_lines) / baseline_span
        current_volume_per_min = len(current_lines) / current_span
        if baseline_lines and current_lines:
            volume_change = current_volume_per_min / max(baseline_volume_per_min, 0.01)
        else:
            # Nothing to compare against; report a neutral ratio rather
            # than a spurious anomaly.
            volume_change = 1.0
        anomalous_volume = volume_change > 2.0 or volume_change < 0.5

        top_new = sorted(new_templates.items(), key=lambda item: item[1], reverse=True)[:10]
        if len(new_templates) > 5 or volume_change > 5:
            severity = "critical"
        elif new_templates or frequency_changes:
            severity = "warning"
        else:
            severity = "ok"

        result = {
            "baseline_lines": len(baseline_lines),
            "current_lines": len(current_lines),
            "baseline_unique_templates": len(baseline_templates),
            "current_unique_templates": len(current_templates),
            "new_templates": {
                "count": len(new_templates),
                "templates": [{"template": template[:150], "count": count} for template, count in top_new],
            },
            "frequency_changes": frequency_changes[:10],
            "volume_analysis": {
                "baseline_volume_per_min": round(baseline_volume_per_min, 2),
                "current_volume_per_min": round(current_volume_per_min, 2),
                "volume_change_ratio": round(volume_change, 2),
                "anomalous_volume": anomalous_volume,
                "window_source": window_source,
            },
            "verdict": {
                "has_new_error_patterns": len(new_templates) > 0,
                "has_frequency_anomalies": len(frequency_changes) > 0,
                "has_volume_anomaly": anomalous_volume,
                "severity": severity,
            },
            "warnings": warnings,
        }
        print(f"[LogAnomalyDetector] Found {len(new_templates)} new templates, {len(frequency_changes)} frequency changes")
        return json.dumps(result, indent=2)
class LogPatternExtractorTool(Tool):
    """Extract common patterns and keywords from logs for RCA.

    Counts HTTP-status-like numbers, exception/error phrases, bracketed
    service names and the most frequent words on error lines, and returns
    the top entries of each as a JSON string together with short insights.
    """

    name = "log_pattern_extractor"
    description = """Extracts common patterns, error codes, service names, and key phrases from log data.
    Use this to:
    - Identify the most frequent error messages
    - Extract HTTP status codes, error codes, exception types
    - Find common service/component names in error logs
    - Build a keyword summary for root cause investigation
    Good as a preprocessing step before feeding results to the RCA correlator.
    """
    inputs = {
        "log_content": {
            "type": "string",
            "description": "Raw log text (multi-line) or 'auto' for simulated data.",
        },
        "top_k": {
            "type": "integer",
            "description": "Number of top patterns to return. Default: 10.",
            "nullable": True,
        },
    }
    output_type = "string"

    def forward(self, log_content: str, top_k: int = 10) -> str:
        """Summarize the dominant patterns in a block of log text.

        Args:
            log_content: Raw multi-line log text, or ``auto`` (any case) to
                analyze sample data generated by ``LogParserTool``.
            top_k: Maximum number of entries per category (keywords get
                twice as many). ``None`` falls back to 10; negative values
                are treated as 0.

        Returns:
            A JSON string with line counts, the top status codes, exception
            phrases, services, error keywords, and a list of insights.
        """
        # Imported locally, matching the file's function-scope import style.
        from collections import Counter

        log_content = log_content or ""
        # The input is declared nullable, so None must not reach the slice
        # arithmetic below; int() also tolerates values such as 5.0.
        top_k = 10 if top_k is None else max(int(top_k), 0)

        if log_content.strip().lower() == "auto":
            log_content = LogParserTool()._generate_sample_logs()

        lines = [text for text in log_content.splitlines() if text.strip()]
        print(f"[LogPatternExtractor] Extracting patterns from {len(lines)} lines")

        # Extract HTTP status codes (any standalone 2xx-5xx number)
        status_codes = Counter(re.findall(r'\b[2345]\d{2}\b', log_content))

        # Extract exception/error types: the phrase following an error word
        exceptions = Counter()
        for fragment in re.findall(
            r'(?:Exception|Error|Failure|Fault|Timeout|OOM|Deadlock|CRITICAL)\b[:\s]*([\w\s]+?)(?:\.|,|\n|$)',
            log_content,
            re.IGNORECASE,
        ):
            label = fragment.strip()[:50]
            if label:
                exceptions[label] += 1

        # Extract service/component names (bracketed, with a known suffix)
        service_pattern = re.compile(
            r'\[([a-zA-Z][\w-]+(?:-service|-api|-worker|-gateway|-proxy))\]'
        )
        services = Counter(service_pattern.findall(log_content))

        # Extract keywords (single words, 3+ letters) from error lines
        severity_pattern = re.compile(r'ERROR|CRITICAL|FATAL', re.IGNORECASE)
        error_lines = [text for text in lines if severity_pattern.search(text)]
        stopwords = {
            'the', 'and', 'for', 'from', 'with', 'this', 'that', 'was', 'are',
            'not', 'but', 'has', 'had', 'have', 'been', 'info', 'error', 'warn',
            'critical', 'fatal', 'debug',
        }
        word_freq = Counter(
            word
            for text in error_lines
            for word in re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
            if word not in stopwords
        )
        # Services counted on error lines only, so that the "most affected
        # service" insight reflects errors rather than overall traffic.
        error_services = Counter(
            svc for text in error_lines for svc in service_pattern.findall(text)
        )

        result = {
            "total_lines": len(lines),
            "error_lines": len(error_lines),
            "status_codes": dict(status_codes.most_common(top_k)),
            "exception_types": dict(exceptions.most_common(top_k)),
            "services_mentioned": dict(services.most_common(top_k)),
            "services_in_error_lines": dict(error_services.most_common(top_k)),
            "top_error_keywords": dict(word_freq.most_common(top_k * 2)),
            "key_insights": [],
        }

        # Generate insights
        if status_codes:
            top_code, code_hits = status_codes.most_common(1)[0]
            result["key_insights"].append(f"Most common HTTP status: {top_code} ({code_hits} occurrences)")
        if exceptions:
            top_exc, exc_hits = exceptions.most_common(1)[0]
            result["key_insights"].append(f"Most common error type: {top_exc} ({exc_hits} occurrences)")
        if error_services:
            top_svc, svc_hits = error_services.most_common(1)[0]
            result["key_insights"].append(f"Most affected service: {top_svc} ({svc_hits} mentions in error logs)")

        print(f"[LogPatternExtractor] Extracted {len(status_codes)} status codes, {len(exceptions)} exception types, {len(services)} services")
        return json.dumps(result, indent=2)