File size: 8,499 Bytes
e270f30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fbb0927
 
 
 
e270f30
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
from __future__ import annotations
from typing import Literal, Optional, ClassVar
from pydantic import BaseModel, Field


# ─── LOG LINE ────────────────────────────────────────────────────────────────

class LogLine(BaseModel):
    """A single log line from the simulated microservice cluster."""

    # Timestamp is carried as a plain string; no format check is declared here.
    timestamp: str = Field(
        ...,
        description="ISO 8601 timestamp",
    )

    # Severity is restricted to exactly these five labels.
    level: Literal["DEBUG", "INFO", "WARN", "ERROR", "FATAL"]

    service: str = Field(
        ...,
        description="Service that emitted the log",
    )

    # Optional: defaults to None when the line carries no trace ID.
    request_id: Optional[str] = Field(
        default=None,
        description="Request trace ID if present",
    )

    message: str = Field(
        ...,
        description="Log message content",
    )

    # Optional: defaults to None when no latency is attached to the line.
    latency_ms: Optional[int] = Field(
        default=None,
        description="Latency if relevant",
    )


# ─── SERVICE STATUS ──────────────────────────────────────────────────────────

class ServiceStatus(BaseModel):
    """Current health snapshot of one microservice."""

    name: str

    # Health is one of three coarse states.
    status: Literal["up", "degraded", "down"]

    # Bounded to the closed interval [0.0, 1.0] by the ge/le constraints.
    error_rate: float = Field(
        ...,
        ge=0.0,
        le=1.0,
        description="Error rate 0.0-1.0",
    )

    # No lower/upper bound is declared for the latency value.
    latency_p99_ms: int = Field(
        ...,
        description="99th percentile latency in ms",
    )

    # Carried as a plain string; no format check is declared here.
    last_updated: str = Field(
        ...,
        description="ISO 8601 timestamp of last update",
    )


# ─── ACTION ──────────────────────────────────────────────────────────────────

class TriageAction(BaseModel):
    """
    Action taken by the agent in one step.

    action_type options:
      - classify_severity  : value must be "P1", "P2", or "P3"
      - identify_root_cause: value must be a valid service name
      - escalate           : value must be a valid team name
      - remediate          : value must be "restart:<svc>", "rollback:<svc>",
                             "scale:<svc>", "flush-cache:<svc>", "kill-query:<svc>"
      - request_more_logs  : value must be a service name or "all"
      - resolve            : value must be "resolved"
      - ignore             : value must be "noise"
    """
    action_type: Literal[
        "classify_severity",
        "identify_root_cause",
        "escalate",
        "remediate",
        "request_more_logs",
        "resolve",
        "ignore",
    ] = Field(..., description="Type of triage action to perform")

    # Free-form at the schema level; the per-action rules are checked by
    # is_valid(), not by a field constraint.
    value: str = Field(
        ...,
        description="Action value — depends on action_type (see docstring)"
    )

    # Constrained to [0.0, 1.0]; defaults to full confidence.
    confidence: float = Field(
        default=1.0,
        ge=0.0,
        le=1.0,
        description="Agent self-reported confidence in this action (0.0-1.0)"
    )

    reasoning: str = Field(
        default="",
        description="Optional free-text reasoning (used for interpretability)"
    )

    # ── Valid value constants ────────────────────────────────────────────────
    # Class-level lookup sets consulted by is_valid(); ClassVar keeps them out
    # of the model's fields.
    VALID_SEVERITIES: ClassVar[set[str]] = {"P1", "P2", "P3"}
    VALID_SERVICES: ClassVar[set[str]] = {
        "api-gateway",
        "auth-service",
        "user-db",
        "payment-service",
        "payment-db",
        "notification-service",
        "email-queue",
    }
    VALID_TEAMS: ClassVar[set[str]] = {
        "sre-team",
        "backend-team",
        "dba-team",
        "security-team",
    }
    VALID_REMEDIATION_PREFIXES: ClassVar[set[str]] = {
        "restart",
        "rollback",
        "scale",
        "flush-cache",
        "kill-query",
    }

    def is_valid(self) -> tuple[bool, str]:
        """
        Validate ``value`` against the rules for ``action_type``.

        Returns:
            ``(True, "")`` when the value is acceptable, otherwise
            ``(False, message)`` where ``message`` names the violated rule.
            Lists of allowed values are rendered in sorted order so the
            message text is identical from run to run (iterating a set of
            strings directly has no guaranteed order).
        """
        kind = self.action_type
        value = self.value

        if kind == "classify_severity":
            if value not in self.VALID_SEVERITIES:
                allowed = sorted(self.VALID_SEVERITIES)
                return False, f"classify_severity value must be one of {allowed}"

        elif kind == "identify_root_cause":
            if value not in self.VALID_SERVICES:
                allowed = sorted(self.VALID_SERVICES)
                return False, f"identify_root_cause value must be one of {allowed}"

        elif kind == "escalate":
            if value not in self.VALID_TEAMS:
                allowed = sorted(self.VALID_TEAMS)
                return False, f"escalate value must be one of {allowed}"

        elif kind == "remediate":
            # Split once at the first colon: "<action>:<service>".
            verb, colon, target = value.partition(":")
            if verb not in self.VALID_REMEDIATION_PREFIXES:
                allowed = sorted(self.VALID_REMEDIATION_PREFIXES)
                return False, f"remediate prefix must be one of {allowed}"
            # A missing colon or any extra colon means the shape is wrong.
            if not colon or ":" in target:
                return False, "remediate format must be '<action>:<service>'"
            # Shape is right; report an unknown service as such rather than
            # as a format problem.
            if target not in self.VALID_SERVICES:
                allowed = sorted(self.VALID_SERVICES)
                return False, f"remediate service must be one of {allowed}"

        elif kind == "request_more_logs":
            if value != "all" and value not in self.VALID_SERVICES:
                return False, "request_more_logs value must be 'all' or a valid service name"

        elif kind == "resolve":
            if value != "resolved":
                return False, "resolve value must be 'resolved'"

        elif kind == "ignore":
            if value != "noise":
                return False, "ignore value must be 'noise'"

        return True, ""


# ─── OBSERVATION ─────────────────────────────────────────────────────────────

class TriageObservation(BaseModel):
    """
    Observation returned to the agent after each step (and after reset).
    Contains the current log batch, system state, incident metadata,
    and reward signals.
    """

    # --- Log batch for this step ---------------------------------------------
    # The "5-15 lines" in the description is informational; no length
    # constraint is declared on the list.
    logs: list[LogLine] = Field(..., description="Current batch of log lines (5-15 lines)")

    # --- System state snapshot -----------------------------------------------
    system_state: dict[str, ServiceStatus] = Field(
        ...,
        description="Per-service health snapshot keyed by service name",
    )

    # --- Incident metadata (all required except active_alerts) ---------------
    incident_id: str = Field(..., description="Unique ID for this episode")
    task_id: str = Field(..., description="Which task is being run")
    step_count: int = Field(..., description="Current step number (0-indexed)")
    time_elapsed_seconds: int = Field(
        ...,
        description="Simulated incident time elapsed in seconds",
    )
    # default_factory gives each instance its own empty list.
    active_alerts: list[str] = Field(
        default_factory=list,
        description="Currently firing alert names",
    )

    # --- Reward signals (zero / not-done unless supplied) --------------------
    reward: float = Field(default=0.0, description="Reward received for the last action")
    cumulative_score: float = Field(
        default=0.0,
        description="Running total score for this episode",
    )
    done: bool = Field(default=False, description="Whether the episode has ended")

    # --- Feedback on the previous action -------------------------------------
    last_action_feedback: str = Field(
        default="",
        description="Natural language feedback on the previous action",
    )
    # None unless an error string is supplied.
    invalid_action_error: Optional[str] = Field(
        default=None,
        description="Set if the last action was invalid (wrong format/value)",
    )


# ─── EPISODE STATE ───────────────────────────────────────────────────────────

class EpisodeState(BaseModel):
    """Internal state of the current episode (returned by state() endpoint)."""

    # Required bookkeeping fields; none of these declare a default.
    episode_id: str
    task_id: str
    step_count: int
    max_steps: int
    done: bool
    cumulative_score: float

    # Two parallel records of what the agent did; each instance starts with
    # its own empty list via default_factory.
    actions_taken: list[str] = Field(
        default_factory=list,
        description="List of action_type values taken so far this episode",
    )
    action_history: list[dict] = Field(
        default_factory=list,
        description="Full action objects taken this episode (for grader evaluation)",
    )

    # NOTE(review): both fields below are typed Optional[str], yet their
    # descriptions read like booleans ("Whether ..."). What string is stored
    # is not visible in this file - confirm against the code that sets them.
    correct_severity: Optional[str] = Field(
        default=None,
        description="Whether agent has correctly classified severity yet",
    )
    correct_root_cause: Optional[str] = Field(
        default=None,
        description="Whether agent has correctly identified root cause yet",
    )

    # Plain boolean flag, False until set; unlike its two siblings it carries
    # no Field description.
    correct_remediation: bool = False