Spaces:
Running
Running
File size: 8,499 Bytes
from __future__ import annotations
from typing import Literal, Optional, ClassVar
from pydantic import BaseModel, Field
# ─── LOG LINE ────────────────────────────────────────────────────────────────
class LogLine(BaseModel):
    """A single log line from the simulated microservice cluster."""

    # Required fields: timestamp, level, service, message.
    # Optional fields (default None): request_id, latency_ms.
    timestamp: str = Field(..., description="ISO 8601 timestamp")
    # Restricted to the five level names listed in the Literal.
    level: Literal["DEBUG", "INFO", "WARN", "ERROR", "FATAL"]
    service: str = Field(..., description="Service that emitted the log")
    request_id: Optional[str] = Field(
        default=None,
        description="Request trace ID if present",
    )
    message: str = Field(..., description="Log message content")
    latency_ms: Optional[int] = Field(
        default=None,
        description="Latency if relevant",
    )
# ─── SERVICE STATUS ──────────────────────────────────────────────────────────
class ServiceStatus(BaseModel):
    """Current health snapshot of one microservice."""

    # Every field is required; none declares a default.
    name: str
    # One of the three states listed in the Literal.
    status: Literal["up", "degraded", "down"]
    # ge/le bound the value to the closed interval [0.0, 1.0].
    error_rate: float = Field(
        ...,
        ge=0.0,
        le=1.0,
        description="Error rate 0.0-1.0",
    )
    latency_p99_ms: int = Field(
        ...,
        description="99th percentile latency in ms",
    )
    last_updated: str = Field(
        ...,
        description="ISO 8601 timestamp of last update",
    )
# ─── ACTION ──────────────────────────────────────────────────────────────────
class TriageAction(BaseModel):
    """
    Action taken by the agent in one step.

    action_type options:
    - classify_severity  : value must be "P1", "P2", or "P3"
    - identify_root_cause: value must be a valid service name
    - escalate           : value must be a valid team name
    - remediate          : value must be "restart:<svc>", "rollback:<svc>",
                           "scale:<svc>", "flush-cache:<svc>", "kill-query:<svc>"
    - request_more_logs  : value must be a service name or "all"
    - resolve            : value must be "resolved"
    - ignore             : value must be "noise"
    """

    # NOTE: no validator in this class ties `value` to `action_type`;
    # the per-type rules above are only checked when is_valid() is called.
    action_type: Literal[
        "classify_severity",
        "identify_root_cause",
        "escalate",
        "remediate",
        "request_more_logs",
        "resolve",
        "ignore",
    ] = Field(..., description="Type of triage action to perform")
    value: str = Field(
        ...,
        description="Action value — depends on action_type (see docstring)",
    )
    confidence: float = Field(
        default=1.0,
        ge=0.0,
        le=1.0,
        description="Agent self-reported confidence in this action (0.0-1.0)",
    )
    reasoning: str = Field(
        default="",
        description="Optional free-text reasoning (used for interpretability)",
    )

    # ── Valid value constants ────────────────────────────────────────────────
    # Class-level lookup tables shared by all instances. ClassVar keeps them
    # out of the model's fields.
    VALID_SEVERITIES: ClassVar[set[str]] = {"P1", "P2", "P3"}
    VALID_SERVICES: ClassVar[set[str]] = {
        "api-gateway",
        "auth-service",
        "user-db",
        "payment-service",
        "payment-db",
        "notification-service",
        "email-queue",
    }
    VALID_TEAMS: ClassVar[set[str]] = {
        "sre-team",
        "backend-team",
        "dba-team",
        "security-team",
    }
    VALID_REMEDIATION_PREFIXES: ClassVar[set[str]] = {
        "restart",
        "rollback",
        "scale",
        "flush-cache",
        "kill-query",
    }

    def is_valid(self) -> tuple[bool, str]:
        """
        Validate the action value against its action_type.

        Returns (is_valid: bool, error_message: str). The message is the
        empty string when the value is acceptable. Allowed-value lists
        embedded in a message are sorted, so the text is the same on every
        run (iterating a set of strings directly is not order-stable).
        """
        kind = self.action_type
        value = self.value

        if kind == "classify_severity":
            if value not in self.VALID_SEVERITIES:
                allowed = sorted(self.VALID_SEVERITIES)
                return False, f"classify_severity value must be one of {allowed}"
        elif kind == "identify_root_cause":
            if value not in self.VALID_SERVICES:
                allowed = sorted(self.VALID_SERVICES)
                return False, f"identify_root_cause value must be one of {allowed}"
        elif kind == "escalate":
            if value not in self.VALID_TEAMS:
                allowed = sorted(self.VALID_TEAMS)
                return False, f"escalate value must be one of {allowed}"
        elif kind == "remediate":
            # Split once; the prefix is checked first so an unknown verb is
            # reported before any complaint about the overall shape.
            parts = value.split(":")
            if parts[0] not in self.VALID_REMEDIATION_PREFIXES:
                allowed = sorted(self.VALID_REMEDIATION_PREFIXES)
                return False, f"remediate prefix must be one of {allowed}"
            # Exactly one ":" is required, followed by a known service name.
            if len(parts) != 2 or parts[1] not in self.VALID_SERVICES:
                return False, "remediate format must be '<action>:<service>'"
        elif kind == "request_more_logs":
            if value != "all" and value not in self.VALID_SERVICES:
                return False, "request_more_logs value must be 'all' or a valid service name"
        elif kind == "resolve":
            if value != "resolved":
                return False, "resolve value must be 'resolved'"
        elif kind == "ignore":
            if value != "noise":
                return False, "ignore value must be 'noise'"

        return True, ""
# ─── OBSERVATION ─────────────────────────────────────────────────────────────
class TriageObservation(BaseModel):
    """
    Observation returned to the agent after each step (and after reset).
    Contains the current log batch, system state, incident metadata,
    and reward signals.
    """

    # --- What the agent sees this step (both required) ---
    logs: list[LogLine] = Field(
        ..., description="Current batch of log lines (5-15 lines)"
    )
    system_state: dict[str, ServiceStatus] = Field(
        ..., description="Per-service health snapshot keyed by service name"
    )

    # --- Episode / incident bookkeeping ---
    incident_id: str = Field(..., description="Unique ID for this episode")
    task_id: str = Field(..., description="Which task is being run")
    step_count: int = Field(..., description="Current step number (0-indexed)")
    time_elapsed_seconds: int = Field(
        ..., description="Simulated incident time elapsed in seconds"
    )
    # default_factory gives each instance its own empty list.
    active_alerts: list[str] = Field(
        default_factory=list, description="Currently firing alert names"
    )

    # --- Scoring signals (all default to a neutral value) ---
    reward: float = Field(
        default=0.0, description="Reward received for the last action"
    )
    cumulative_score: float = Field(
        default=0.0, description="Running total score for this episode"
    )
    done: bool = Field(
        default=False, description="Whether the episode has ended"
    )

    # --- Feedback on the previous action ---
    last_action_feedback: str = Field(
        default="",
        description="Natural language feedback on the previous action",
    )
    invalid_action_error: Optional[str] = Field(
        default=None,
        description="Set if the last action was invalid (wrong format/value)",
    )
# ─── EPISODE STATE ───────────────────────────────────────────────────────────
class EpisodeState(BaseModel):
    """Internal state of the current episode (returned by state() endpoint)."""

    # Required bookkeeping fields (no defaults).
    episode_id: str
    task_id: str
    step_count: int
    max_steps: int
    done: bool
    cumulative_score: float

    # Per-episode history; default_factory gives each instance its own list.
    actions_taken: list[str] = Field(
        default_factory=list,
        description="List of action_type values taken so far this episode",
    )
    action_history: list[dict] = Field(
        default_factory=list,
        description="Full action objects taken this episode (for grader evaluation)",
    )

    # NOTE(review): the next two fields are typed Optional[str] but described
    # as "Whether ..." — confirm whether they hold a flag or the matched value.
    correct_severity: Optional[str] = Field(
        default=None,
        description="Whether agent has correctly classified severity yet",
    )
    correct_root_cause: Optional[str] = Field(
        default=None,
        description="Whether agent has correctly identified root cause yet",
    )
    correct_remediation: bool = False
|