| """Type contracts for ChaosOps AI. |
| |
| Design principles |
| ----------------- |
| * Strict Pydantic v2 models — same schemas on client and server. |
| * Enums for every categorical field so the action space is fully discoverable |
| at train-time (no magic strings). |
| * Role-aware observations: each agent sees a projected slice of the world, |
| never the ground truth. This is what makes the env partially observable |
| and what forces Theory-of-Mind reasoning in the Oversight agent. |
| * Observations carry only *deltas* since the last step where useful; the |
| full ground-truth state is kept server-side in ``ChaosOpsState``. |
| """ |
|
|
| from __future__ import annotations |
|
|
| from enum import Enum |
| from typing import Any |
|
|
| from pydantic import BaseModel, Field, field_validator |
|
|
|
|
| |
| |
| |
|
|
|
|
class ServiceName(str, Enum):
    """Closed set of service identifiers.

    The ``str`` mixin makes every member a real ``str`` equal to its value,
    so ``ServiceName.DB == "db"`` holds and lookups by value
    (``ServiceName("db")``) return the canonical member.
    """

    AUTH = "auth"
    PAYMENTS = "payments"
    NOTIFICATIONS = "notifications"
    DB = "db"
    AUTOSCALER = "autoscaler"
    LOAD_BALANCER = "load_balancer"
    DEPLOY_BOT = "deploy_bot"
|
|
|
|
class ServiceHealth(str, Enum):
    """Coarse health status of a service.

    Members are declared from best (``HEALTHY``) to worst (``DOWN``);
    iteration follows that declaration order.
    """

    HEALTHY = "healthy"
    DEGRADED = "degraded"
    CRITICAL = "critical"
    DOWN = "down"
|
|
|
|
class FailureType(str, Enum):
    """Ground-truth root cause of the incident (hidden from agents)."""

    DB_DEADLOCK = "db_deadlock"
    MEMORY_LEAK = "memory_leak"
    BAD_CONFIG_PUSH = "bad_config_push"
    AUTOSCALER_COST_CUT = "autoscaler_cost_cut"
    MISROUTED_TRAFFIC = "misrouted_traffic"
    CASCADE = "cascade"
    DNS_OUTAGE = "dns_outage"
    DISK_FULL = "disk_full"
    ROGUE_DEPLOY_BOT = "rogue_deploy_bot"

    @property
    def is_rogue_agent(self) -> bool:
        """Return ``True`` for the three rogue-agent failure types.

        Those are ``AUTOSCALER_COST_CUT``, ``MISROUTED_TRAFFIC`` and
        ``ROGUE_DEPLOY_BOT``; every other member yields ``False``.
        """
        rogue_types = (
            FailureType.AUTOSCALER_COST_CUT,
            FailureType.MISROUTED_TRAFFIC,
            FailureType.ROGUE_DEPLOY_BOT,
        )
        return self in rogue_types
|
|
|
|
class AgentRole(str, Enum):
    """Roles an agent can hold.

    Members are ``str`` instances equal to their lowercase values.
    """

    SRE = "sre"
    DEV = "dev"
    MANAGER = "manager"
    OVERSIGHT = "oversight"
|
|
|
|
class ActionType(str, Enum):
    """Discrete action space shared by every LLM agent."""

    NOOP = "noop"
    COMMUNICATE = "communicate"
    RESTART = "restart"
    ROLLBACK = "rollback"
    SCALE = "scale"
    FLAG_ROGUE = "flag_rogue"
    IDENTIFY_ROOT_CAUSE = "identify_root_cause"
    ESCALATE = "escalate"
    RESOLVE = "resolve"
|
|
|
|
class DifficultyTier(str, Enum):
    """Difficulty tier of an episode, declared from easiest to hardest."""

    EASY = "easy"
    MEDIUM = "medium"
    HARD = "hard"
|
|
|
|
| |
| |
| |
|
|
|
|
class ServiceMetrics(BaseModel):
    """Per-service telemetry point. Bounded for reward normalization.

    Validation rejects negative values on every numeric field; ``cpu_pct``
    and ``error_rate`` are additionally capped from above.
    """

    # CPU utilisation as a percentage, constrained to [0, 100].
    cpu_pct: float = Field(ge=0.0, le=100.0)
    # Memory usage (megabytes per the field name); non-negative, no upper cap.
    memory_mb: float = Field(ge=0.0)
    # Latency (milliseconds per the field name); non-negative, no upper cap.
    latency_ms: float = Field(ge=0.0)
    # Error rate as a fraction, constrained to [0, 1].
    error_rate: float = Field(ge=0.0, le=1.0)
    # Replica count; zero is allowed.
    replicas: int = Field(ge=0)
    # Health label; defaults to HEALTHY when not supplied.
    health: ServiceHealth = ServiceHealth.HEALTHY
|
|
|
|
class LogLine(BaseModel):
    """A single log record attributed to one service."""

    # Numeric timestamp; NOTE(review): unit/epoch is not fixed here — confirm
    # against the code that produces log lines.
    timestamp: float
    service: ServiceName
    # Free-form level string (not an enum in this schema).
    level: str
    message: str

    # Defaults to False. Presumably marks a deliberately misleading line —
    # TODO confirm how/whether it is hidden from agent-facing views.
    is_red_herring: bool = False
|
|
|
|
class Alert(BaseModel):
    """An alert raised against a service at a given step."""

    service: ServiceName
    # Free-form severity string (not an enum in this schema).
    severity: str
    message: str
    # Step index at which the alert fired.
    triggered_at_step: int
|
|
|
|
class FleetAgentLog(BaseModel):
    """Record of an action taken by an in-fleet (non-LLM) agent.

    These are the "employees" the Oversight agent may need to flag as rogue.
    """

    # Name of the fleet agent that acted.
    agent_name: str
    # What it did and to what; both are free-form strings in this schema.
    action: str
    target: str
    # Step index at which the action happened.
    step: int
    # Defaults to False. NOTE(review): looks like a ground-truth label —
    # confirm whether it is masked before the record is shown to agents.
    was_malicious: bool = False
|
|
|
|
| |
| |
| |
|
|
|
|
class ChaosOpsAction(BaseModel):
    """A single action taken by an LLM agent.

    ``args`` carries action-specific parameters (target service, message body,
    proposed root cause, etc.). Keeping everything in one payload lets TRL
    train on a single stringified action per step.
    """

    # Which agent role is acting and what kind of action it takes.
    role: AgentRole
    action_type: ActionType
    # Optional target as a plain string; None when not supplied.
    target: str | None = None
    # Action-specific parameters; a fresh empty dict per instance by default.
    args: dict[str, Any] = Field(default_factory=dict)

    @field_validator("args")
    @classmethod
    def _no_none_args(cls, v: dict[str, Any]) -> dict[str, Any]:
        """Return ``v`` without the entries whose value is ``None``.

        Only ``None`` is removed; other falsy values (``0``, ``""``, ``[]``)
        are kept.
        """
        return {key: value for key, value in v.items() if value is not None}
|
|
|
|
| |
| |
| |
|
|
|
|
class RoleView(BaseModel):
    """What a single agent sees on its turn."""

    # Role this view was built for.
    role: AgentRole
    # Metrics keyed by a plain string; presumably the service name — verify
    # against the code that builds the view.
    visible_metrics: dict[str, ServiceMetrics]
    visible_logs: list[LogLine]
    visible_alerts: list[Alert]
    visible_fleet_actions: list[FleetAgentLog]
    # Chat messages as plain strings.
    shared_chat: list[str]
    # Optional note; None when absent.
    private_note: str | None = None

    # Messages for this view only; empty list by default.
    # NOTE(review): presumably fed from ChaosOpsState.private_chat — confirm.
    private_inbox: list[str] = Field(default_factory=list)
|
|
|
|
class ChaosOpsObservation(BaseModel):
    """Observation envelope: a ``RoleView`` plus step-level metadata."""

    # Episode-finished flag; False unless set.
    done: bool = False
    # Reward value; None when no reward is attached.
    reward: float | None = None
    # The role-specific projection carried by this observation.
    view: RoleView
    # Step index this observation belongs to.
    step: int
    # NOTE(review): presumably the role whose turn it is — confirm with the
    # environment loop.
    turn_role: AgentRole
    # Free-form message; empty string by default.
    message: str = ""
|
|
|
|
| |
| |
| |
|
|
|
|
class ChaosOpsState(BaseModel):
    """Complete ground-truth environment state.

    Agents never receive this wholesale; ``RoleView`` is derived per turn.
    Every field has a default, so ``ChaosOpsState()`` builds an empty state.
    """

    # --- Episode bookkeeping -------------------------------------------
    episode_id: str | None = None
    step_count: int = 0
    max_steps: int = 20

    # --- Scenario configuration ----------------------------------------
    difficulty: DifficultyTier = DifficultyTier.EASY
    failure_type: FailureType = FailureType.DB_DEADLOCK
    # Name of the rogue fleet agent, if any; None otherwise.
    rogue_fleet_agent: str | None = None
    misleading_log_count: int = 0

    # --- World state ----------------------------------------------------
    # Each container defaults to a fresh, empty instance per state object.
    # ``services`` is keyed by a plain string; presumably the service name —
    # verify against the code that populates it.
    services: dict[str, ServiceMetrics] = Field(default_factory=dict)
    all_logs: list[LogLine] = Field(default_factory=list)
    all_alerts: list[Alert] = Field(default_factory=list)
    fleet_actions: list[FleetAgentLog] = Field(default_factory=list)
    chat_history: list[str] = Field(default_factory=list)

    # String-keyed message lists. NOTE(review): presumably keyed by recipient
    # role and surfaced through RoleView.private_inbox — confirm.
    private_chat: dict[str, list[str]] = Field(default_factory=dict)

    # --- Outcome tracking -----------------------------------------------
    resolved: bool = False
    wrong_fixes: int = 0
    miscommunications: int = 0
    oversight_flags: list[str] = Field(default_factory=list)
    # Declared root cause and the step of that declaration; None until set.
    declared_root_cause: FailureType | None = None
    declared_root_cause_step: int | None = None
    cumulative_reward: float = 0.0
|
|