"""Type contracts for ChaosOps AI. Design principles ----------------- * Strict Pydantic v2 models — same schemas on client and server. * Enums for every categorical field so the action space is fully discoverable at train-time (no magic strings). * Role-aware observations: each agent sees a projected slice of the world, never the ground truth. This is what makes the env partially observable and what forces Theory-of-Mind reasoning in the Oversight agent. * Observations carry only *deltas* since the last step where useful; the full ground-truth state is kept server-side in ``ChaosOpsState``. """ from __future__ import annotations from enum import Enum from typing import Any from pydantic import BaseModel, Field, field_validator # --------------------------------------------------------------------------- # Enums — the vocabulary of the environment # --------------------------------------------------------------------------- class ServiceName(str, Enum): AUTH = "auth" PAYMENTS = "payments" NOTIFICATIONS = "notifications" DB = "db" AUTOSCALER = "autoscaler" LOAD_BALANCER = "load_balancer" DEPLOY_BOT = "deploy_bot" class ServiceHealth(str, Enum): HEALTHY = "healthy" DEGRADED = "degraded" CRITICAL = "critical" DOWN = "down" class FailureType(str, Enum): """Ground-truth root cause of the incident (hidden from agents).""" DB_DEADLOCK = "db_deadlock" MEMORY_LEAK = "memory_leak" BAD_CONFIG_PUSH = "bad_config_push" AUTOSCALER_COST_CUT = "autoscaler_cost_cut" # rogue-agent MISROUTED_TRAFFIC = "misrouted_traffic" # rogue-agent CASCADE = "cascade" # second-order, combines two above DNS_OUTAGE = "dns_outage" DISK_FULL = "disk_full" ROGUE_DEPLOY_BOT = "rogue_deploy_bot" # rogue-agent @property def is_rogue_agent(self) -> bool: return self in { FailureType.AUTOSCALER_COST_CUT, FailureType.MISROUTED_TRAFFIC, FailureType.ROGUE_DEPLOY_BOT, } class AgentRole(str, Enum): SRE = "sre" DEV = "dev" MANAGER = "manager" OVERSIGHT = "oversight" class ActionType(str, Enum): """Discrete action space shared by every LLM agent.""" NOOP = "noop" COMMUNICATE = "communicate" # broadcast a message RESTART = "restart" # restart a service ROLLBACK = "rollback" # rollback a service to previous version SCALE = "scale" # scale(service, n_replicas) FLAG_ROGUE = "flag_rogue" # Oversight only: flag an in-fleet agent IDENTIFY_ROOT_CAUSE = "identify_root_cause" # declare a FailureType ESCALATE = "escalate" # ask Manager to force a decision RESOLVE = "resolve" # Manager only: declare incident resolved class DifficultyTier(str, Enum): EASY = "easy" MEDIUM = "medium" HARD = "hard" # --------------------------------------------------------------------------- # Core data classes # --------------------------------------------------------------------------- class ServiceMetrics(BaseModel): """Per-service telemetry point. Bounded for reward normalization.""" cpu_pct: float = Field(ge=0.0, le=100.0) memory_mb: float = Field(ge=0.0) latency_ms: float = Field(ge=0.0) error_rate: float = Field(ge=0.0, le=1.0) replicas: int = Field(ge=0) health: ServiceHealth = ServiceHealth.HEALTHY class LogLine(BaseModel): timestamp: float service: ServiceName level: str # INFO, WARN, ERROR, FATAL message: str # Honest flag — used internally to grade the Oversight agent. # Agents NEVER see this; the env strips it before sending. is_red_herring: bool = False class Alert(BaseModel): service: ServiceName severity: str # page, warn, info message: str triggered_at_step: int class FleetAgentLog(BaseModel): """Record of an action taken by an in-fleet (non-LLM) agent. These are the "employees" the Oversight agent may need to flag as rogue. """ agent_name: str # e.g., "autoscaler", "load_balancer" action: str target: str step: int was_malicious: bool = False # ground truth, stripped from agent view # --------------------------------------------------------------------------- # Action — exactly one per agent turn # --------------------------------------------------------------------------- class ChaosOpsAction(BaseModel): """A single action taken by an LLM agent. ``args`` carries action-specific parameters (target service, message body, proposed root cause, etc.). Keeping everything in one payload lets TRL train on a single stringified action per step. """ role: AgentRole action_type: ActionType target: str | None = None args: dict[str, Any] = Field(default_factory=dict) @field_validator("args") @classmethod def _no_none_args(cls, v: dict[str, Any]) -> dict[str, Any]: return {k: val for k, val in v.items() if val is not None} # --------------------------------------------------------------------------- # Observation — role-projected view of the world # --------------------------------------------------------------------------- class RoleView(BaseModel): """What a single agent sees on its turn.""" role: AgentRole visible_metrics: dict[str, ServiceMetrics] visible_logs: list[LogLine] visible_alerts: list[Alert] visible_fleet_actions: list[FleetAgentLog] shared_chat: list[str] private_note: str | None = None # only Oversight receives suspicion hints # Messages addressed privately to this role (e.g., SRE <-> DEV backchannel). # Populated by :func:`chaosops.env.projections.project_view` and never # leaks into ``shared_chat``. private_inbox: list[str] = Field(default_factory=list) class ChaosOpsObservation(BaseModel): done: bool = False reward: float | None = None view: RoleView step: int turn_role: AgentRole message: str = "" # --------------------------------------------------------------------------- # Full ground-truth state (server-side only) # --------------------------------------------------------------------------- class ChaosOpsState(BaseModel): """Complete ground-truth environment state. Agents never receive this wholesale; ``RoleView`` is derived per turn. """ episode_id: str | None = None step_count: int = 0 max_steps: int = 20 difficulty: DifficultyTier = DifficultyTier.EASY failure_type: FailureType = FailureType.DB_DEADLOCK rogue_fleet_agent: str | None = None # name of the misbehaving fleet agent, if any misleading_log_count: int = 0 services: dict[str, ServiceMetrics] = Field(default_factory=dict) all_logs: list[LogLine] = Field(default_factory=list) all_alerts: list[Alert] = Field(default_factory=list) fleet_actions: list[FleetAgentLog] = Field(default_factory=list) chat_history: list[str] = Field(default_factory=list) # Per-role private inboxes (keyed by AgentRole.value). Populated when a # communicate action carries a ``to=`` argument. Kept server-side # so ``project_view`` can hand each role only its own slice. private_chat: dict[str, list[str]] = Field(default_factory=dict) resolved: bool = False wrong_fixes: int = 0 miscommunications: int = 0 oversight_flags: list[str] = Field(default_factory=list) # who Oversight flagged declared_root_cause: FailureType | None = None declared_root_cause_step: int | None = None cumulative_reward: float = 0.0