# chaosops/dashboard/terminal.py
# helloAK96's picture
# Initializing space
# 83136ac
"""Rich-based live dashboard for ChaosOps AI episodes.
Used during the demo to make the env tangible: the judges watch the alert
fire, the agents converse, and the Oversight panel light up in real time.
CLI
----
python -m chaosops.dashboard.terminal \
--scenario autoscaler_cost_cut \
--policy oracle \
--difficulty medium
By default the dashboard renders frame-by-frame with a short sleep between
turns; pass ``--frame-delay 0`` to run through the episode as fast as the
terminal can refresh (useful for screenshot capture).
"""
from __future__ import annotations
import argparse
import time
from dataclasses import dataclass
from rich.align import Align
from rich.console import Console, Group
from rich.layout import Layout
from rich.live import Live
from rich.panel import Panel
from rich.progress_bar import ProgressBar
from rich.table import Table
from rich.text import Text
from chaosops.agents.policies import (
Policy,
heuristic_policy,
oracle_policy,
random_policy,
)
from chaosops.agents.runner import EpisodeStep
from chaosops.env.environment import ChaosOpsEnvironment
from chaosops.env.models import (
AgentRole,
ChaosOpsAction,
DifficultyTier,
FailureType,
ServiceHealth,
)
from chaosops.env.world_sim import Scenario
# Rich style name used to color each agent role's tag in the chat panel and
# the role label in the "Last Turn" panel.
ROLE_COLORS: dict[AgentRole, str] = {
    AgentRole.SRE: "cyan",
    AgentRole.DEV: "magenta",
    AgentRole.MANAGER: "yellow",
    AgentRole.OVERSIGHT: "bright_red",
}
# (glyph, Rich style) pair shown in the "Health" column of the Services table
# for each service health level. The glyph is the same for every level; only
# the color differs.
HEALTH_STYLES: dict[ServiceHealth, tuple[str, str]] = {
    ServiceHealth.HEALTHY: ("●", "green"),
    ServiceHealth.DEGRADED: ("●", "yellow"),
    ServiceHealth.CRITICAL: ("●", "red"),
    ServiceHealth.DOWN: ("●", "bright_red"),
}
# ---------------------------------------------------------------------------
# Dashboard state
# ---------------------------------------------------------------------------
@dataclass
class DashboardFrame:
    """Inputs that ``render`` needs to draw one frame of the dashboard."""

    # Environment whose current state the panels read.
    env: ChaosOpsEnvironment
    # Most recent step taken, or None before the first turn has run.
    last_step: EpisodeStep | None
    # Running reward total displayed in the header panel.
    cumulative_reward: float
    # Turn index the frame was built for (the renderers in this file do not read it).
    turn_index: int
# ---------------------------------------------------------------------------
# Rendering
# ---------------------------------------------------------------------------
def _render_header(env: ChaosOpsEnvironment, cumulative_reward: float) -> Panel:
    """Build the banner panel: product title plus a one-line episode summary.

    The summary lists the scenario's failure type and difficulty (``n/a`` for
    both when the simulator has no scenario), the environment's step count,
    and the cumulative reward, colored green when non-negative and red
    otherwise.
    """
    scenario = env._sim._scenario  # noqa: SLF001 — dashboard inspects the sim
    if scenario is None:
        failure_label = "n/a"
        difficulty_label = "n/a"
    else:
        failure_label = scenario.failure_type.value
        difficulty_label = scenario.difficulty.value

    if cumulative_reward >= 0:
        reward_style = "bold green"
    else:
        reward_style = "bold red"

    summary_parts = [
        ("scenario: ", "dim"),
        (failure_label, "bold"),
        (" difficulty: ", "dim"),
        (difficulty_label, "bold"),
        (" step: ", "dim"),
        (str(env.state.step_count), "bold"),
        (" reward: ", "dim"),
        (f"{cumulative_reward:+.1f}", reward_style),
    ]
    banner = Text("ChaosOps AI", style="bold white on red")
    centered = Align.center(Group(banner, Text.assemble(*summary_parts)))
    return Panel(centered, border_style="red", padding=(0, 2))
def _render_services(env: ChaosOpsEnvironment) -> Panel:
    """Build the "Services" panel: one table row per entry in ``env.state.services``.

    Each row shows the service name, a colored health glyph and label taken
    from ``HEALTH_STYLES``, and the formatted CPU, memory, latency, error-rate
    and replica figures read from that service's metrics object.
    """
    column_specs = (
        ("Service", {"width": 16}),
        ("Health", {"width": 10}),
        ("CPU %", {"justify": "right"}),
        ("Mem MB", {"justify": "right"}),
        ("Latency ms", {"justify": "right"}),
        ("Err rate", {"justify": "right"}),
        ("Replicas", {"justify": "right"}),
    )
    grid = Table(expand=True, show_edge=False, pad_edge=False, header_style="bold white")
    for heading, options in column_specs:
        grid.add_column(heading, **options)

    for service_name, svc in env.state.services.items():
        marker, health_style = HEALTH_STYLES[svc.health]
        health_cell = Text(f"{marker} {svc.health.value}", style=health_style)
        grid.add_row(
            service_name,
            health_cell,
            f"{svc.cpu_pct:>5.1f}",
            f"{svc.memory_mb:>7.0f}",
            f"{svc.latency_ms:>8.0f}",
            f"{svc.error_rate:>6.2%}",
            str(svc.replicas),
        )
    return Panel(grid, title="Services", border_style="cyan")
def _render_alerts(env: ChaosOpsEnvironment) -> Panel:
    """Build the "Alerts" panel from ``env.state.all_alerts``.

    With no alerts the panel holds a dim placeholder line. Otherwise it shows
    a table of the six most recent entries (severity, service, message and
    the step at which the alert triggered), with the severity cell colored
    by level and unknown levels drawn in white.
    """
    alerts = env.state.all_alerts
    if not alerts:
        placeholder = Text("No active alerts.", style="dim")
        return Panel(placeholder, title="Alerts", border_style="red")

    severity_styles = {"page": "bold red", "warn": "yellow", "info": "blue"}

    grid = Table(expand=True, show_edge=False, pad_edge=False)
    grid.add_column("Sev")
    grid.add_column("Service")
    grid.add_column("Message", overflow="fold")
    grid.add_column("Step", justify="right")

    # Only the tail of the alert list is shown.
    for entry in alerts[-6:]:
        severity_cell = Text(
            entry.severity,
            style=severity_styles.get(entry.severity, "white"),
        )
        grid.add_row(
            severity_cell,
            entry.service.value,
            entry.message,
            str(entry.triggered_at_step),
        )
    return Panel(grid, title="Alerts", border_style="red")
def _render_chat(env: ChaosOpsEnvironment) -> Panel:
    """Build the "Incident Channel" panel from ``env.state.chat_history``.

    Shows the ten most recent messages. A message of the form
    ``[tag] body`` is split at the first ``]``; the tag is left-padded to ten
    columns and colored via ``ROLE_COLORS`` when it names an ``AgentRole``
    (white otherwise). Any other message is shown verbatim. An empty history
    yields a dim placeholder line.
    """
    history = env.state.chat_history
    if not history:
        placeholder = Text("Channel silent. Waiting for first message.", style="dim")
        return Panel(placeholder, title="Incident Channel", border_style="magenta")

    def as_line(message: str) -> Text:
        # Messages without a leading "[tag]" are passed through untouched.
        if not message.startswith("["):
            return Text(message)
        tag_part, bracket, remainder = message.partition("]")
        if not bracket:
            return Text(message)

        role_tag = tag_part[1:]
        try:
            tag_color = ROLE_COLORS.get(AgentRole(role_tag), "white")
        except ValueError:
            # Tag is not a known role value.
            tag_color = "white"
        return Text.assemble(
            (f"{role_tag:<10}", f"bold {tag_color}"),
            (" ", ""),
            (remainder.strip(), "white"),
        )

    rendered = [as_line(message) for message in history[-10:]]
    return Panel(Group(*rendered), title="Incident Channel", border_style="magenta")
def _render_oversight(env: ChaosOpsEnvironment) -> Panel:
    """Build the "Oversight Belief" panel with one suspicion bar per fleet agent.

    Two fleet agents are tracked: ``autoscaler`` and ``load_balancer``. For
    each, every entry in ``env.state.fleet_actions`` attributed to that agent
    is counted (all logged actions count; there is no filtering here). The
    suspicion score is ``0.05 + 0.25 * count`` capped at ``0.90``, and is
    pinned to ``0.95`` when the agent's name appears in
    ``env.state.oversight_flags``. The table shows the bar, the action count
    and a FLAGGED / ok status.
    """
    tracked_agents = ("autoscaler", "load_balancer")

    # Tally logged fleet actions for the tracked agents only.
    tallies: dict[str, int] = dict.fromkeys(tracked_agents, 0)
    for entry in env.state.fleet_actions:
        if entry.agent_name in tallies:
            tallies[entry.agent_name] += 1

    grid = Table(expand=True, show_edge=False, pad_edge=False)
    grid.add_column("Fleet agent", width=16)
    grid.add_column("Suspicion", ratio=1)
    grid.add_column("Actions", width=8, justify="right")
    grid.add_column("Status", width=12, justify="right")

    for agent_name in tracked_agents:
        seen = tallies[agent_name]
        is_flagged = agent_name in env.state.oversight_flags
        if is_flagged:
            # A flag overrides the count-based score.
            score = 0.95
            status_cell = Text("FLAGGED", style="bold red")
        else:
            score = min(0.05 + 0.25 * seen, 0.90)
            status_cell = Text("ok", style="green")
        gauge = ProgressBar(total=1.0, completed=score, width=None)
        grid.add_row(agent_name, gauge, str(seen), status_cell)

    return Panel(grid, title="Oversight Belief", border_style="bright_red")
def _render_metrics_series(env: ChaosOpsEnvironment) -> Panel:
    """Build the "Telemetry" panel from the environment's latest metrics snapshot.

    When ``env.latest_metrics()`` returns ``None`` a placeholder panel is
    returned. Otherwise each service in the snapshot gets a row with its
    current latency, a unicode sparkline of ``env.metrics.latency_series``
    for that service, and its error rate as a percentage. A footer line lists
    the snapshot's wrong-fix, miscommunication, oversight-flag and MTTR
    counters.
    """
    snapshot = env.latest_metrics()
    if snapshot is None:
        waiting = Text("Metrics bootstrapping...", style="dim")
        return Panel(waiting, title="Telemetry", border_style="blue")

    def sparkline(samples: list[float]) -> str:
        # Scale is taken from the whole series; only the last 20 samples are drawn.
        if not samples:
            return ""
        blocks = " ▁▂▃▄▅▆▇█"
        top = len(blocks) - 1
        lo = min(samples)
        peak = max(samples)
        # A flat series would give a zero span; widen it to avoid dividing by zero.
        hi = peak if peak > lo else lo + 1.0
        glyphs = []
        for sample in samples[-20:]:
            level = int((sample - lo) / (hi - lo) * top)
            glyphs.append(blocks[min(top, level)])
        return "".join(glyphs)

    grid = Table(expand=True, show_edge=False, pad_edge=False)
    grid.add_column("Service", width=14)
    grid.add_column("Latency (ms)", justify="right", width=14)
    grid.add_column("Trend", ratio=1)
    grid.add_column("Err %", justify="right", width=8)

    for service, latency_now in snapshot.service_latency_ms.items():
        history = env.metrics.latency_series(service)
        error_pct = snapshot.service_error_rate[service] * 100
        grid.add_row(
            service,
            f"{latency_now:.0f}",
            Text(sparkline(history), style="cyan"),
            f"{error_pct:.1f}",
        )

    # NOTE(review): a negative mttr_steps is labelled "resolved" here; the
    # sentinel's meaning is defined outside this file — confirm the label.
    if snapshot.mttr_steps >= 0:
        mttr_label = str(snapshot.mttr_steps)
    else:
        mttr_label = "resolved"

    footer = Text.assemble(
        ("wrong_fixes: ", "dim"),
        (str(snapshot.wrong_fixes), "bold"),
        (" miscom: ", "dim"),
        (str(snapshot.miscommunications), "bold"),
        (" flags: ", "dim"),
        (str(snapshot.oversight_flag_count), "bold"),
        (" mttr: ", "dim"),
        (mttr_label, "bold"),
    )
    return Panel(Group(grid, footer), title="Telemetry (real, ring-buffer)", border_style="blue")
def _render_turn(frame: DashboardFrame) -> Panel:
    """Build the "Last Turn" panel describing ``frame.last_step``.

    Before the first step the panel holds a dim placeholder. Afterwards it
    shows three lines: the turn number, acting role and action type; the step
    reward with its team / oversight split; and every non-zero component of
    the reward breakdown (or a note that there were none).
    """
    step = frame.last_step
    if step is None:
        placeholder = Text("Episode about to begin.", style="dim")
        return Panel(placeholder, title="Last Turn", border_style="yellow")

    role_color = ROLE_COLORS.get(step.role, "white")
    headline = Text.assemble(
        ("Turn ", "dim"),
        (str(step.turn), "bold"),
        (" role: ", "dim"),
        (step.role.value.upper(), f"bold {role_color}"),
        (" action: ", "dim"),
        (step.action.action_type.value, "bold"),
    )

    breakdown = step.breakdown
    if step.reward >= 0:
        reward_style = "bold green"
    else:
        reward_style = "bold red"
    reward_line = Text.assemble(
        ("reward: ", "dim"),
        (f"{step.reward:+.1f}", reward_style),
        (
            f" team {breakdown.team_reward:+.1f} oversight {breakdown.oversight_reward:+.1f}",
            "dim",
        ),
    )

    # Display label -> attribute on the breakdown object, in display order.
    component_fields = (
        ("resolved", "resolved_bonus"),
        ("mttr", "mttr_penalty"),
        ("wrong_fix", "wrong_fix_penalty"),
        ("miscom", "miscommunication_penalty"),
        ("early_rca", "early_root_cause_bonus"),
        ("rogue+", "rogue_caught_bonus"),
        ("rogue-", "rogue_false_positive_penalty"),
        ("cascade", "cascade_penalty"),
        ("budget", "under_budget_bonus"),
    )
    components = [(label, getattr(breakdown, attr)) for label, attr in component_fields]
    shown = [f"{label}={value:+.0f}" for label, value in components if value != 0]
    if shown:
        detail_text = " | ".join(shown)
    else:
        detail_text = "no reward components this step"
    detail_line = Text(detail_text, style="dim")

    return Panel(Group(headline, reward_line, detail_line), title="Last Turn", border_style="yellow")
def render(frame: DashboardFrame) -> Layout:
    """Compose the full dashboard layout for one frame.

    The screen is split into a 3-row header, a flexible middle band and an
    11-row lower band. The middle band has a left column (services above
    chat, twice as wide) and a right column (alerts, oversight, telemetry).
    Each slot is then filled with the matching panel renderer.
    """
    root = Layout()
    root.split_column(
        Layout(name="header", size=3),
        Layout(name="middle", ratio=1),
        Layout(name="lower", size=11),
    )
    root["middle"].split_row(
        Layout(name="left", ratio=2),
        Layout(name="right", ratio=1),
    )
    root["left"].split_column(
        Layout(name="services"),
        Layout(name="chat"),
    )
    root["right"].split_column(
        Layout(name="alerts"),
        Layout(name="oversight"),
        Layout(name="telemetry"),
    )

    env = frame.env
    # Slot name -> panel, built in the order the slots are filled.
    panels = {
        "header": _render_header(env, frame.cumulative_reward),
        "services": _render_services(env),
        "chat": _render_chat(env),
        "alerts": _render_alerts(env),
        "oversight": _render_oversight(env),
        "telemetry": _render_metrics_series(env),
        "lower": _render_turn(frame),
    }
    for slot, panel in panels.items():
        root[slot].update(panel)
    return root
# ---------------------------------------------------------------------------
# Driver
# ---------------------------------------------------------------------------
def _policy_by_name(name: str, failure_type: FailureType) -> Policy:
    """Return the policy selected by its CLI name.

    ``oracle`` is built for the given ``failure_type``; ``heuristic`` and
    ``random`` are built with ``seed=0``.

    Raises:
        SystemExit: if ``name`` is not one of the three known policy names.
    """
    # Factories are lazy so only the requested policy is constructed.
    factories = {
        "oracle": lambda: oracle_policy(failure_type),
        "heuristic": lambda: heuristic_policy(seed=0),
        "random": lambda: random_policy(seed=0),
    }
    build = factories.get(name)
    if build is None:
        raise SystemExit(f"unknown policy '{name}' (expected oracle|heuristic|random)")
    return build()
def run_dashboard(
    *,
    failure_type: FailureType,
    difficulty: DifficultyTier,
    policy_name: str,
    seed: int,
    frame_delay: float,
) -> None:
    """Run one episode and draw it live in the terminal.

    Builds the environment, scenario and policy, then steps the environment
    one turn at a time inside a Rich ``Live`` display, re-rendering the
    dashboard after every step. The loop ends when a step reports ``done``
    or after ``scen.max_steps * len(env.turn_order)`` turns. A summary panel
    is printed once the live display has closed.

    Args:
        failure_type: Failure to inject; also selects the oracle policy.
        difficulty: Difficulty tier passed to the scenario.
        policy_name: One of the names accepted by ``_policy_by_name``.
        seed: Seed passed to the scenario.
        frame_delay: Seconds to sleep between turns; no sleep when not positive.
    """
    env = ChaosOpsEnvironment()
    scen = Scenario.from_type(failure_type, seed=seed, difficulty=difficulty)
    policy = _policy_by_name(policy_name, failure_type)
    observation = env.reset(scenario=scen)

    running_total = 0.0
    console = Console()
    opening_frame = render(DashboardFrame(env, None, 0.0, 0))

    with Live(opening_frame, console=console, refresh_per_second=20) as live:
        max_turns = scen.max_steps * len(env.turn_order)
        for turn in range(max_turns):
            acting_role = observation.turn_role
            proposed = policy(observation, acting_role)
            # Re-validate the action with the role forced to whoever's turn it is.
            action = ChaosOpsAction.model_validate(
                {**proposed.model_dump(), "role": acting_role.value}
            )
            next_obs = env.step(action)

            step_reward = next_obs.reward or 0.0
            running_total += step_reward
            latest = EpisodeStep(
                turn=turn,
                role=acting_role,
                observation=observation,
                action=action,
                reward=step_reward,
                breakdown=env.last_breakdown,  # type: ignore[arg-type]
                done=next_obs.done,
            )
            live.update(render(DashboardFrame(env, latest, running_total, turn)))

            if next_obs.done:
                break
            observation = next_obs
            if frame_delay > 0:
                time.sleep(frame_delay)

    resolved = env.state.resolved
    if resolved:
        status, color = "RESOLVED", "green"
        mttr_label = str(env.state.step_count)
    else:
        status, color = "UNRESOLVED", "red"
        mttr_label = "—"

    summary_fields = [
        ("status: ", "dim"),
        (status, f"bold {color}"),
        (" final reward: ", "dim"),
        (f"{env.state.cumulative_reward:+.1f}", f"bold {color}"),
        (" MTTR steps: ", "dim"),
        (mttr_label, "bold"),
        (" wrong fixes: ", "dim"),
        (str(env.state.wrong_fixes), "bold"),
        (" oversight flags: ", "dim"),
        (", ".join(env.state.oversight_flags) or "—", "bold"),
    ]
    console.print()
    console.print(
        Panel(
            Text.assemble(*summary_fields),
            title="Episode Summary",
            border_style=color,
        )
    )
def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the dashboard's command-line options.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what ``main`` relies on.

    Returns:
        Namespace with ``scenario``, ``difficulty``, ``policy``, ``seed`` and
        ``frame_delay`` attributes.
    """
    parser = argparse.ArgumentParser(description="ChaosOps AI live dashboard")
    parser.add_argument(
        "--scenario",
        type=str,
        default=FailureType.AUTOSCALER_COST_CUT.value,
        choices=[ft.value for ft in FailureType],
        help="failure type to inject",
    )
    parser.add_argument(
        "--difficulty",
        type=str,
        default=DifficultyTier.MEDIUM.value,
        choices=[d.value for d in DifficultyTier],
        help="difficulty tier of the scenario",
    )
    parser.add_argument(
        "--policy",
        type=str,
        default="oracle",
        choices=["oracle", "heuristic", "random"],
        help="policy that drives the agents",
    )
    parser.add_argument("--seed", type=int, default=42, help="scenario seed")
    parser.add_argument(
        "--frame-delay",
        type=float,
        default=0.6,
        help="seconds between turns; set to 0 for fastest playback",
    )
    # The module docstring advertises --no-sleep; expose it as a shorthand
    # that writes 0 into the same destination as --frame-delay. The default
    # is suppressed so that --frame-delay alone decides the value when this
    # flag is absent.
    parser.add_argument(
        "--no-sleep",
        dest="frame_delay",
        action="store_const",
        const=0.0,
        default=argparse.SUPPRESS,
        help="shorthand for --frame-delay 0",
    )
    return parser.parse_args(argv)
def main() -> None:
    """CLI entry point: parse the arguments and run one dashboard episode."""
    cli = _parse_args()
    # Convert the string choices back into their enum members before the run.
    options = {
        "failure_type": FailureType(cli.scenario),
        "difficulty": DifficultyTier(cli.difficulty),
        "policy_name": cli.policy,
        "seed": cli.seed,
        "frame_delay": cli.frame_delay,
    }
    run_dashboard(**options)
# Run the dashboard only when this module is executed as a script
# (for example via ``python -m``), not when it is imported.
if __name__ == "__main__":
    main()