Spaces:

helloAK96
/

chaosops

Sleeping

App Files Files Community

chaosops / dashboard /terminal.py

helloAK96

Initializing space

83136ac about 1 month ago

raw

history blame contribute delete

16.3 kB

	"""Rich-based live dashboard for ChaosOps AI episodes.

	Used during the demo to make the env tangible: the judges watch the alert
	fire, the agents converse, and the Oversight panel light up in real time.

	CLI
	----
	python -m chaosops.dashboard.terminal \
	--scenario autoscaler_cost_cut \
	--policy oracle \
	--difficulty medium

	By default the dashboard renders frame-by-frame with a short sleep between
	turns; pass ``--frame-delay 0`` to run through the episode as fast as the
	terminal can refresh (useful for screenshot capture).
	"""

	from __future__ import annotations

	import argparse
	import time
	from dataclasses import dataclass

	from rich.align import Align
	from rich.console import Console, Group
	from rich.layout import Layout
	from rich.live import Live
	from rich.panel import Panel
	from rich.progress_bar import ProgressBar
	from rich.table import Table
	from rich.text import Text

	from chaosops.agents.policies import (
	Policy,
	heuristic_policy,
	oracle_policy,
	random_policy,
	)
	from chaosops.agents.runner import EpisodeStep
	from chaosops.env.environment import ChaosOpsEnvironment
	from chaosops.env.models import (
	AgentRole,
	ChaosOpsAction,
	DifficultyTier,
	FailureType,
	ServiceHealth,
	)
	from chaosops.env.world_sim import Scenario


	# Rich style name used to color each agent role (chat tags, turn header).
	ROLE_COLORS: dict[AgentRole, str] = {
	AgentRole.SRE: "cyan",
	AgentRole.DEV: "magenta",
	AgentRole.MANAGER: "yellow",
	AgentRole.OVERSIGHT: "bright_red",
	}

	# Per-health (glyph, Rich style) pair shown in the Services table. The glyph
	# is the same dot for every state; only the color differs.
	HEALTH_STYLES: dict[ServiceHealth, tuple[str, str]] = {
	ServiceHealth.HEALTHY: ("●", "green"),
	ServiceHealth.DEGRADED: ("●", "yellow"),
	ServiceHealth.CRITICAL: ("●", "red"),
	ServiceHealth.DOWN: ("●", "bright_red"),
	}


	# ---------------------------------------------------------------------------
	# Dashboard state
	# ---------------------------------------------------------------------------


	@dataclass
	class DashboardFrame:
	    """Everything the renderer needs to draw a single dashboard frame."""

	    # Live environment whose state the individual panels read.
	    env: ChaosOpsEnvironment
	    # Most recently executed step, or ``None`` before the first turn.
	    last_step: EpisodeStep | None
	    # Running sum of per-step rewards, shown in the header.
	    cumulative_reward: float
	    # Loop index of the turn that produced this frame (0 for the opening frame).
	    turn_index: int


	# ---------------------------------------------------------------------------
	# Rendering
	# ---------------------------------------------------------------------------


	def _render_header(env: ChaosOpsEnvironment, cumulative_reward: float) -> Panel:
	    """Build the banner panel: product title plus a one-line episode summary.

	    The summary shows the scenario's failure type and difficulty ("n/a" for
	    both when the simulator has no scenario), the environment step count and
	    the running reward, colored green when non-negative and red otherwise.
	    """
	    scenario = env._sim._scenario  # noqa: SLF001 — dashboard inspects the sim
	    if scenario is None:
	        failure_label = "n/a"
	        difficulty_label = "n/a"
	    else:
	        failure_label = scenario.failure_type.value
	        difficulty_label = scenario.difficulty.value

	    if cumulative_reward >= 0:
	        reward_style = "bold green"
	    else:
	        reward_style = "bold red"

	    summary_parts = [
	        ("scenario: ", "dim"),
	        (failure_label, "bold"),
	        (" difficulty: ", "dim"),
	        (difficulty_label, "bold"),
	        (" step: ", "dim"),
	        (str(env.state.step_count), "bold"),
	        (" reward: ", "dim"),
	        (f"{cumulative_reward:+.1f}", reward_style),
	    ]
	    banner = Text("ChaosOps AI", style="bold white on red")
	    summary = Text.assemble(*summary_parts)
	    centered = Align.center(Group(banner, summary))
	    return Panel(centered, border_style="red", padding=(0, 2))


	def _render_services(env: ChaosOpsEnvironment) -> Panel:
	    """Build the "Services" panel: one table row of metrics per service.

	    Each row shows the service name, a colored health glyph with the health
	    value, and formatted CPU, memory, latency, error-rate and replica figures
	    read from ``env.state.services``.
	    """
	    right_aligned = {"justify": "right"}
	    column_specs = (
	        ("Service", {"width": 16}),
	        ("Health", {"width": 10}),
	        ("CPU %", right_aligned),
	        ("Mem MB", right_aligned),
	        ("Latency ms", right_aligned),
	        ("Err rate", right_aligned),
	        ("Replicas", right_aligned),
	    )
	    grid = Table(expand=True, show_edge=False, pad_edge=False, header_style="bold white")
	    for heading, options in column_specs:
	        grid.add_column(heading, **options)

	    for service_name, stats in env.state.services.items():
	        dot, dot_style = HEALTH_STYLES[stats.health]
	        health_cell = Text(f"{dot} {stats.health.value}", style=dot_style)
	        cells = (
	            service_name,
	            health_cell,
	            f"{stats.cpu_pct:>5.1f}",
	            f"{stats.memory_mb:>7.0f}",
	            f"{stats.latency_ms:>8.0f}",
	            f"{stats.error_rate:>6.2%}",
	            str(stats.replicas),
	        )
	        grid.add_row(*cells)
	    return Panel(grid, title="Services", border_style="cyan")


	def _render_alerts(env: ChaosOpsEnvironment) -> Panel:
	    """Build the "Alerts" panel from ``env.state.all_alerts``.

	    Shows a dim placeholder when there are no alerts; otherwise a table of
	    the six most recent alerts with severity, service, message and the step
	    at which each alert was triggered.
	    """
	    alerts = env.state.all_alerts
	    body: Table | Text
	    if not alerts:
	        body = Text("No active alerts.", style="dim")
	    else:
	        # Severity -> style lookup, built once instead of once per row.
	        severity_styles = {"page": "bold red", "warn": "yellow", "info": "blue"}
	        body = Table(expand=True, show_edge=False, pad_edge=False)
	        body.add_column("Sev")
	        body.add_column("Service")
	        body.add_column("Message", overflow="fold")
	        body.add_column("Step", justify="right")
	        # Only the tail of the list is shown so the panel stays compact.
	        for alert in alerts[-6:]:
	            sev_style = severity_styles.get(alert.severity, "white")
	            body.add_row(
	                Text(alert.severity, style=sev_style),
	                alert.service.value,
	                alert.message,
	                str(alert.triggered_at_step),
	            )
	    return Panel(body, title="Alerts", border_style="red")


	def _render_chat(env: ChaosOpsEnvironment) -> Panel:
	    """Build the "Incident Channel" panel from the last ten chat messages.

	    Messages shaped like ``[tag] text`` are split so the tag can be padded
	    and colored; the color comes from ``ROLE_COLORS`` when the tag is a valid
	    ``AgentRole`` value and is white otherwise. Any other message is shown
	    verbatim. An empty history yields a dim placeholder.
	    """
	    history = env.state.chat_history
	    if not history:
	        placeholder = Text("Channel silent. Waiting for first message.", style="dim")
	        return Panel(placeholder, title="Incident Channel", border_style="magenta")

	    rendered = []
	    for message in history[-10:]:
	        bracket_end = message.find("]")
	        if not message.startswith("[") or bracket_end == -1:
	            # Not in "[role] body" form — show the raw message.
	            rendered.append(Text(message))
	            continue

	        tag = message[1:bracket_end]
	        remainder = message[bracket_end + 1 :].strip()
	        try:
	            speaker = AgentRole(tag)
	        except ValueError:
	            # Tag is not a known role value; fall back to a neutral color.
	            tag_color = "white"
	        else:
	            tag_color = ROLE_COLORS.get(speaker, "white")
	        rendered.append(
	            Text.assemble(
	                (f"{tag:<10}", f"bold {tag_color}"),
	                (" ", ""),
	                (remainder, "white"),
	            )
	        )
	    return Panel(Group(*rendered), title="Incident Channel", border_style="magenta")


	def _render_oversight(env: ChaosOpsEnvironment) -> Panel:
	    """Build the "Oversight Belief" panel with one suspicion bar per fleet agent.

	    For the two tracked agents ("autoscaler" and "load_balancer") the score
	    is derived from the ``env.state.fleet_actions`` log: 0.05 with no logged
	    actions, plus 0.25 per logged action, capped at 0.90. An agent listed in
	    ``env.state.oversight_flags`` is pinned to 0.95 and labelled FLAGGED.
	    """
	    tracked_agents = ("autoscaler", "load_balancer")
	    action_counts: dict[str, int] = dict.fromkeys(tracked_agents, 0)
	    for entry in env.state.fleet_actions:
	        # Actions from agents outside the tracked set are ignored.
	        if entry.agent_name in action_counts:
	            action_counts[entry.agent_name] += 1

	    grid = Table(expand=True, show_edge=False, pad_edge=False)
	    grid.add_column("Fleet agent", width=16)
	    grid.add_column("Suspicion", ratio=1)
	    grid.add_column("Actions", width=8, justify="right")
	    grid.add_column("Status", width=12, justify="right")

	    for agent_name, seen in action_counts.items():
	        is_flagged = agent_name in env.state.oversight_flags
	        if is_flagged:
	            # An explicit Oversight flag overrides the count-based score.
	            score = 0.95
	            status_cell = Text("FLAGGED", style="bold red")
	        else:
	            score = min(0.05 + 0.25 * seen, 0.90)
	            status_cell = Text("ok", style="green")
	        meter = ProgressBar(total=1.0, completed=score, width=None)
	        grid.add_row(agent_name, meter, str(seen), status_cell)
	    return Panel(grid, title="Oversight Belief", border_style="bright_red")


	def _render_metrics_series(env: ChaosOpsEnvironment) -> Panel:
	    """Build the telemetry panel from the environment's metrics.

	    Shows a placeholder until ``env.latest_metrics()`` returns a snapshot.
	    After that, each service in the snapshot gets its current latency, a
	    unicode sparkline of its latency history and its error rate as a
	    percentage, followed by a footer of episode-level counters.
	    """
	    latest = env.latest_metrics()
	    if latest is None:
	        placeholder = Text("Metrics bootstrapping...", style="dim")
	        return Panel(placeholder, title="Telemetry", border_style="blue")

	    glyphs = " ▁▂▃▄▅▆▇█"
	    top_level = len(glyphs) - 1

	    def sparkline(series: list[float]) -> str:
	        """Map the last 20 samples onto block glyphs, scaled over the full series."""
	        if not series:
	            return ""
	        floor = min(series)
	        peak = max(series)
	        # A flat series would give a zero range; widen it so division is safe.
	        ceiling = peak if peak > floor else floor + 1.0
	        cells = []
	        for sample in series[-20:]:
	            level = int((sample - floor) / (ceiling - floor) * top_level)
	            cells.append(glyphs[min(top_level, level)])
	        return "".join(cells)

	    grid = Table(expand=True, show_edge=False, pad_edge=False)
	    grid.add_column("Service", width=14)
	    grid.add_column("Latency (ms)", justify="right", width=14)
	    grid.add_column("Trend", ratio=1)
	    grid.add_column("Err %", justify="right", width=8)
	    for service, latency_now in latest.service_latency_ms.items():
	        history = env.metrics.latency_series(service)
	        grid.add_row(
	            service,
	            f"{latency_now:.0f}",
	            Text(sparkline(history), style="cyan"),
	            f"{latest.service_error_rate[service] * 100:.1f}",
	        )

	    # NOTE(review): a negative mttr_steps is displayed as "resolved" — confirm
	    # that this is what the sentinel means.
	    if latest.mttr_steps >= 0:
	        mttr_label = str(latest.mttr_steps)
	    else:
	        mttr_label = "resolved"
	    counters = Text.assemble(
	        ("wrong_fixes: ", "dim"),
	        (str(latest.wrong_fixes), "bold"),
	        (" miscom: ", "dim"),
	        (str(latest.miscommunications), "bold"),
	        (" flags: ", "dim"),
	        (str(latest.oversight_flag_count), "bold"),
	        (" mttr: ", "dim"),
	        (mttr_label, "bold"),
	    )
	    return Panel(
	        Group(grid, counters),
	        title="Telemetry (real, ring-buffer)",
	        border_style="blue",
	    )


	def _render_turn(frame: DashboardFrame) -> Panel:
	    """Build the "Last Turn" panel describing the most recent step.

	    Before the first step a dim placeholder is shown. Afterwards the panel
	    has three lines: turn number, role and action type; the step reward with
	    its team/oversight split; and every non-zero reward component.

	    The step's breakdown may be missing (the driver passes
	    ``env.last_breakdown`` through unchecked); in that case only the total
	    reward is shown instead of raising ``AttributeError``.
	    """
	    last = frame.last_step
	    body: Group | Text
	    if last is None:
	        body = Text("Episode about to begin.", style="dim")
	    else:
	        role = last.role
	        color = ROLE_COLORS.get(role, "white")
	        header = Text.assemble(
	            ("Turn ", "dim"),
	            (str(last.turn), "bold"),
	            (" role: ", "dim"),
	            (role.value.upper(), f"bold {color}"),
	            (" action: ", "dim"),
	            (last.action.action_type.value, "bold"),
	        )
	        reward_parts = [
	            ("reward: ", "dim"),
	            (
	                f"{last.reward:+.1f}",
	                "bold green" if last.reward >= 0 else "bold red",
	            ),
	        ]
	        bd = last.breakdown
	        if bd is None:
	            details = Text("no reward breakdown available", style="dim")
	        else:
	            reward_parts.append(
	                (
	                    f" team {bd.team_reward:+.1f} oversight {bd.oversight_reward:+.1f}",
	                    "dim",
	                )
	            )
	            detail_pairs = [
	                ("resolved", bd.resolved_bonus),
	                ("mttr", bd.mttr_penalty),
	                ("wrong_fix", bd.wrong_fix_penalty),
	                ("miscom", bd.miscommunication_penalty),
	                ("early_rca", bd.early_root_cause_bonus),
	                ("rogue+", bd.rogue_caught_bonus),
	                ("rogue-", bd.rogue_false_positive_penalty),
	                ("cascade", bd.cascade_penalty),
	                ("budget", bd.under_budget_bonus),
	            ]
	            # Only components that contributed to this step are listed.
	            non_zero = [f"{k}={v:+.0f}" for k, v in detail_pairs if v != 0]
	            details = Text(
	                " | ".join(non_zero) if non_zero else "no reward components this step",
	                style="dim",
	            )
	        reward_text = Text.assemble(*reward_parts)
	        body = Group(header, reward_text, details)
	    return Panel(body, title="Last Turn", border_style="yellow")


	def render(frame: DashboardFrame) -> Layout:
	    """Compose the full dashboard layout for one frame.

	    Layout, top to bottom: a fixed-height header, a flexible middle row and a
	    fixed-height "last turn" strip. The middle row is split 2:1 into a left
	    column (services over chat) and a right column (alerts, oversight,
	    telemetry).
	    """
	    layout = Layout()
	    layout.split_column(
	        # The header panel needs 4 rows: two border rows plus the title line
	        # and the summary line. With only 3 rows the summary line is cropped.
	        Layout(name="header", size=4),
	        Layout(name="middle", ratio=1),
	        Layout(name="lower", size=11),
	    )
	    layout["middle"].split_row(
	        Layout(name="left", ratio=2),
	        Layout(name="right", ratio=1),
	    )
	    layout["left"].split_column(
	        Layout(name="services"),
	        Layout(name="chat"),
	    )
	    layout["right"].split_column(
	        Layout(name="alerts"),
	        Layout(name="oversight"),
	        Layout(name="telemetry"),
	    )

	    env = frame.env
	    layout["header"].update(_render_header(env, frame.cumulative_reward))
	    layout["services"].update(_render_services(env))
	    layout["chat"].update(_render_chat(env))
	    layout["alerts"].update(_render_alerts(env))
	    layout["oversight"].update(_render_oversight(env))
	    layout["telemetry"].update(_render_metrics_series(env))
	    layout["lower"].update(_render_turn(frame))
	    return layout


	# ---------------------------------------------------------------------------
	# Driver
	# ---------------------------------------------------------------------------


	def _policy_by_name(name: str, failure_type: FailureType) -> Policy:
	if name == "oracle":
	return oracle_policy(failure_type)
	if name == "heuristic":
	return heuristic_policy(seed=0)
	if name == "random":
	return random_policy(seed=0)
	raise SystemExit(f"unknown policy '{name}' (expected oracle\|heuristic\|random)")


	def run_dashboard(
	    *,
	    failure_type: FailureType,
	    difficulty: DifficultyTier,
	    policy_name: str,
	    seed: int,
	    frame_delay: float,
	) -> None:
	    """Run one episode and draw it live in the terminal.

	    Args:
	        failure_type: Failure to inject; also selects the oracle policy.
	        difficulty: Difficulty tier for the scenario.
	        policy_name: Policy to drive the agents (see ``_policy_by_name``).
	        seed: Seed passed to the scenario.
	        frame_delay: Seconds to sleep between turns; values <= 0 disable it.

	    The loop runs for at most ``scen.max_steps * len(env.turn_order)`` turns
	    and stops early once the environment reports ``done``. A summary panel is
	    printed after the live display closes.
	    """
	    env = ChaosOpsEnvironment()
	    scen = Scenario.from_type(failure_type, seed=seed, difficulty=difficulty)
	    policy = _policy_by_name(policy_name, failure_type)
	    observation = env.reset(scenario=scen)
	    cumulative = 0.0
	    last_step: EpisodeStep | None = None

	    console = Console()
	    opening_frame = DashboardFrame(env, None, 0.0, 0)
	    with Live(render(opening_frame), console=console, refresh_per_second=20) as live:
	        turn_limit = scen.max_steps * len(env.turn_order)
	        for turn in range(turn_limit):
	            role = observation.turn_role
	            action = policy(observation, role)
	            # Re-validate the action with its role set to the acting role.
	            action = ChaosOpsAction.model_validate({**action.model_dump(), "role": role.value})
	            next_obs = env.step(action)
	            step_reward = next_obs.reward or 0.0
	            cumulative += step_reward
	            last_step = EpisodeStep(
	                turn=turn,
	                role=role,
	                observation=observation,
	                action=action,
	                reward=step_reward,
	                breakdown=env.last_breakdown,  # type: ignore[arg-type]
	                done=next_obs.done,
	            )
	            live.update(render(DashboardFrame(env, last_step, cumulative, turn)))
	            if next_obs.done:
	                break
	            observation = next_obs
	            if frame_delay > 0:
	                time.sleep(frame_delay)

	    resolved = env.state.resolved
	    status = "RESOLVED" if resolved else "UNRESOLVED"
	    color = "green" if resolved else "red"
	    console.print()
	    console.print(
	        Panel(
	            Text.assemble(
	                ("status: ", "dim"),
	                (status, f"bold {color}"),
	                (" final reward: ", "dim"),
	                (f"{env.state.cumulative_reward:+.1f}", f"bold {color}"),
	                (" MTTR steps: ", "dim"),
	                (str(env.state.step_count) if resolved else "—", "bold"),
	                (" wrong fixes: ", "dim"),
	                (str(env.state.wrong_fixes), "bold"),
	                (" oversight flags: ", "dim"),
	                (", ".join(env.state.oversight_flags) or "—", "bold"),
	            ),
	            title="Episode Summary",
	            border_style=color,
	        )
	    )


	def _parse_args() -> argparse.Namespace:
	    """Parse the dashboard's command-line options.

	    Returns a namespace with ``scenario``, ``difficulty``, ``policy``,
	    ``seed`` and ``frame_delay``. ``--no-sleep`` is accepted as a shorthand
	    that sets ``frame_delay`` to 0.
	    """
	    parser = argparse.ArgumentParser(description="ChaosOps AI live dashboard")
	    parser.add_argument(
	        "--scenario",
	        type=str,
	        default=FailureType.AUTOSCALER_COST_CUT.value,
	        choices=[ft.value for ft in FailureType],
	        help="failure type to inject",
	    )
	    parser.add_argument(
	        "--difficulty",
	        type=str,
	        default=DifficultyTier.MEDIUM.value,
	        choices=[d.value for d in DifficultyTier],
	    )
	    parser.add_argument(
	        "--policy",
	        type=str,
	        default="oracle",
	        choices=["oracle", "heuristic", "random"],
	    )
	    parser.add_argument("--seed", type=int, default=42)
	    parser.add_argument(
	        "--frame-delay",
	        type=float,
	        default=0.6,
	        help="seconds between turns; set to 0 for fastest playback",
	    )
	    parser.add_argument(
	        "--no-sleep",
	        dest="frame_delay",
	        action="store_const",
	        const=0.0,
	        # SUPPRESS keeps this flag from overriding the --frame-delay default
	        # when it is not given on the command line.
	        default=argparse.SUPPRESS,
	        help="shorthand for --frame-delay 0",
	    )
	    return parser.parse_args()


	def main() -> None:
	    """CLI entry point: parse the options and launch the dashboard."""
	    cli = _parse_args()
	    # The parser restricts both values to valid enum members via ``choices``.
	    chosen_failure = FailureType(cli.scenario)
	    chosen_tier = DifficultyTier(cli.difficulty)
	    run_dashboard(
	        failure_type=chosen_failure,
	        difficulty=chosen_tier,
	        policy_name=cli.policy,
	        seed=cli.seed,
	        frame_delay=cli.frame_delay,
	    )


	# Run the dashboard only when this module is executed as a script
	# (e.g. ``python -m chaosops.dashboard.terminal``), not when it is imported.
	if __name__ == "__main__":
	main()