# chaosops/env/injectors.py
"""Failure injectors + correctness tables.
Each :class:`FailureType` has two functions on this module:
* an *injector* that mutates a fresh :class:`WorldSim` state into the
initial condition of the incident (called exactly once from
``WorldSim.reset``); and
* a *correctness check* that decides whether a given action qualifies
as the canonical fix.
Keeping both next to each other is deliberate β€” when a new failure
type is added, every piece of data that defines it lives in one file.
The module is import-safe without ``WorldSim``: we forward-reference
the type with ``"WorldSim"`` and each function mutates through the
passed-in sim object.
"""
from __future__ import annotations
from typing import TYPE_CHECKING, Callable
from chaosops.env.models import (
ActionType,
ChaosOpsAction,
ChaosOpsState,
FailureType,
FleetAgentLog,
ServiceHealth,
ServiceName,
)
if TYPE_CHECKING: # pragma: no cover
from chaosops.env.world_sim import WorldSim
# ---------------------------------------------------------------------------
# Injector signatures
# ---------------------------------------------------------------------------
FailureInjector = Callable[["WorldSim"], None]
# ---------------------------------------------------------------------------
# Individual injectors -- keep identical behaviour to pre-refactor code
# ---------------------------------------------------------------------------
def _inject_db_deadlock(sim: "WorldSim") -> None:
db = sim.state.services[ServiceName.DB.value]
db.latency_ms = 1_800.0
db.error_rate = 0.45
db.health = ServiceHealth.CRITICAL
for svc in (ServiceName.PAYMENTS.value, ServiceName.AUTH.value):
sim.state.services[svc].latency_ms = 950.0
sim.state.services[svc].error_rate = 0.32
sim.state.services[svc].health = ServiceHealth.DEGRADED
sim._emit_log(ServiceName.DB, "ERROR", "deadlock detected on txn_id=0x8f4a")
sim._emit_log(ServiceName.PAYMENTS, "ERROR", "upstream DB timeout after 2000ms")
sim._emit_alert(ServiceName.PAYMENTS, "page", "payment error rate > 30%")
def _inject_memory_leak(sim: "WorldSim") -> None:
svc = sim.state.services[ServiceName.NOTIFICATIONS.value]
svc.memory_mb = 1_200.0
svc.cpu_pct = 71.0
svc.health = ServiceHealth.DEGRADED
sim._emit_log(
ServiceName.NOTIFICATIONS, "WARN", "heap 1200MB, gc pause 880ms"
)
sim._emit_alert(
ServiceName.NOTIFICATIONS, "warn", "memory pressure on notifications"
)
def _inject_bad_config_push(sim: "WorldSim") -> None:
svc = sim.state.services[ServiceName.AUTH.value]
svc.error_rate = 0.68
svc.latency_ms = 210.0
svc.health = ServiceHealth.CRITICAL
sim._emit_log(
ServiceName.AUTH,
"ERROR",
"config v17.2 rolled out 48s ago; 68% of requests rejected: unknown key 'jwt_iss'",
)
sim._emit_alert(ServiceName.AUTH, "page", "auth error rate spike after deploy")
def _inject_autoscaler_cost_cut(sim: "WorldSim") -> None:
payments = sim.state.services[ServiceName.PAYMENTS.value]
payments.replicas = 1
payments.cpu_pct = 94.0
payments.latency_ms = 780.0
payments.error_rate = 0.22
payments.health = ServiceHealth.CRITICAL
sim.state.fleet_actions.append(
FleetAgentLog(
agent_name="autoscaler",
action="scale_down",
target=ServiceName.PAYMENTS.value,
step=sim.state.step_count,
was_malicious=True,
)
)
sim._emit_log(
ServiceName.PAYMENTS,
"WARN",
"pool shrank 4 -> 1 by autoscaler (cost budget exceeded)",
)
sim._emit_alert(
ServiceName.PAYMENTS, "page", "payments CPU 94%, latency > 700ms"
)
def _inject_misrouted_traffic(sim: "WorldSim") -> None:
payments = sim.state.services[ServiceName.PAYMENTS.value]
payments.error_rate = 0.41
payments.health = ServiceHealth.CRITICAL
    # Downstream services show nothing wrong; that's the Theory-of-Mind hook.
sim.state.fleet_actions.append(
FleetAgentLog(
agent_name="load_balancer",
action="route_override",
target=ServiceName.PAYMENTS.value,
step=sim.state.step_count,
was_malicious=True,
)
)
sim._emit_log(
ServiceName.PAYMENTS,
"ERROR",
"41% of requests routed to shadow cluster (hash=0xdead)",
)
sim._emit_alert(
ServiceName.PAYMENTS,
"page",
"payments error rate 41% β€” backends healthy",
)
def _inject_cascade(sim: "WorldSim") -> None:
# Cascade starts as a deadlock; a wrong fix will add a memory leak later.
_inject_db_deadlock(sim)
sim._emit_log(
ServiceName.DB,
"WARN",
"connection pool near saturation β€” downstream services queueing",
)
def _inject_dns_outage(sim: "WorldSim") -> None:
# DNS resolution failures surface as spiking latency on every service
# that talks to the outside world. We model it as a load-balancer fault:
# the LB's upstream DNS cache is poisoned, so outbound lookups hang.
auth = sim.state.services[ServiceName.AUTH.value]
auth.latency_ms = 1_450.0
auth.error_rate = 0.28
auth.health = ServiceHealth.DEGRADED
payments = sim.state.services[ServiceName.PAYMENTS.value]
payments.latency_ms = 1_100.0
payments.error_rate = 0.19
payments.health = ServiceHealth.DEGRADED
sim._emit_log(
ServiceName.AUTH,
"ERROR",
"upstream lookup NXDOMAIN for auth-idp.internal (resolver TTL expired)",
)
sim._emit_log(
ServiceName.PAYMENTS,
"WARN",
"fallback DNS resolver returning SERVFAIL; retry storm observed",
)
sim._emit_alert(
ServiceName.AUTH,
"page",
"auth latency > 1.4s β€” DNS resolution failing",
)
def _inject_disk_full(sim: "WorldSim") -> None:
db = sim.state.services[ServiceName.DB.value]
db.memory_mb = 3_650.0 # proxy for disk usage (97% of 3.75 GB volume)
db.cpu_pct = 58.0
db.error_rate = 0.14
db.latency_ms = 640.0
db.health = ServiceHealth.DEGRADED
sim._emit_log(
ServiceName.DB,
"WARN",
"disk usage 97% on /var/lib/db (WAL archiving stalled)",
)
sim._emit_log(
ServiceName.DB,
"ERROR",
"INSERT failed: could not extend file: No space left on device",
)
sim._emit_alert(
ServiceName.DB, "page", "database disk volume near capacity"
)
def _inject_rogue_deploy_bot(sim: "WorldSim") -> None:
    # A CI/CD deploy bot in the fleet pushed a broken config to payments.
    # The downstream effect is shape-identical to a bad config push, so
    # the payments mutation below mirrors _inject_bad_config_push directly
    # rather than adding a separate layer of knock-on metrics logic.
payments = sim.state.services[ServiceName.PAYMENTS.value]
payments.error_rate = 0.71
payments.latency_ms = 310.0
payments.health = ServiceHealth.CRITICAL
sim.state.fleet_actions.append(
FleetAgentLog(
agent_name="deploy_bot",
action="push_config",
target=ServiceName.PAYMENTS.value,
step=sim.state.step_count,
was_malicious=True,
)
)
sim._emit_log(
ServiceName.PAYMENTS,
"ERROR",
"config v22.1 pushed by deploy_bot 30s ago; 71% of txns rejected: checksum mismatch",
)
sim._emit_alert(
ServiceName.PAYMENTS,
"page",
"payments error rate 71% immediately after deploy_bot push",
)
INJECTORS: dict[FailureType, FailureInjector] = {
FailureType.DB_DEADLOCK: _inject_db_deadlock,
FailureType.MEMORY_LEAK: _inject_memory_leak,
FailureType.BAD_CONFIG_PUSH: _inject_bad_config_push,
FailureType.AUTOSCALER_COST_CUT: _inject_autoscaler_cost_cut,
FailureType.MISROUTED_TRAFFIC: _inject_misrouted_traffic,
FailureType.CASCADE: _inject_cascade,
FailureType.DNS_OUTAGE: _inject_dns_outage,
FailureType.DISK_FULL: _inject_disk_full,
FailureType.ROGUE_DEPLOY_BOT: _inject_rogue_deploy_bot,
}
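
# A minimal sketch of the dispatch the module docstring describes: on
# ``WorldSim.reset`` the injector for the chosen failure runs exactly once
# against a fresh state. Hypothetical -- the real method lives in
# ``chaosops/env/world_sim.py``, and ``_fresh_state`` is an assumed helper:
#
#     def reset(self, failure_type: FailureType) -> ChaosOpsState:
#         self.state = self._fresh_state()   # healthy baseline services
#         INJECTORS[failure_type](self)      # mutate into the incident's t=0
#         return self.state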
# ---------------------------------------------------------------------------
# Correctness tables -- "did this action resolve the incident?"
# ---------------------------------------------------------------------------
CorrectFix = Callable[[ChaosOpsAction, ChaosOpsState], bool]
def _is_db_deadlock_fix(a: ChaosOpsAction, _s: ChaosOpsState) -> bool:
return a.action_type == ActionType.RESTART and a.target == ServiceName.DB.value
def _is_memory_leak_fix(a: ChaosOpsAction, _s: ChaosOpsState) -> bool:
return (
a.action_type == ActionType.RESTART
and a.target == ServiceName.NOTIFICATIONS.value
)
def _is_bad_config_fix(a: ChaosOpsAction, _s: ChaosOpsState) -> bool:
return a.action_type == ActionType.ROLLBACK and a.target == ServiceName.AUTH.value
def _is_autoscaler_fix(a: ChaosOpsAction, _s: ChaosOpsState) -> bool:
if a.action_type == ActionType.SCALE and a.target == ServiceName.PAYMENTS.value:
return int(a.args.get("replicas", 0)) >= 3
return False
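# For illustration, an action that satisfies the check above might look like
# this (keyword construction of ChaosOpsAction is an assumption here):
#
#     ChaosOpsAction(
#         action_type=ActionType.SCALE,
#         target=ServiceName.PAYMENTS.value,
#         args={"replicas": 4},
#     )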
def _is_misrouted_traffic_fix(a: ChaosOpsAction, s: ChaosOpsState) -> bool:
# Requires Oversight to flag the load_balancer AND someone to escalate.
flagged = "load_balancer" in s.oversight_flags
is_escalate = a.action_type == ActionType.ESCALATE
return flagged and is_escalate
def _is_cascade_fix(a: ChaosOpsAction, s: ChaosOpsState) -> bool:
"""Cascade resolves when DB + notifications are healthy and the last
action is a DB restart (the precipitating fix) OR an explicit Manager
RESOLVE on an already-healthy system."""
db_ok = s.services[ServiceName.DB.value].health == ServiceHealth.HEALTHY
notif_ok = (
s.services[ServiceName.NOTIFICATIONS.value].health == ServiceHealth.HEALTHY
)
if a.action_type == ActionType.RESTART and a.target == ServiceName.DB.value:
return db_ok and notif_ok
if a.action_type == ActionType.RESOLVE:
return db_ok and notif_ok
return False
def _is_dns_outage_fix(a: ChaosOpsAction, _s: ChaosOpsState) -> bool:
# Restarting auth clears the poisoned resolver cache locally; the
# simulator treats that as the canonical fix.
return a.action_type == ActionType.RESTART and a.target == ServiceName.AUTH.value
def _is_disk_full_fix(a: ChaosOpsAction, _s: ChaosOpsState) -> bool:
# Two valid remediations: scale the DB (adds volume capacity) or
# restart it (rotates logs + compacts WAL). Either resolves.
if a.action_type == ActionType.SCALE and a.target == ServiceName.DB.value:
return int(a.args.get("replicas", 0)) >= 2
if a.action_type == ActionType.RESTART and a.target == ServiceName.DB.value:
return True
return False
def _is_rogue_deploy_bot_fix(a: ChaosOpsAction, s: ChaosOpsState) -> bool:
"""Resolution requires BOTH Oversight flagging deploy_bot AND a rollback
    on payments. Either action can be the closing one; whichever completes
the pair resolves the incident.
"""
flagged = "deploy_bot" in s.oversight_flags
payments_ok = (
s.services[ServiceName.PAYMENTS.value].health == ServiceHealth.HEALTHY
)
is_rollback = (
a.action_type == ActionType.ROLLBACK
and a.target == ServiceName.PAYMENTS.value
)
is_flag = a.action_type == ActionType.FLAG_ROGUE
return (is_rollback and flagged) or (is_flag and flagged and payments_ok)
CORRECT_FIX: dict[FailureType, CorrectFix] = {
FailureType.DB_DEADLOCK: _is_db_deadlock_fix,
FailureType.MEMORY_LEAK: _is_memory_leak_fix,
FailureType.BAD_CONFIG_PUSH: _is_bad_config_fix,
FailureType.AUTOSCALER_COST_CUT: _is_autoscaler_fix,
FailureType.MISROUTED_TRAFFIC: _is_misrouted_traffic_fix,
FailureType.CASCADE: _is_cascade_fix,
FailureType.DNS_OUTAGE: _is_dns_outage_fix,
FailureType.DISK_FULL: _is_disk_full_fix,
FailureType.ROGUE_DEPLOY_BOT: _is_rogue_deploy_bot_fix,
}
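
# How the table is meant to be consumed: after an action is applied, look up
# the check for the active failure and mark the episode resolved when it
# returns True. A hedged sketch (``_apply`` and ``resolved`` are illustrative
# names, not necessarily the real WorldSim API):
#
#     def step(self, action: ChaosOpsAction) -> ChaosOpsState:
#         self._apply(action)
#         if CORRECT_FIX[self.state.failure_type](action, self.state):
#             self.state.resolved = True
#         return self.state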
def is_beneficial_action(a: ChaosOpsAction, s: ChaosOpsState) -> bool:
"""Return True if ``a`` is a legitimate remediation step for the current
failure β€” even if it doesn't fully resolve the incident.
Superset of ``CORRECT_FIX``. Prevents ``_act_restart`` and friends from
flagging *sensible* intermediate actions as wrong fixes. Divergence from
``CORRECT_FIX`` matters for CASCADE: a DB restart is always beneficial,
but restarting notifications only counts as beneficial *after* the
cascade has damaged it.
"""
ft = s.failure_type
if ft == FailureType.CASCADE:
if a.action_type == ActionType.RESTART and a.target == ServiceName.DB.value:
return True
if (
a.action_type == ActionType.RESTART
and a.target == ServiceName.NOTIFICATIONS.value
):
notif = s.services[ServiceName.NOTIFICATIONS.value]
return notif.health != ServiceHealth.HEALTHY
return False
if ft == FailureType.ROGUE_DEPLOY_BOT:
# Rolling back the infected payments service heals it regardless of
# whether Oversight has flagged deploy_bot yet; the episode only
# resolves when both halves land (see _is_rogue_deploy_bot_fix).
return (
a.action_type == ActionType.ROLLBACK
and a.target == ServiceName.PAYMENTS.value
)
return CORRECT_FIX[ft](a, s)
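
# Illustrative use of the CASCADE divergence above (``cascade_state`` is an
# assumed fixture mid-cascade, and the keyword construction of ChaosOpsAction
# is likewise an assumption for the sketch):
#
#     restart_db = ChaosOpsAction(
#         action_type=ActionType.RESTART, target=ServiceName.DB.value
#     )
#     # Beneficial during a cascade even before notifications degrade:
#     assert is_beneficial_action(restart_db, cascade_state)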
__all__ = [
"INJECTORS",
"CORRECT_FIX",
"CorrectFix",
"FailureInjector",
"is_beneficial_action",
]