Spaces:

helloAK96
/

chaosops

Running

App Files Files Community

helloAK96 committed 14 days ago

Commit

1d27c7d

1 Parent(s): 56464b6

rebuilding

Browse files

Files changed (13) hide show

.gitattributes +35 -0
.omc/project-memory.json +120 -0
.omc/state/agent-replay-47169e9f-c0c1-431f-bf0f-84312b895ce6.jsonl +1 -0
.omc/state/checkpoints/checkpoint-2026-04-25T09-09-34-592Z.json +16 -0
.omc/state/hud-state.json +6 -0
.omc/state/hud-stdin-cache.json +1 -0
.omc/state/idle-notif-cooldown.json +3 -0
.omc/state/subagent-tracking.json +7 -0
Dockerfile +38 -0
FETCH_HEAD +0 -0
README.md +1 -0
app.py +254 -0
requirements.txt +16 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

.omc/project-memory.json ADDED Viewed

	@@ -0,0 +1,120 @@

+{
+  "version": "1.0.0",
+  "lastScanned": 1777108314607,
+  "projectRoot": "/Users/aayushashokkhopade/Desktop/meta_hack/chaosops",
+  "techStack": {
+    "languages": [],
+    "frameworks": [],
+    "packageManager": null,
+    "runtime": null
+  },
+  "build": {
+    "buildCommand": null,
+    "testCommand": null,
+    "lintCommand": null,
+    "devCommand": null,
+    "scripts": {}
+  },
+  "conventions": {
+    "namingStyle": null,
+    "importStyle": null,
+    "testPattern": null,
+    "fileOrganization": null
+  },
+  "structure": {
+    "isMonorepo": false,
+    "workspaces": [],
+    "mainDirectories": [],
+    "gitBranches": null
+  },
+  "customNotes": [],
+  "directoryMap": {
+    "__pycache__": {
+      "path": "__pycache__",
+      "purpose": null,
+      "fileCount": 1,
+      "lastAccessed": 1777108314594,
+      "keyFiles": [
+        "__init__.cpython-311.pyc"
+      ]
+    },
+    "agents": {
+      "path": "agents",
+      "purpose": null,
+      "fileCount": 5,
+      "lastAccessed": 1777108314595,
+      "keyFiles": [
+        "__init__.py",
+        "llm_adapter.py",
+        "policies.py",
+        "runner.py",
+        "trained_policy.py"
+      ]
+    },
+    "curriculum": {
+      "path": "curriculum",
+      "purpose": null,
+      "fileCount": 2,
+      "lastAccessed": 1777108314595,
+      "keyFiles": [
+        "__init__.py",
+        "generator.py"
+      ]
+    },
+    "dashboard": {
+      "path": "dashboard",
+      "purpose": null,
+      "fileCount": 3,
+      "lastAccessed": 1777108314595,
+      "keyFiles": [
+        "__init__.py",
+        "terminal.py",
+        "transcript.py"
+      ]
+    },
+    "env": {
+      "path": "env",
+      "purpose": null,
+      "fileCount": 9,
+      "lastAccessed": 1777108314596,
+      "keyFiles": [
+        "__init__.py",
+        "action_handlers.py",
+        "environment.py",
+        "injectors.py",
+        "metrics.py"
+      ]
+    },
+    "rewards": {
+      "path": "rewards",
+      "purpose": null,
+      "fileCount": 2,
+      "lastAccessed": 1777108314596,
+      "keyFiles": [
+        "__init__.py",
+        "reward_fn.py"
+      ]
+    },
+    "train": {
+      "path": "train",
+      "purpose": null,
+      "fileCount": 4,
+      "lastAccessed": 1777108314596,
+      "keyFiles": [
+        "__init__.py",
+        "baseline.py",
+        "evaluate.py",
+        "grpo_train.py"
+      ]
+    }
+  },
+  "hotPaths": [
+    {
+      "path": "README.md",
+      "accessCount": 2,
+      "lastAccessed": 1777108362096,
+      "type": "file"
+    }
+  ],
+  "userDirectives": []
+}

.omc/state/agent-replay-47169e9f-c0c1-431f-bf0f-84312b895ce6.jsonl ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"t":0,"agent":"a1e8a1b","agent_type":"unknown","event":"agent_stop","success":true}

.omc/state/checkpoints/checkpoint-2026-04-25T09-09-34-592Z.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "created_at": "2026-04-25T09:09:34.591Z",
+  "trigger": "auto",
+  "active_modes": {},
+  "todo_summary": {
+    "pending": 0,
+    "in_progress": 0,
+    "completed": 0
+  },
+  "wisdom_exported": false,
+  "background_jobs": {
+    "active": [],
+    "recent": [],
+    "stats": null
+  }
+}

.omc/state/hud-state.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "timestamp": "2026-04-25T09:06:26.159Z",
+  "backgroundTasks": [],
+  "sessionStartTimestamp": "2026-04-25T08:15:37.276Z",
+  "sessionId": "47169e9f-c0c1-431f-bf0f-84312b895ce6"
+}

.omc/state/hud-stdin-cache.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"session_id":"47169e9f-c0c1-431f-bf0f-84312b895ce6","transcript_path":"/Users/aayushashokkhopade/.claude/projects/-Users-aayushashokkhopade-Desktop-meta-hack/47169e9f-c0c1-431f-bf0f-84312b895ce6.jsonl","cwd":"/Users/aayushashokkhopade/Desktop/meta_hack/chaosops","model":{"id":"claude-opus-4-7","display_name":"Opus 4.7"},"workspace":{"current_dir":"/Users/aayushashokkhopade/Desktop/meta_hack/chaosops","project_dir":"/Users/aayushashokkhopade/Desktop/meta_hack","added_dirs":[]},"version":"2.1.114","output_style":{"name":"default"},"cost":{"total_cost_usd":45.634932250000006,"total_duration_ms":261722691,"total_api_duration_ms":4784907,"total_lines_added":1711,"total_lines_removed":214},"context_window":{"total_input_tokens":93753,"total_output_tokens":292190,"context_window_size":200000,"current_usage":{"input_tokens":6,"output_tokens":463,"cache_creation_input_tokens":1978,"cache_read_input_tokens":49566},"used_percentage":26,"remaining_percentage":74},"exceeds_200k_tokens":false}

.omc/state/idle-notif-cooldown.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "lastSentAt": "2026-04-25T09:13:09.398Z"
+}

.omc/state/subagent-tracking.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "agents": [],
+  "total_spawned": 0,
+  "total_completed": 0,
+  "total_failed": 0,
+  "last_updated": "2026-04-25T09:11:53.929Z"
+}

Dockerfile ADDED Viewed

	@@ -0,0 +1,38 @@

+# ChaosOps AI — Hugging Face Spaces Dockerfile
+#
+# Hugging Face Spaces convention:
+# * Image must run as a non-root user (uid 1000).
+# * App listens on port 7860 (Spaces routes external traffic to this port).
+# * /home/user/app is the working directory the Space picks up automatically.
+#
+# Build pipeline is split so pip-install layer is cached independently of the
+# app code — every code edit only re-uploads the small final COPY.
+FROM python:3.11-slim
+# System deps:
+#   git   — pip needs this to install `chaosops` from the GitHub source.
+#   curl  — handy for in-container HF Hub debug; small footprint.
+# The apt package lists are removed in the same layer so they never land in
+# the image.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        git \
+        curl \
+    && rm -rf /var/lib/apt/lists/*
+# Non-root user (Spaces requirement).
+RUN useradd -m -u 1000 user
+# Every instruction below (pip install, COPY, CMD) runs as this user.
+USER user
+# ~/.local/bin is put on PATH because the `pip install --user` step below
+# installs into the user's home rather than the system site-packages.
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+WORKDIR $HOME/app
+# Cache pip layer independently of source.
+COPY --chown=user:user requirements.txt .
+RUN pip install --no-cache-dir --user -r requirements.txt
+# Copy the rest of the Space (app.py, README.md, etc.).
+# NOTE(review): this copies the whole build context; anything not excluded by
+# a .dockerignore (none is visible in this commit) ends up in the image.
+COPY --chown=user:user . .
+# Spaces routes traffic to 7860; Gradio binds here.
+EXPOSE 7860
+# Exec-form CMD: app.py itself launches the Gradio server on 0.0.0.0:7860.
+CMD ["python", "app.py"]

FETCH_HEAD ADDED Viewed

File without changes

README.md CHANGED Viewed

@@ -4,6 +4,7 @@ emoji: 🌖
 colorFrom: purple
 colorTo: indigo
 sdk: docker
 pinned: false
 license: mit
 short_description: handling chaos

 colorFrom: purple
 colorTo: indigo
 sdk: docker
+app_port: 7860
 pinned: false
 license: mit
 short_description: handling chaos

app.py ADDED Viewed

	@@ -0,0 +1,254 @@

+"""ChaosOps AI — Hugging Face Space entry point.
+Gradio UI that lets a judge replay any incident scenario with any policy
+(random / heuristic / oracle / trained) and watch the multi-agent response
+unfold step-by-step. The trained-policy lane activates when the environment
+variable ``CHAOSOPS_ADAPTER_PATH`` points at a LoRA adapter directory —
+otherwise the Space still runs, silently falling back to the heuristic so
+the UI works during cold-start or when no checkpoint has been uploaded yet.
+Deploy layout:
+    hf_space/
+        app.py            — this file (entry point HF Spaces picks up)
+        requirements.txt  — pulls chaosops from GitHub + Gradio + torch stack
+        README.md         — HF Space card (YAML frontmatter)
+"""
+from __future__ import annotations
+import html
+import os
+from pathlib import Path
+import gradio as gr
+from chaosops.agents.policies import (
+    Policy,
+    heuristic_policy,
+    oracle_policy,
+    random_policy,
+)
+from chaosops.agents.runner import EpisodeResult, run_episode
+from chaosops.dashboard.transcript import ROLE_TAG, render_transcript
+from chaosops.env.environment import ChaosOpsEnvironment
+from chaosops.env.models import AgentRole, DifficultyTier, FailureType
+from chaosops.env.world_sim import Scenario
+# Name of the environment variable that points at the trained LoRA adapter
+# (a local path or an HF Hub repo id — see _lazy_trained_policy).
+ADAPTER_ENV = "CHAOSOPS_ADAPTER_PATH"
+# Process-wide memo for the loaded trained policy; stays None until
+# _lazy_trained_policy completes its first successful load.
+_TRAINED_POLICY_CACHE = None
+# ---------------------------------------------------------------------------
+# Policy resolution
+# ---------------------------------------------------------------------------
+def _lazy_trained_policy():
+    """Load the trained LoRA adapter once per process, lazily.
+    ``CHAOSOPS_ADAPTER_PATH`` accepts either:
+      * a local filesystem path (used in Colab / local dev), or
+      * an HF Hub repo id like ``VatsalHF30/chaosops-grpo-lora`` (Spaces).
+    For repo ids we materialise the adapter to local disk via
+    ``snapshot_download`` on the first call — the second call hits the
+    in-process cache and is free.
+    """
+    global _TRAINED_POLICY_CACHE
+    if _TRAINED_POLICY_CACHE is not None:
+        return _TRAINED_POLICY_CACHE
+    adapter_ref = os.environ.get(ADAPTER_ENV)
+    if not adapter_ref:
+        return None
+    local_path = Path(adapter_ref)
+    if not local_path.exists():
+        # Treat the value as an HF Hub repo id and snapshot_download it.
+        try:
+            from huggingface_hub import snapshot_download
+        except ImportError:
+            return None
+        try:
+            local_path = Path(
+                snapshot_download(repo_id=adapter_ref, repo_type="model")
+            )
+        except Exception:
+            # Network failure / private repo / typo — fall back to heuristic.
+            return None
+    from chaosops.agents.trained_policy import TrainedPolicy
+    _TRAINED_POLICY_CACHE = TrainedPolicy.from_adapter(local_path)
+    return _TRAINED_POLICY_CACHE
+def _build_policy(name: str, scenario: Scenario) -> Policy:
+    if name == "random":
+        return random_policy(seed=scenario.seed)
+    if name == "heuristic":
+        return heuristic_policy(seed=scenario.seed)
+    if name == "oracle":
+        return oracle_policy(scenario.failure_type)
+    if name == "trained":
+        trained = _lazy_trained_policy()
+        if trained is None:
+            # Graceful fallback — Space is still useful before adapter lands.
+            return heuristic_policy(seed=scenario.seed)
+        return trained.as_policy()
+    raise ValueError(f"unknown policy '{name}'")
+# ---------------------------------------------------------------------------
+# Rendering helpers
+# ---------------------------------------------------------------------------
+# Hex colour used for each role tag in the rendered chat log. Tags missing
+# from this map are drawn in "#333" by _render_chat_html.
+_ROLE_COLOR: dict[str, str] = {
+    "SRE": "#2980b9",
+    "DEV": "#16a085",
+    "MGR": "#8e44ad",
+    "OVS": "#c0392b",
+}
+def _render_chat_html(result: EpisodeResult) -> str:
+    """Render the episode as a coloured chat log for the Gradio HTML widget."""
+    blocks: list[str] = []
+    for step in result.steps:
+        tag = ROLE_TAG[step.role]
+        color = _ROLE_COLOR.get(tag, "#333")
+        args = step.action.args or {}
+        args_str = " ".join(f"{k}={v}" for k, v in args.items())
+        target = step.action.target or "-"
+        summary = (
+            f"{step.action.action_type.value} target={target}"
+            + (f" {args_str}" if args_str else "")
+        )
+        blocks.append(
+            f'<div style="margin-bottom:6px;">'
+            f'<span style="color:{color};font-weight:600;">t{step.turn:02d} [{tag}]</span> '
+            f'<span style="font-family:monospace;">{html.escape(summary)}</span> '
+            f'<span style="color:#888;">reward={step.reward:+.1f}</span>'
+            f"</div>"
+        )
+    footer = (
+        f'<hr style="margin:10px 0;">'
+        f'<div><b>resolved:</b> {result.resolved} · '
+        f'<b>steps:</b> {result.final_step} · '
+        f'<b>cum_reward:</b> {result.cumulative_reward:+.1f} · '
+        f'<b>wrong_fixes:</b> {result.wrong_fixes} · '
+        f'<b>oversight_flags:</b> {result.oversight_flags or "[]"}</div>'
+    )
+    return '<div style="font-size:13px;line-height:1.5;">' + "".join(blocks) + footer + "</div>"
+# ---------------------------------------------------------------------------
+# Episode runner (called from the Gradio button)
+# ---------------------------------------------------------------------------
+def run_scenario(failure: str, difficulty: str, policy_name: str, seed: int):
+    scenario = Scenario.from_type(
+        FailureType(failure),
+        seed=int(seed),
+        difficulty=DifficultyTier(difficulty),
+    )
+    policy = _build_policy(policy_name, scenario)
+    env = ChaosOpsEnvironment()
+    result = run_episode(env, scenario, {r: policy for r in AgentRole})
+    chat_html = _render_chat_html(result)
+    transcript = render_transcript(result)
+    summary = {
+        "failure_type": failure,
+        "difficulty": difficulty,
+        "policy": policy_name,
+        "seed": int(seed),
+        "resolved": result.resolved,
+        "steps_to_resolve": result.final_step if result.resolved else None,
+        "cumulative_reward": round(result.cumulative_reward, 2),
+        "wrong_fixes": result.wrong_fixes,
+        "oversight_flags": result.oversight_flags,
+    }
+    return chat_html, summary, transcript
+# ---------------------------------------------------------------------------
+# UI
+# ---------------------------------------------------------------------------
+# Markdown rendered at the top of the page (passed to gr.Markdown in
+# build_demo).
+INTRO_MARKDOWN = """
+# ChaosOps AI — Multi-Agent Incident-Response Gym
+A reinforcement-learning environment where a **four-agent fleet**
+(SRE · Dev · Manager · **Oversight**) resolves a randomly injected
+infrastructure incident. The fourth agent is a **scalable-oversight model**
+whose job is to detect when *another AI in the fleet* (autoscaler,
+load_balancer, deploy_bot) caused the incident — before the remediation
+team touches the services.
+**Policies**
+- `random` · hard lower bound
+- `heuristic` · what a decent human SRE would try
+- `oracle` · cheats (knows ground truth) — upper-bound curve
+- `trained` · our GRPO-tuned Qwen 2.5 0.5B LoRA checkpoint
+Pick a failure type, smash **Run episode**, watch the team coordinate (or fail).
+"""
+def build_demo() -> gr.Blocks:
+    """Assemble the Gradio Blocks UI and wire the run button.
+    Layout: intro markdown, then a row with the controls column (failure
+    type, difficulty, policy, seed, run button) beside the results column
+    (chat HTML and JSON summary), then a full-width transcript textbox.
+    Returns:
+        The ``gr.Blocks`` app; the caller is responsible for launching it.
+    """
+    # Dropdown options come straight from the enum values, so the strings
+    # handed to run_scenario can be converted back to the enums there.
+    failure_choices = [f.value for f in FailureType]
+    tier_choices = [t.value for t in DifficultyTier]
+    policy_choices = ["random", "heuristic", "oracle", "trained"]
+    with gr.Blocks(title="ChaosOps AI") as demo:
+        gr.Markdown(INTRO_MARKDOWN)
+        with gr.Row():
+            # Left column: episode controls.
+            with gr.Column(scale=1):
+                failure = gr.Dropdown(
+                    failure_choices,
+                    value="rogue_deploy_bot",
+                    label="Failure type",
+                )
+                difficulty = gr.Dropdown(
+                    tier_choices,
+                    value="hard",
+                    label="Difficulty",
+                )
+                policy = gr.Dropdown(
+                    policy_choices,
+                    value="oracle",
+                    label="Policy",
+                )
+                # precision=0 restricts the field to whole numbers.
+                seed = gr.Number(value=42, precision=0, label="Seed")
+                run_btn = gr.Button("▶ Run episode", variant="primary")
+                gr.Markdown(
+                    "_Trained policy requires `CHAOSOPS_ADAPTER_PATH` to be "
+                    "set on the Space. It falls back to the heuristic otherwise._"
+                )
+            # Right column: results, given twice the width of the controls.
+            with gr.Column(scale=2):
+                chat_out = gr.HTML(label="Episode chat")
+                summary_out = gr.JSON(label="Summary")
+        transcript_out = gr.Textbox(
+            label="Full transcript (reward breakdown)",
+            lines=18,
+        )
+        # Output order must match the (chat_html, summary, transcript) tuple
+        # returned by run_scenario.
+        run_btn.click(
+            run_scenario,
+            inputs=[failure, difficulty, policy, seed],
+            outputs=[chat_out, summary_out, transcript_out],
+        )
+    return demo
+if __name__ == "__main__":
+    # Docker Spaces route external traffic to port 7860; bind on 0.0.0.0 so
+    # the container's network namespace exposes the server beyond localhost.
+    build_demo().launch(server_name="0.0.0.0", server_port=7860)

requirements.txt ADDED Viewed

	@@ -0,0 +1,16 @@

+gradio>=4.44.0
+pydantic>=2.0.0
+rich>=13.7.0
+matplotlib>=3.7.0
+# Pull the ChaosOps package straight from GitHub so the Space has the latest
+# env/agents/dashboard code.
+chaosops @ git+https://github.com/vatsalllll/chaos_ops.git@main
+# Trained-policy lane (optional at cold-start, required before CHAOSOPS_ADAPTER_PATH is set)
+torch>=2.3.0
+transformers>=4.44.0
+peft>=0.12.0
+accelerate>=0.33.0
+safetensors>=0.4.3
+# Explicit dependency with a lower bound (not an exact pin) so that
+# snapshot_download() (used to fetch the LoRA adapter from the Hub on Space
+# cold start) is guaranteed available.
+huggingface_hub>=0.24.0