rebuilding
Browse files- .gitattributes +35 -0
- .omc/project-memory.json +120 -0
- .omc/state/agent-replay-47169e9f-c0c1-431f-bf0f-84312b895ce6.jsonl +1 -0
- .omc/state/checkpoints/checkpoint-2026-04-25T09-09-34-592Z.json +16 -0
- .omc/state/hud-state.json +6 -0
- .omc/state/hud-stdin-cache.json +1 -0
- .omc/state/idle-notif-cooldown.json +3 -0
- .omc/state/subagent-tracking.json +7 -0
- Dockerfile +38 -0
- FETCH_HEAD +0 -0
- README.md +1 -0
- app.py +254 -0
- requirements.txt +16 -0
.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
.omc/project-memory.json
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"version": "1.0.0",
|
| 3 |
+
"lastScanned": 1777108314607,
|
| 4 |
+
"projectRoot": "/Users/aayushashokkhopade/Desktop/meta_hack/chaosops",
|
| 5 |
+
"techStack": {
|
| 6 |
+
"languages": [],
|
| 7 |
+
"frameworks": [],
|
| 8 |
+
"packageManager": null,
|
| 9 |
+
"runtime": null
|
| 10 |
+
},
|
| 11 |
+
"build": {
|
| 12 |
+
"buildCommand": null,
|
| 13 |
+
"testCommand": null,
|
| 14 |
+
"lintCommand": null,
|
| 15 |
+
"devCommand": null,
|
| 16 |
+
"scripts": {}
|
| 17 |
+
},
|
| 18 |
+
"conventions": {
|
| 19 |
+
"namingStyle": null,
|
| 20 |
+
"importStyle": null,
|
| 21 |
+
"testPattern": null,
|
| 22 |
+
"fileOrganization": null
|
| 23 |
+
},
|
| 24 |
+
"structure": {
|
| 25 |
+
"isMonorepo": false,
|
| 26 |
+
"workspaces": [],
|
| 27 |
+
"mainDirectories": [],
|
| 28 |
+
"gitBranches": null
|
| 29 |
+
},
|
| 30 |
+
"customNotes": [],
|
| 31 |
+
"directoryMap": {
|
| 32 |
+
"__pycache__": {
|
| 33 |
+
"path": "__pycache__",
|
| 34 |
+
"purpose": null,
|
| 35 |
+
"fileCount": 1,
|
| 36 |
+
"lastAccessed": 1777108314594,
|
| 37 |
+
"keyFiles": [
|
| 38 |
+
"__init__.cpython-311.pyc"
|
| 39 |
+
]
|
| 40 |
+
},
|
| 41 |
+
"agents": {
|
| 42 |
+
"path": "agents",
|
| 43 |
+
"purpose": null,
|
| 44 |
+
"fileCount": 5,
|
| 45 |
+
"lastAccessed": 1777108314595,
|
| 46 |
+
"keyFiles": [
|
| 47 |
+
"__init__.py",
|
| 48 |
+
"llm_adapter.py",
|
| 49 |
+
"policies.py",
|
| 50 |
+
"runner.py",
|
| 51 |
+
"trained_policy.py"
|
| 52 |
+
]
|
| 53 |
+
},
|
| 54 |
+
"curriculum": {
|
| 55 |
+
"path": "curriculum",
|
| 56 |
+
"purpose": null,
|
| 57 |
+
"fileCount": 2,
|
| 58 |
+
"lastAccessed": 1777108314595,
|
| 59 |
+
"keyFiles": [
|
| 60 |
+
"__init__.py",
|
| 61 |
+
"generator.py"
|
| 62 |
+
]
|
| 63 |
+
},
|
| 64 |
+
"dashboard": {
|
| 65 |
+
"path": "dashboard",
|
| 66 |
+
"purpose": null,
|
| 67 |
+
"fileCount": 3,
|
| 68 |
+
"lastAccessed": 1777108314595,
|
| 69 |
+
"keyFiles": [
|
| 70 |
+
"__init__.py",
|
| 71 |
+
"terminal.py",
|
| 72 |
+
"transcript.py"
|
| 73 |
+
]
|
| 74 |
+
},
|
| 75 |
+
"env": {
|
| 76 |
+
"path": "env",
|
| 77 |
+
"purpose": null,
|
| 78 |
+
"fileCount": 9,
|
| 79 |
+
"lastAccessed": 1777108314596,
|
| 80 |
+
"keyFiles": [
|
| 81 |
+
"__init__.py",
|
| 82 |
+
"action_handlers.py",
|
| 83 |
+
"environment.py",
|
| 84 |
+
"injectors.py",
|
| 85 |
+
"metrics.py"
|
| 86 |
+
]
|
| 87 |
+
},
|
| 88 |
+
"rewards": {
|
| 89 |
+
"path": "rewards",
|
| 90 |
+
"purpose": null,
|
| 91 |
+
"fileCount": 2,
|
| 92 |
+
"lastAccessed": 1777108314596,
|
| 93 |
+
"keyFiles": [
|
| 94 |
+
"__init__.py",
|
| 95 |
+
"reward_fn.py"
|
| 96 |
+
]
|
| 97 |
+
},
|
| 98 |
+
"train": {
|
| 99 |
+
"path": "train",
|
| 100 |
+
"purpose": null,
|
| 101 |
+
"fileCount": 4,
|
| 102 |
+
"lastAccessed": 1777108314596,
|
| 103 |
+
"keyFiles": [
|
| 104 |
+
"__init__.py",
|
| 105 |
+
"baseline.py",
|
| 106 |
+
"evaluate.py",
|
| 107 |
+
"grpo_train.py"
|
| 108 |
+
]
|
| 109 |
+
}
|
| 110 |
+
},
|
| 111 |
+
"hotPaths": [
|
| 112 |
+
{
|
| 113 |
+
"path": "README.md",
|
| 114 |
+
"accessCount": 2,
|
| 115 |
+
"lastAccessed": 1777108362096,
|
| 116 |
+
"type": "file"
|
| 117 |
+
}
|
| 118 |
+
],
|
| 119 |
+
"userDirectives": []
|
| 120 |
+
}
|
.omc/state/agent-replay-47169e9f-c0c1-431f-bf0f-84312b895ce6.jsonl
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"t":0,"agent":"a1e8a1b","agent_type":"unknown","event":"agent_stop","success":true}
|
.omc/state/checkpoints/checkpoint-2026-04-25T09-09-34-592Z.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"created_at": "2026-04-25T09:09:34.591Z",
|
| 3 |
+
"trigger": "auto",
|
| 4 |
+
"active_modes": {},
|
| 5 |
+
"todo_summary": {
|
| 6 |
+
"pending": 0,
|
| 7 |
+
"in_progress": 0,
|
| 8 |
+
"completed": 0
|
| 9 |
+
},
|
| 10 |
+
"wisdom_exported": false,
|
| 11 |
+
"background_jobs": {
|
| 12 |
+
"active": [],
|
| 13 |
+
"recent": [],
|
| 14 |
+
"stats": null
|
| 15 |
+
}
|
| 16 |
+
}
|
.omc/state/hud-state.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"timestamp": "2026-04-25T09:06:26.159Z",
|
| 3 |
+
"backgroundTasks": [],
|
| 4 |
+
"sessionStartTimestamp": "2026-04-25T08:15:37.276Z",
|
| 5 |
+
"sessionId": "47169e9f-c0c1-431f-bf0f-84312b895ce6"
|
| 6 |
+
}
|
.omc/state/hud-stdin-cache.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"session_id":"47169e9f-c0c1-431f-bf0f-84312b895ce6","transcript_path":"/Users/aayushashokkhopade/.claude/projects/-Users-aayushashokkhopade-Desktop-meta-hack/47169e9f-c0c1-431f-bf0f-84312b895ce6.jsonl","cwd":"/Users/aayushashokkhopade/Desktop/meta_hack/chaosops","model":{"id":"claude-opus-4-7","display_name":"Opus 4.7"},"workspace":{"current_dir":"/Users/aayushashokkhopade/Desktop/meta_hack/chaosops","project_dir":"/Users/aayushashokkhopade/Desktop/meta_hack","added_dirs":[]},"version":"2.1.114","output_style":{"name":"default"},"cost":{"total_cost_usd":45.634932250000006,"total_duration_ms":261722691,"total_api_duration_ms":4784907,"total_lines_added":1711,"total_lines_removed":214},"context_window":{"total_input_tokens":93753,"total_output_tokens":292190,"context_window_size":200000,"current_usage":{"input_tokens":6,"output_tokens":463,"cache_creation_input_tokens":1978,"cache_read_input_tokens":49566},"used_percentage":26,"remaining_percentage":74},"exceeds_200k_tokens":false}
|
.omc/state/idle-notif-cooldown.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"lastSentAt": "2026-04-25T09:13:09.398Z"
|
| 3 |
+
}
|
.omc/state/subagent-tracking.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"agents": [],
|
| 3 |
+
"total_spawned": 0,
|
| 4 |
+
"total_completed": 0,
|
| 5 |
+
"total_failed": 0,
|
| 6 |
+
"last_updated": "2026-04-25T09:11:53.929Z"
|
| 7 |
+
}
|
Dockerfile
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ChaosOps AI — Hugging Face Spaces Dockerfile
|
| 2 |
+
#
|
| 3 |
+
# Hugging Face Spaces convention:
|
| 4 |
+
# * Image must run as a non-root user (uid 1000).
|
| 5 |
+
# * App listens on port 7860 (Spaces routes external traffic to this port).
|
| 6 |
+
# * /home/user/app is the working directory the Space picks up automatically.
|
| 7 |
+
#
|
| 8 |
+
# Build pipeline is split so pip-install layer is cached independently of the
|
| 9 |
+
# app code — every code edit only re-uploads the small final COPY.
|
| 10 |
+
|
| 11 |
+
FROM python:3.11-slim
|
| 12 |
+
|
| 13 |
+
# System deps:
|
| 14 |
+
# git — pip needs this to install `chaosops` from the GitHub source.
|
| 15 |
+
# curl — handy for in-container HF Hub debug; small footprint.
|
| 16 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 17 |
+
git \
|
| 18 |
+
curl \
|
| 19 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 20 |
+
|
| 21 |
+
# Non-root user (Spaces requirement).
|
| 22 |
+
RUN useradd -m -u 1000 user
|
| 23 |
+
USER user
|
| 24 |
+
ENV HOME=/home/user \
|
| 25 |
+
PATH=/home/user/.local/bin:$PATH
|
| 26 |
+
WORKDIR $HOME/app
|
| 27 |
+
|
| 28 |
+
# Cache pip layer independently of source.
|
| 29 |
+
COPY --chown=user:user requirements.txt .
|
| 30 |
+
RUN pip install --no-cache-dir --user -r requirements.txt
|
| 31 |
+
|
| 32 |
+
# Copy the rest of the Space (app.py, README.md, etc.).
|
| 33 |
+
COPY --chown=user:user . .
|
| 34 |
+
|
| 35 |
+
# Spaces routes traffic to 7860; Gradio binds here.
|
| 36 |
+
EXPOSE 7860
|
| 37 |
+
|
| 38 |
+
CMD ["python", "app.py"]
|
FETCH_HEAD
ADDED
|
File without changes
|
README.md
CHANGED
|
@@ -4,6 +4,7 @@ emoji: 🌖
|
|
| 4 |
colorFrom: purple
|
| 5 |
colorTo: indigo
|
| 6 |
sdk: docker
|
|
|
|
| 7 |
pinned: false
|
| 8 |
license: mit
|
| 9 |
short_description: handling chaos
|
|
|
|
| 4 |
colorFrom: purple
|
| 5 |
colorTo: indigo
|
| 6 |
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
pinned: false
|
| 9 |
license: mit
|
| 10 |
short_description: handling chaos
|
app.py
ADDED
|
@@ -0,0 +1,254 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""ChaosOps AI — Hugging Face Space entry point.
|
| 2 |
+
|
| 3 |
+
Gradio UI that lets a judge replay any incident scenario with any policy
|
| 4 |
+
(random / heuristic / oracle / trained) and watch the multi-agent response
|
| 5 |
+
unfold step-by-step. The trained-policy lane activates when the environment
|
| 6 |
+
variable ``CHAOSOPS_ADAPTER_PATH`` points at a LoRA adapter directory —
|
| 7 |
+
otherwise the Space still runs, silently falling back to the heuristic so
|
| 8 |
+
the UI works during cold-start or when no checkpoint has been uploaded yet.
|
| 9 |
+
|
| 10 |
+
Deploy layout:
|
| 11 |
+
hf_space/
|
| 12 |
+
app.py — this file (entry point HF Spaces picks up)
|
| 13 |
+
requirements.txt — pulls chaosops from GitHub + Gradio + torch stack
|
| 14 |
+
README.md — HF Space card (YAML frontmatter)
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
from __future__ import annotations
|
| 18 |
+
|
| 19 |
+
import html
|
| 20 |
+
import os
|
| 21 |
+
from pathlib import Path
|
| 22 |
+
|
| 23 |
+
import gradio as gr
|
| 24 |
+
|
| 25 |
+
from chaosops.agents.policies import (
|
| 26 |
+
Policy,
|
| 27 |
+
heuristic_policy,
|
| 28 |
+
oracle_policy,
|
| 29 |
+
random_policy,
|
| 30 |
+
)
|
| 31 |
+
from chaosops.agents.runner import EpisodeResult, run_episode
|
| 32 |
+
from chaosops.dashboard.transcript import ROLE_TAG, render_transcript
|
| 33 |
+
from chaosops.env.environment import ChaosOpsEnvironment
|
| 34 |
+
from chaosops.env.models import AgentRole, DifficultyTier, FailureType
|
| 35 |
+
from chaosops.env.world_sim import Scenario
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
ADAPTER_ENV = "CHAOSOPS_ADAPTER_PATH"
|
| 39 |
+
_TRAINED_POLICY_CACHE = None
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
# ---------------------------------------------------------------------------
|
| 43 |
+
# Policy resolution
|
| 44 |
+
# ---------------------------------------------------------------------------
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def _lazy_trained_policy():
    """Load the trained LoRA adapter once per process, lazily.

    ``CHAOSOPS_ADAPTER_PATH`` accepts either:
      * a local filesystem path (used in Colab / local dev), or
      * an HF Hub repo id like ``VatsalHF30/chaosops-grpo-lora`` (Spaces).

    When the value is not an existing local path it is treated as a repo id
    and materialised to local disk via ``snapshot_download``. Once a policy
    has been built it is stored in ``_TRAINED_POLICY_CACHE`` and every later
    call returns that object directly.

    Returns:
        The cached ``TrainedPolicy`` instance, or ``None`` when the
        environment variable is unset/empty, ``huggingface_hub`` cannot be
        imported, or the snapshot download fails. A ``None`` result is not
        cached, so the next call tries again.

    Raises:
        Whatever ``TrainedPolicy.from_adapter`` raises; adapter-loading
        errors are deliberately not swallowed here.
    """
    # Function-scope import keeps this block self-contained.
    import logging

    global _TRAINED_POLICY_CACHE
    if _TRAINED_POLICY_CACHE is not None:
        return _TRAINED_POLICY_CACHE

    adapter_ref = os.environ.get(ADAPTER_ENV)
    if not adapter_ref:
        return None

    logger = logging.getLogger(__name__)

    local_path = Path(adapter_ref)
    if not local_path.exists():
        # Treat the value as an HF Hub repo id and snapshot_download it.
        try:
            from huggingface_hub import snapshot_download
        except ImportError:
            logger.warning(
                "%s=%r is not a local path and huggingface_hub is not "
                "installed; trained policy unavailable.",
                ADAPTER_ENV,
                adapter_ref,
            )
            return None
        try:
            local_path = Path(
                snapshot_download(repo_id=adapter_ref, repo_type="model")
            )
        except Exception:
            # Network failure / private repo / typo — the caller falls back
            # to the heuristic, but the reason is logged so that a
            # misconfigured Space can be diagnosed.
            logger.warning(
                "Could not download adapter %r from the HF Hub; trained "
                "policy unavailable.",
                adapter_ref,
                exc_info=True,
            )
            return None

    # Imported lazily so the heavy model stack is only pulled in when a
    # trained adapter is actually requested.
    from chaosops.agents.trained_policy import TrainedPolicy

    _TRAINED_POLICY_CACHE = TrainedPolicy.from_adapter(local_path)
    return _TRAINED_POLICY_CACHE
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def _build_policy(name: str, scenario: Scenario) -> Policy:
    """Turn a policy name chosen in the UI into a concrete ``Policy``.

    Args:
        name: One of ``"random"``, ``"heuristic"``, ``"oracle"`` or
            ``"trained"``.
        scenario: Supplies the seed (random / heuristic) or the failure type
            (oracle) used to construct the policy.

    Returns:
        The requested policy. ``"trained"`` yields the heuristic policy when
        ``_lazy_trained_policy()`` returns ``None``.

    Raises:
        ValueError: If ``name`` is not one of the four known policy names.
    """
    if name == "oracle":
        return oracle_policy(scenario.failure_type)
    if name == "random":
        return random_policy(seed=scenario.seed)
    if name not in ("heuristic", "trained"):
        raise ValueError(f"unknown policy '{name}'")

    # Only the "trained" lane attempts to load the adapter; "heuristic"
    # goes straight to the fallback below.
    loaded = _lazy_trained_policy() if name == "trained" else None
    if loaded is not None:
        return loaded.as_policy()

    # Graceful fallback — Space is still useful before adapter lands.
    return heuristic_policy(seed=scenario.seed)
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
# ---------------------------------------------------------------------------
|
| 103 |
+
# Rendering helpers
|
| 104 |
+
# ---------------------------------------------------------------------------
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
_ROLE_COLOR: dict[str, str] = {
|
| 108 |
+
"SRE": "#2980b9",
|
| 109 |
+
"DEV": "#16a085",
|
| 110 |
+
"MGR": "#8e44ad",
|
| 111 |
+
"OVS": "#c0392b",
|
| 112 |
+
}
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def _render_chat_html(result: EpisodeResult) -> str:
    """Render the episode as a coloured chat log for the Gradio HTML widget.

    Each step becomes one ``<div>`` row showing the turn number, the role
    tag (coloured via ``_ROLE_COLOR``, default ``#333``), a monospace action
    summary and the step reward. A footer row summarises the episode.

    Every piece of episode-derived text placed into the markup (role tag,
    action summary, oversight flags) is passed through ``html.escape`` so
    that it cannot be interpreted as HTML.

    Args:
        result: The finished episode; ``steps``, ``resolved``,
            ``final_step``, ``cumulative_reward``, ``wrong_fixes`` and
            ``oversight_flags`` are read.

    Returns:
        A single HTML string wrapping all step rows and the footer.
    """
    blocks: list[str] = []
    for step in result.steps:
        tag = ROLE_TAG[step.role]
        color = _ROLE_COLOR.get(tag, "#333")
        args = step.action.args or {}
        args_str = " ".join(f"{k}={v}" for k, v in args.items())
        target = step.action.target or "-"
        summary = f"{step.action.action_type.value} target={target}"
        if args_str:
            summary += f" {args_str}"
        blocks.append(
            f'<div style="margin-bottom:6px;">'
            f'<span style="color:{color};font-weight:600;">'
            f't{step.turn:02d} [{html.escape(str(tag))}]</span> '
            f'<span style="font-family:monospace;">{html.escape(summary)}</span> '
            f'<span style="color:#888;">reward={step.reward:+.1f}</span>'
            f"</div>"
        )

    # The flags are free-form episode data, not markup owned by this module,
    # so they are escaped just like the action summary above.
    flags_text = html.escape(str(result.oversight_flags or "[]"))
    footer = (
        f'<hr style="margin:10px 0;">'
        f'<div><b>resolved:</b> {result.resolved} · '
        f'<b>steps:</b> {result.final_step} · '
        f'<b>cum_reward:</b> {result.cumulative_reward:+.1f} · '
        f'<b>wrong_fixes:</b> {result.wrong_fixes} · '
        f'<b>oversight_flags:</b> {flags_text}</div>'
    )
    return (
        '<div style="font-size:13px;line-height:1.5;">'
        + "".join(blocks)
        + footer
        + "</div>"
    )
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
# ---------------------------------------------------------------------------
|
| 147 |
+
# Episode runner (called from the Gradio button)
|
| 148 |
+
# ---------------------------------------------------------------------------
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def run_scenario(failure: str, difficulty: str, policy_name: str, seed: int):
    """Run one episode for the chosen settings and build the three UI outputs.

    Called from the "Run episode" button with the current widget values.

    Args:
        failure: Value of a ``FailureType`` member.
        difficulty: Value of a ``DifficultyTier`` member.
        policy_name: Policy name understood by ``_build_policy``.
        seed: Scenario seed; converted with ``int()``. ``None`` is treated
            as ``0`` instead of raising ``TypeError``.

    Returns:
        A ``(chat_html, summary, transcript)`` tuple: the HTML chat log, a
        JSON-serialisable summary dict, and the text transcript produced by
        ``render_transcript``.
    """
    # NOTE(review): assumes Gradio hands over None when the Number field is
    # left empty — confirm against the installed Gradio version.
    seed_value = 0 if seed is None else int(seed)

    scenario = Scenario.from_type(
        FailureType(failure),
        seed=seed_value,
        difficulty=DifficultyTier(difficulty),
    )
    policy = _build_policy(policy_name, scenario)
    env = ChaosOpsEnvironment()
    # The same policy object drives every agent role in the episode.
    result = run_episode(env, scenario, {r: policy for r in AgentRole})

    chat_html = _render_chat_html(result)
    transcript = render_transcript(result)

    summary = {
        "failure_type": failure,
        "difficulty": difficulty,
        "policy": policy_name,
        "seed": seed_value,
        "resolved": result.resolved,
        # Only meaningful when the incident was actually resolved.
        "steps_to_resolve": result.final_step if result.resolved else None,
        "cumulative_reward": round(result.cumulative_reward, 2),
        "wrong_fixes": result.wrong_fixes,
        "oversight_flags": result.oversight_flags,
    }
    return chat_html, summary, transcript
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
# ---------------------------------------------------------------------------
|
| 179 |
+
# UI
|
| 180 |
+
# ---------------------------------------------------------------------------
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
INTRO_MARKDOWN = """
|
| 184 |
+
# ChaosOps AI — Multi-Agent Incident-Response Gym
|
| 185 |
+
|
| 186 |
+
A reinforcement-learning environment where a **four-agent fleet**
|
| 187 |
+
(SRE · Dev · Manager · **Oversight**) resolves a randomly injected
|
| 188 |
+
infrastructure incident. The fourth agent is a **scalable-oversight model**
|
| 189 |
+
whose job is to detect when *another AI in the fleet* (autoscaler,
|
| 190 |
+
load_balancer, deploy_bot) caused the incident — before the remediation
|
| 191 |
+
team touches the services.
|
| 192 |
+
|
| 193 |
+
**Policies**
|
| 194 |
+
- `random` · hard lower bound
|
| 195 |
+
- `heuristic` · what a decent human SRE would try
|
| 196 |
+
- `oracle` · cheats (knows ground truth) — upper-bound curve
|
| 197 |
+
- `trained` · our GRPO-tuned Qwen 2.5 0.5B LoRA checkpoint
|
| 198 |
+
|
| 199 |
+
Pick a failure type, smash **Run episode**, watch the team coordinate (or fail).
|
| 200 |
+
"""
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
def build_demo() -> gr.Blocks:
    """Assemble the Gradio ``Blocks`` app for replaying incident scenarios.

    Layout: the intro markdown on top, then a row with a narrow controls
    column (failure type, difficulty, policy, seed, run button, a note about
    the trained policy) and a wider results column (chat HTML, summary JSON,
    full transcript). Clicking the button calls ``run_scenario``.

    Returns:
        The constructed, not yet launched, ``gr.Blocks`` instance.
    """
    failure_options = [member.value for member in FailureType]
    difficulty_options = [member.value for member in DifficultyTier]
    policy_options = ["random", "heuristic", "oracle", "trained"]

    with gr.Blocks(title="ChaosOps AI") as demo:
        gr.Markdown(INTRO_MARKDOWN)

        with gr.Row():
            # Left column: episode controls.
            with gr.Column(scale=1):
                failure = gr.Dropdown(
                    choices=failure_options,
                    value="rogue_deploy_bot",
                    label="Failure type",
                )
                difficulty = gr.Dropdown(
                    choices=difficulty_options,
                    value="hard",
                    label="Difficulty",
                )
                policy = gr.Dropdown(
                    choices=policy_options,
                    value="oracle",
                    label="Policy",
                )
                seed = gr.Number(value=42, precision=0, label="Seed")
                run_btn = gr.Button("▶ Run episode", variant="primary")
                gr.Markdown(
                    "_Trained policy requires `CHAOSOPS_ADAPTER_PATH` to be "
                    "set on the Space. It falls back to the heuristic otherwise._"
                )
            # Right column: episode results.
            with gr.Column(scale=2):
                chat_out = gr.HTML(label="Episode chat")
                summary_out = gr.JSON(label="Summary")
                transcript_out = gr.Textbox(
                    label="Full transcript (reward breakdown)",
                    lines=18,
                )

        # Input/output order must match run_scenario's parameters and
        # its returned (chat_html, summary, transcript) tuple.
        run_btn.click(
            fn=run_scenario,
            inputs=[failure, difficulty, policy, seed],
            outputs=[chat_out, summary_out, transcript_out],
        )

    return demo
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
if __name__ == "__main__":
    # Docker Spaces route external traffic to port 7860. Binding to 0.0.0.0
    # (rather than the loopback address) makes the server reachable from
    # outside the container's network namespace.
    demo = build_demo()
    demo.launch(server_name="0.0.0.0", server_port=7860)
|
requirements.txt
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio>=4.44.0
|
| 2 |
+
pydantic>=2.0.0
|
| 3 |
+
rich>=13.7.0
|
| 4 |
+
matplotlib>=3.7.0
|
| 5 |
+
# Pull the ChaosOps package straight from GitHub so the Space has the latest
|
| 6 |
+
# env/agents/dashboard code.
|
| 7 |
+
chaosops @ git+https://github.com/vatsalllll/chaos_ops.git@main
|
| 8 |
+
# Trained-policy lane (optional at cold-start, required before CHAOSOPS_ADAPTER_PATH is set)
|
| 9 |
+
torch>=2.3.0
|
| 10 |
+
transformers>=4.44.0
|
| 11 |
+
peft>=0.12.0
|
| 12 |
+
accelerate>=0.33.0
|
| 13 |
+
safetensors>=0.4.3
|
| 14 |
+
# Explicit lower bound so snapshot_download() (used to fetch the LoRA adapter from
|
| 15 |
+
# the Hub on Space cold start) is guaranteed available.
|
| 16 |
+
huggingface_hub>=0.24.0
|