Spaces:
Sleeping
Sleeping
fix: async lifespan init + JSON root for HF health check
Browse files- Dockerfile +4 -0
- README.md +1 -0
- server/app.py +41 -57
Dockerfile
CHANGED
|
@@ -18,4 +18,8 @@ RUN touch server/__init__.py
|
|
| 18 |
EXPOSE 7860
|
| 19 |
ENV PYTHONPATH=/app
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]
|
|
|
|
| 18 |
EXPOSE 7860
|
| 19 |
ENV PYTHONPATH=/app
|
| 20 |
|
| 21 |
+
# Healthcheck so HF knows the container is genuinely ready
|
| 22 |
+
HEALTHCHECK --interval=10s --timeout=5s --start-period=15s --retries=3 \
|
| 23 |
+
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:7860/health')" || exit 1
|
| 24 |
+
|
| 25 |
CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
|
@@ -123,6 +123,7 @@ Dense signals throughout the trajectory:
|
|
| 123 |
|
| 124 |
| Method | Path | Description |
|
| 125 |
|---|---|---|
|
|
|
|
| 126 |
| GET | `/health` | `{"status": "ok", "version": "0.1.0"}` |
|
| 127 |
| POST | `/reset?task_id=...&scenario_index=...` | Start new episode |
|
| 128 |
| POST | `/step` | Submit action (JSON body) |
|
|
|
|
| 123 |
|
| 124 |
| Method | Path | Description |
|
| 125 |
|---|---|---|
|
| 126 |
+
| GET | `/` | `{"status": "running", ...}` — Space health check |
|
| 127 |
| GET | `/health` | `{"status": "ok", "version": "0.1.0"}` |
|
| 128 |
| POST | `/reset?task_id=...&scenario_index=...` | Start new episode |
|
| 129 |
| POST | `/step` | Submit action (JSON body) |
|
server/app.py
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
server/app.py — FastAPI server for Cloud Incident Response OpenEnv.
|
| 3 |
|
| 4 |
Endpoints:
|
| 5 |
-
GET /
|
| 6 |
GET /health Health check
|
| 7 |
POST /reset Start new episode
|
| 8 |
POST /step Submit action
|
|
@@ -21,9 +21,9 @@ import sys
|
|
| 21 |
|
| 22 |
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 23 |
|
|
|
|
| 24 |
from fastapi import FastAPI, HTTPException, Query
|
| 25 |
from fastapi.middleware.cors import CORSMiddleware
|
| 26 |
-
from fastapi.responses import HTMLResponse
|
| 27 |
|
| 28 |
from server.models import Action
|
| 29 |
from server.environment import IncidentEnvironment
|
|
@@ -31,6 +31,25 @@ from tasks import list_tasks, ALL_TASKS
|
|
| 31 |
|
| 32 |
_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
app = FastAPI(
|
| 35 |
title="Cloud Incident Response β OpenEnv",
|
| 36 |
version="0.1.0",
|
|
@@ -38,6 +57,7 @@ app = FastAPI(
|
|
| 38 |
"OpenEnv environment for training AI agents on cloud SRE incident response. "
|
| 39 |
"Covers cascading failures, OOM kills, CDN storms, and network partitions."
|
| 40 |
),
|
|
|
|
| 41 |
)
|
| 42 |
|
| 43 |
app.add_middleware(
|
|
@@ -47,64 +67,24 @@ app.add_middleware(
|
|
| 47 |
allow_headers=["*"],
|
| 48 |
)
|
| 49 |
|
| 50 |
-
env = IncidentEnvironment()
|
| 51 |
|
|
|
|
| 52 |
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
@app.get("/", response_class=HTMLResponse)
|
| 56 |
def root():
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
table { border-collapse: collapse; width: 100%; margin: 16px 0; }
|
| 71 |
-
th, td { text-align: left; padding: 8px 12px;
|
| 72 |
-
border-bottom: 1px solid #e5e7eb; font-size: .9rem; }
|
| 73 |
-
th { background: #f9fafb; font-weight: 600; }
|
| 74 |
-
a { color: #0066cc; }
|
| 75 |
-
code { background: #f3f4f6; padding: 1px 5px; border-radius: 3px;
|
| 76 |
-
font-size: .85rem; }
|
| 77 |
-
</style>
|
| 78 |
-
</head>
|
| 79 |
-
<body>
|
| 80 |
-
<h1>🚨 Cloud Incident Response — OpenEnv</h1>
|
| 81 |
-
<div>
|
| 82 |
-
<span class="tag">openenv</span><span class="tag">sre</span>
|
| 83 |
-
<span class="tag">cloud</span><span class="tag">real-world</span>
|
| 84 |
-
<span class="tag">agentic</span>
|
| 85 |
-
</div>
|
| 86 |
-
<p>Status: <span class="status">✓ Running</span></p>
|
| 87 |
-
<p>
|
| 88 |
-
OpenEnv environment for training and evaluating AI agents on
|
| 89 |
-
cloud SRE incident response. Covers cross-service cascading failures,
|
| 90 |
-
OOM kills, CDN cache storms, and BGP network partitions.
|
| 91 |
-
</p>
|
| 92 |
-
<table>
|
| 93 |
-
<tr><th>Task</th><th>Difficulty</th><th>Max Steps</th></tr>
|
| 94 |
-
<tr><td><code>alert_classification</code></td><td>Easy</td><td>3</td></tr>
|
| 95 |
-
<tr><td><code>root_cause_analysis</code></td><td>Medium</td><td>10</td></tr>
|
| 96 |
-
<tr><td><code>remediation_planning</code></td><td>Hard</td><td>15</td></tr>
|
| 97 |
-
</table>
|
| 98 |
-
<p>
|
| 99 |
-
<a href="/docs">📖 API Docs (Swagger)</a> ·
|
| 100 |
-
<a href="/tasks">📋 Tasks</a> ·
|
| 101 |
-
<a href="/health">❤ Health</a>
|
| 102 |
-
</p>
|
| 103 |
-
</body>
|
| 104 |
-
</html>"""
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
# ── Core endpoints ───────────────────────────────────────────────────────────
|
| 108 |
|
| 109 |
@app.get("/health")
|
| 110 |
def health():
|
|
@@ -117,6 +97,7 @@ def reset(
|
|
| 117 |
scenario_index: int = Query(default=0),
|
| 118 |
):
|
| 119 |
"""Start a new episode. Returns the initial observation."""
|
|
|
|
| 120 |
try:
|
| 121 |
obs = env.reset(task_id=task_id, scenario_index=scenario_index)
|
| 122 |
return obs.model_dump()
|
|
@@ -129,6 +110,7 @@ def reset(
|
|
| 129 |
@app.post("/step")
|
| 130 |
def step(action: Action):
|
| 131 |
"""Submit one action. Returns observation, reward, done, info."""
|
|
|
|
| 132 |
try:
|
| 133 |
obs, reward, done, info = env.step(action)
|
| 134 |
return {
|
|
@@ -146,6 +128,7 @@ def step(action: Action):
|
|
| 146 |
@app.get("/state")
|
| 147 |
def state():
|
| 148 |
"""Return the full current episode state."""
|
|
|
|
| 149 |
try:
|
| 150 |
return env.state().model_dump()
|
| 151 |
except RuntimeError as e:
|
|
@@ -206,6 +189,7 @@ def tasks():
|
|
| 206 |
@app.get("/grader")
|
| 207 |
def grader():
|
| 208 |
"""Score the current episode. Returns total in [0.0, 1.0]."""
|
|
|
|
| 209 |
try:
|
| 210 |
s = env.state()
|
| 211 |
from graders import grade
|
|
|
|
| 2 |
server/app.py — FastAPI server for Cloud Incident Response OpenEnv.
|
| 3 |
|
| 4 |
Endpoints:
|
| 5 |
+
GET / JSON health/status (triggers HF Space "Running" status)
|
| 6 |
GET /health Health check
|
| 7 |
POST /reset Start new episode
|
| 8 |
POST /step Submit action
|
|
|
|
| 21 |
|
| 22 |
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 23 |
|
| 24 |
+
from contextlib import asynccontextmanager
|
| 25 |
from fastapi import FastAPI, HTTPException, Query
|
| 26 |
from fastapi.middleware.cors import CORSMiddleware
|
|
|
|
| 27 |
|
| 28 |
from server.models import Action
|
| 29 |
from server.environment import IncidentEnvironment
|
|
|
|
| 31 |
|
| 32 |
_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
| 33 |
|
| 34 |
+
# ── Global env instance — initialised in lifespan, not at import time ────────
|
| 35 |
+
_env: IncidentEnvironment | None = None
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Create the shared environment at startup and release it at shutdown.

    FastAPI runs the code before ``yield`` during application startup, i.e.
    before any request is served, and the code after ``yield`` during
    shutdown. Building the environment here keeps the (potentially heavy)
    construction out of module import time.

    Args:
        app: The FastAPI application this lifespan is attached to (unused).
    """
    global _env
    _env = IncidentEnvironment()
    try:
        yield
    finally:
        # Drop the reference on shutdown (or on a failed run) so that any
        # later call to _get_env() reports 503 instead of handing out an
        # environment that belongs to a finished application lifecycle.
        _env = None
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def _get_env() -> IncidentEnvironment:
    """Return the shared environment instance used by the request handlers.

    Returns:
        The module-level ``IncidentEnvironment`` stored in ``_env``.

    Raises:
        HTTPException: With status 503 when ``_env`` is still ``None``,
            meaning the lifespan startup has not populated it.
    """
    if _env is None:
        # The detail text is sent to the client, so it must be clean UTF-8
        # (the dash was previously mis-encoded).
        raise HTTPException(
            status_code=503,
            detail="Environment initialising — retry in a moment",
        )
    return _env
|
| 51 |
+
|
| 52 |
+
|
| 53 |
app = FastAPI(
|
| 54 |
title="Cloud Incident Response β OpenEnv",
|
| 55 |
version="0.1.0",
|
|
|
|
| 57 |
"OpenEnv environment for training AI agents on cloud SRE incident response. "
|
| 58 |
"Covers cascading failures, OOM kills, CDN storms, and network partitions."
|
| 59 |
),
|
| 60 |
+
lifespan=lifespan,
|
| 61 |
)
|
| 62 |
|
| 63 |
app.add_middleware(
|
|
|
|
| 67 |
allow_headers=["*"],
|
| 68 |
)
|
| 69 |
|
|
|
|
| 70 |
|
| 71 |
+
# ── Root — plain JSON so HF health checker flips badge to Running ─────────────
|
| 72 |
|
| 73 |
+
@app.get("/")
def root():
    """Describe the running service as a plain JSON object.

    The payload carries a status flag, the service name and version, a short
    description, the available task ids, and the paths of the docs and
    health endpoints. Per the original author's note, a JSON response at the
    root path is what lets the Hugging Face Space report a Running status.
    """
    task_ids = ["alert_classification", "root_cause_analysis", "remediation_planning"]
    # Keyword order is preserved, so the emitted JSON keeps the same key order.
    payload = dict(
        status="running",
        name="cloud-incident-response",
        version="0.1.0",
        description="OpenEnv environment for cloud SRE incident response",
        tasks=task_ids,
        docs="/docs",
        health="/health",
    )
    return payload
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
# ── Core endpoints ────────────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
|
| 89 |
@app.get("/health")
|
| 90 |
def health():
|
|
|
|
| 97 |
scenario_index: int = Query(default=0),
|
| 98 |
):
|
| 99 |
"""Start a new episode. Returns the initial observation."""
|
| 100 |
+
env = _get_env()
|
| 101 |
try:
|
| 102 |
obs = env.reset(task_id=task_id, scenario_index=scenario_index)
|
| 103 |
return obs.model_dump()
|
|
|
|
| 110 |
@app.post("/step")
|
| 111 |
def step(action: Action):
|
| 112 |
"""Submit one action. Returns observation, reward, done, info."""
|
| 113 |
+
env = _get_env()
|
| 114 |
try:
|
| 115 |
obs, reward, done, info = env.step(action)
|
| 116 |
return {
|
|
|
|
| 128 |
@app.get("/state")
|
| 129 |
def state():
|
| 130 |
"""Return the full current episode state."""
|
| 131 |
+
env = _get_env()
|
| 132 |
try:
|
| 133 |
return env.state().model_dump()
|
| 134 |
except RuntimeError as e:
|
|
|
|
| 189 |
@app.get("/grader")
|
| 190 |
def grader():
|
| 191 |
"""Score the current episode. Returns total in [0.0, 1.0]."""
|
| 192 |
+
env = _get_env()
|
| 193 |
try:
|
| 194 |
s = env.state()
|
| 195 |
from graders import grade
|