Elliot89 committed on
Commit
2b5d42a
·
1 Parent(s): fb49640

fix: async lifespan init + JSON root for HF health check

Browse files
Files changed (3) hide show
  1. Dockerfile +4 -0
  2. README.md +1 -0
  3. server/app.py +41 -57
Dockerfile CHANGED
@@ -18,4 +18,8 @@ RUN touch server/__init__.py
18
  EXPOSE 7860
19
  ENV PYTHONPATH=/app
20
 
 
 
 
 
21
  CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]
 
18
  EXPOSE 7860
19
  ENV PYTHONPATH=/app
20
 
21
+ # Healthcheck so HF knows the container is genuinely ready
22
+ HEALTHCHECK --interval=10s --timeout=5s --start-period=15s --retries=3 \
23
+ CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:7860/health')" || exit 1
24
+
25
  CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -123,6 +123,7 @@ Dense signals throughout the trajectory:
123
 
124
  | Method | Path | Description |
125
  |---|---|---|
 
126
  | GET | `/health` | `{"status": "ok", "version": "0.1.0"}` |
127
  | POST | `/reset?task_id=...&scenario_index=...` | Start new episode |
128
  | POST | `/step` | Submit action (JSON body) |
 
123
 
124
  | Method | Path | Description |
125
  |---|---|---|
126
+ | GET | `/` | `{"status": "running", ...}` — Space health check |
127
  | GET | `/health` | `{"status": "ok", "version": "0.1.0"}` |
128
  | POST | `/reset?task_id=...&scenario_index=...` | Start new episode |
129
  | POST | `/step` | Submit action (JSON body) |
server/app.py CHANGED
@@ -2,7 +2,7 @@
2
  server/app.py — FastAPI server for Cloud Incident Response OpenEnv.
3
 
4
  Endpoints:
5
- GET / HTML landing page (triggers HF Space "Running" status)
6
  GET /health Health check
7
  POST /reset Start new episode
8
  POST /step Submit action
@@ -21,9 +21,9 @@ import sys
21
 
22
  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
23
 
 
24
  from fastapi import FastAPI, HTTPException, Query
25
  from fastapi.middleware.cors import CORSMiddleware
26
- from fastapi.responses import HTMLResponse
27
 
28
  from server.models import Action
29
  from server.environment import IncidentEnvironment
@@ -31,6 +31,25 @@ from tasks import list_tasks, ALL_TASKS
31
 
32
  _ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  app = FastAPI(
35
  title="Cloud Incident Response — OpenEnv",
36
  version="0.1.0",
@@ -38,6 +57,7 @@ app = FastAPI(
38
  "OpenEnv environment for training AI agents on cloud SRE incident response. "
39
  "Covers cascading failures, OOM kills, CDN storms, and network partitions."
40
  ),
 
41
  )
42
 
43
  app.add_middleware(
@@ -47,64 +67,24 @@ app.add_middleware(
47
  allow_headers=["*"],
48
  )
49
 
50
- env = IncidentEnvironment()
51
 
 
52
 
53
- # ── Landing page (required for HF Space Running status) ─────────────────────
54
-
55
- @app.get("/", response_class=HTMLResponse)
56
  def root():
57
- return """<!DOCTYPE html>
58
- <html lang="en">
59
- <head>
60
- <meta charset="UTF-8">
61
- <title>Cloud Incident Response — OpenEnv</title>
62
- <style>
63
- body { font-family: -apple-system, sans-serif; max-width: 680px;
64
- margin: 60px auto; padding: 0 20px; color: #1a1a1a; }
65
- h1 { font-size: 1.6rem; margin-bottom: 4px; }
66
- .tag { display:inline-block; background:#e8f4fd; color:#0066cc;
67
- padding:2px 8px; border-radius:4px; font-size:.8rem;
68
- margin-right:4px; margin-bottom:8px; }
69
- .status { color: #16a34a; font-weight: 600; }
70
- table { border-collapse: collapse; width: 100%; margin: 16px 0; }
71
- th, td { text-align: left; padding: 8px 12px;
72
- border-bottom: 1px solid #e5e7eb; font-size: .9rem; }
73
- th { background: #f9fafb; font-weight: 600; }
74
- a { color: #0066cc; }
75
- code { background: #f3f4f6; padding: 1px 5px; border-radius: 3px;
76
- font-size: .85rem; }
77
- </style>
78
- </head>
79
- <body>
80
- <h1>&#x1F6A8; Cloud Incident Response &mdash; OpenEnv</h1>
81
- <div>
82
- <span class="tag">openenv</span><span class="tag">sre</span>
83
- <span class="tag">cloud</span><span class="tag">real-world</span>
84
- <span class="tag">agentic</span>
85
- </div>
86
- <p>Status: <span class="status">&#x2713; Running</span></p>
87
- <p>
88
- OpenEnv environment for training and evaluating AI agents on
89
- cloud SRE incident response. Covers cross-service cascading failures,
90
- OOM kills, CDN cache storms, and BGP network partitions.
91
- </p>
92
- <table>
93
- <tr><th>Task</th><th>Difficulty</th><th>Max Steps</th></tr>
94
- <tr><td><code>alert_classification</code></td><td>Easy</td><td>3</td></tr>
95
- <tr><td><code>root_cause_analysis</code></td><td>Medium</td><td>10</td></tr>
96
- <tr><td><code>remediation_planning</code></td><td>Hard</td><td>15</td></tr>
97
- </table>
98
- <p>
99
- <a href="/docs">&#x1F4D6; API Docs (Swagger)</a> &nbsp;&middot;&nbsp;
100
- <a href="/tasks">&#x1F4CB; Tasks</a> &nbsp;&middot;&nbsp;
101
- <a href="/health">&#x2764; Health</a>
102
- </p>
103
- </body>
104
- </html>"""
105
-
106
-
107
- # ── Core endpoints ───────────────────────────────────────────────────────────
108
 
109
  @app.get("/health")
110
  def health():
@@ -117,6 +97,7 @@ def reset(
117
  scenario_index: int = Query(default=0),
118
  ):
119
  """Start a new episode. Returns the initial observation."""
 
120
  try:
121
  obs = env.reset(task_id=task_id, scenario_index=scenario_index)
122
  return obs.model_dump()
@@ -129,6 +110,7 @@ def reset(
129
  @app.post("/step")
130
  def step(action: Action):
131
  """Submit one action. Returns observation, reward, done, info."""
 
132
  try:
133
  obs, reward, done, info = env.step(action)
134
  return {
@@ -146,6 +128,7 @@ def step(action: Action):
146
  @app.get("/state")
147
  def state():
148
  """Return the full current episode state."""
 
149
  try:
150
  return env.state().model_dump()
151
  except RuntimeError as e:
@@ -206,6 +189,7 @@ def tasks():
206
  @app.get("/grader")
207
  def grader():
208
  """Score the current episode. Returns total in [0.0, 1.0]."""
 
209
  try:
210
  s = env.state()
211
  from graders import grade
 
2
  server/app.py — FastAPI server for Cloud Incident Response OpenEnv.
3
 
4
  Endpoints:
5
+ GET / JSON health/status (triggers HF Space "Running" status)
6
  GET /health Health check
7
  POST /reset Start new episode
8
  POST /step Submit action
 
21
 
22
  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
23
 
24
+ from contextlib import asynccontextmanager
25
  from fastapi import FastAPI, HTTPException, Query
26
  from fastapi.middleware.cors import CORSMiddleware
 
27
 
28
  from server.models import Action
29
  from server.environment import IncidentEnvironment
 
31
 
32
  _ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
33
 
34
+ # ── Global env instance — initialised in lifespan, not at import time ────────
35
+ _env: IncidentEnvironment | None = None
36
+
37
+
38
+ @asynccontextmanager
39
+ async def lifespan(app: FastAPI):
40
+ """Initialise heavy objects during application startup, before the server begins serving requests."""
41
+ global _env
42
+ _env = IncidentEnvironment()
43
+ yield
44
+ # cleanup (nothing needed)
45
+
46
+
47
+ def _get_env() -> IncidentEnvironment:
48
+ if _env is None:
49
+ raise HTTPException(status_code=503, detail="Environment initialising — retry in a moment")
50
+ return _env
51
+
52
+
53
  app = FastAPI(
54
  title="Cloud Incident Response — OpenEnv",
55
  version="0.1.0",
 
57
  "OpenEnv environment for training AI agents on cloud SRE incident response. "
58
  "Covers cascading failures, OOM kills, CDN storms, and network partitions."
59
  ),
60
+ lifespan=lifespan,
61
  )
62
 
63
  app.add_middleware(
 
67
  allow_headers=["*"],
68
  )
69
 
 
70
 
71
+ # ── Root — plain JSON so HF health checker flips badge to Running ─────────────
72
 
73
+ @app.get("/")
 
 
74
  def root():
75
+ """Plain JSON root — required for HF Space to show Running status."""
76
+ return {
77
+ "status": "running",
78
+ "name": "cloud-incident-response",
79
+ "version": "0.1.0",
80
+ "description": "OpenEnv environment for cloud SRE incident response",
81
+ "tasks": ["alert_classification", "root_cause_analysis", "remediation_planning"],
82
+ "docs": "/docs",
83
+ "health": "/health",
84
+ }
85
+
86
+
87
+ # ── Core endpoints ────────────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
  @app.get("/health")
90
  def health():
 
97
  scenario_index: int = Query(default=0),
98
  ):
99
  """Start a new episode. Returns the initial observation."""
100
+ env = _get_env()
101
  try:
102
  obs = env.reset(task_id=task_id, scenario_index=scenario_index)
103
  return obs.model_dump()
 
110
  @app.post("/step")
111
  def step(action: Action):
112
  """Submit one action. Returns observation, reward, done, info."""
113
+ env = _get_env()
114
  try:
115
  obs, reward, done, info = env.step(action)
116
  return {
 
128
  @app.get("/state")
129
  def state():
130
  """Return the full current episode state."""
131
+ env = _get_env()
132
  try:
133
  return env.state().model_dump()
134
  except RuntimeError as e:
 
189
  @app.get("/grader")
190
  def grader():
191
  """Score the current episode. Returns total in [0.0, 1.0]."""
192
+ env = _get_env()
193
  try:
194
  s = env.state()
195
  from graders import grade