Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- .gitattributes +1 -0
- Makefile +2 -2
- README.md +361 -166
- blog.md +549 -0
- models.py +3 -3
- openenv.yaml +4 -4
- public/grpo_training_curve.png +3 -0
- public/sft_loss_curve.png +0 -0
- server/app.py +7 -7
- server/calls.py +7 -7
- server/city.py +4 -4
- server/reward.py +6 -6
- server/smart_emergency_environment.py +14 -14
- train_sft_grpo.ipynb +0 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
public/grpo_training_curve.png filter=lfs diff=lfs merge=lfs -text
|
Makefile
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
.PHONY: build start serve stop health
|
| 2 |
|
| 3 |
-
#
|
| 4 |
build:
|
| 5 |
@docker build -t emergency:latest -f Dockerfile .
|
| 6 |
|
|
@@ -10,7 +10,7 @@ start:
|
|
| 10 |
stop:
|
| 11 |
@docker ps -q --filter ancestor=emergency:latest | xargs -r docker stop
|
| 12 |
|
| 13 |
-
#
|
| 14 |
serve:
|
| 15 |
@uv run uvicorn server.app:app --host 0.0.0.0 --port 8000 --reload
|
| 16 |
|
|
|
|
| 1 |
.PHONY: build start serve stop health
|
| 2 |
|
| 3 |
+
# Docker
|
| 4 |
build:
|
| 5 |
@docker build -t emergency:latest -f Dockerfile .
|
| 6 |
|
|
|
|
| 10 |
stop:
|
| 11 |
@docker ps -q --filter ancestor=emergency:latest | xargs -r docker stop
|
| 12 |
|
| 13 |
+
# Local dev (uv)
|
| 14 |
serve:
|
| 15 |
@uv run uvicorn server.app:app --host 0.0.0.0 --port 8000 --reload
|
| 16 |
|
README.md
CHANGED
|
@@ -11,257 +11,452 @@ tags:
|
|
| 11 |
- openenv
|
| 12 |
---
|
| 13 |
|
| 14 |
-
#
|
| 15 |
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
Built on [OpenEnv](https://github.com/meta-pytorch/OpenEnv) — a standard interface for RL environments exposed over HTTP/WebSocket, compatible with TRL + Unsloth training pipelines.
|
| 19 |
|
| 20 |
---
|
| 21 |
|
| 22 |
-
##
|
| 23 |
|
| 24 |
-
|
|
| 25 |
-
|---|---|
|
| 26 |
-
| **
|
| 27 |
-
| **
|
| 28 |
-
| **
|
| 29 |
-
| **
|
| 30 |
-
| **
|
| 31 |
-
| **
|
| 32 |
|
| 33 |
---
|
| 34 |
|
| 35 |
-
##
|
| 36 |
|
| 37 |
-
|
| 38 |
-
from smart_emergency import SmartEmergencyAction, SmartEmergencyEnv
|
| 39 |
-
|
| 40 |
-
with SmartEmergencyEnv(base_url="http://localhost:8000") as env:
|
| 41 |
-
result = env.reset()
|
| 42 |
-
print(result.observation.prompt)
|
| 43 |
-
|
| 44 |
-
# Dispatch an ambulance to the incident
|
| 45 |
-
action = SmartEmergencyAction(
|
| 46 |
-
action_type="dispatch",
|
| 47 |
-
severity_pred=3,
|
| 48 |
-
is_duplicate=False,
|
| 49 |
-
vehicle_type="ambulance",
|
| 50 |
-
vehicle_id="ambulance_0",
|
| 51 |
-
)
|
| 52 |
-
result = env.step(action)
|
| 53 |
-
print(result.observation.reward_breakdown)
|
| 54 |
-
# → {'severity': 1.0, 'duplicate': 1.0, 'vehicle_type': 1.5, 'vehicle_choice': 0.5, 'reroute': 0.0, 'total': 4.0}
|
| 55 |
-
```
|
| 56 |
|
| 57 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
-
|
| 60 |
|
| 61 |
-
|
| 62 |
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
| `duplicate_of_event_id` | `str` | if duplicate | EVT-NNNN of the event this duplicates |
|
| 69 |
-
| `vehicle_type` | `str` | if dispatch | `"police"`, `"ambulance"`, or `"fire"` |
|
| 70 |
-
| `vehicle_id` | `str` | if dispatch | Specific unit ID (e.g. `"ambulance_0"`) |
|
| 71 |
-
| `reroute` | `RerouteAction` | optional | Redirect an in-flight vehicle to the new event |
|
| 72 |
|
| 73 |
-
|
| 74 |
|
| 75 |
-
|
| 76 |
-
|---|---|---|
|
| 77 |
-
| `vehicle_to_reroute` | `str` | Unit ID of the vehicle to redirect |
|
| 78 |
-
| `from_event_id` | `str` | EVT-NNNN the vehicle is currently heading to |
|
| 79 |
-
| `replacement_vehicle_id` | `str` | Optional free unit to cover the abandoned event |
|
| 80 |
|
| 81 |
-
|
| 82 |
|
| 83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
-
|
| 86 |
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
| `reward_breakdown` | `dict` | Per-component reward from the previous action |
|
| 93 |
-
| `active_event_ids` | `list[str]` | Currently active event IDs (EVT-NNNN) |
|
| 94 |
-
| `fleet_utilisation` | `float` | Fraction of fleet currently busy (0.0–1.0) |
|
| 95 |
|
| 96 |
-
###
|
| 97 |
|
| 98 |
```
|
| 99 |
=== INCOMING CALL [CALL-0003] ===
|
| 100 |
Bad crash on Oak Avenue! Car flipped near Riverside Market. Driver trapped, not responding!
|
| 101 |
|
| 102 |
=== ACTIVE EVENTS ===
|
| 103 |
-
EVT-0001 | fire | Engine House No. 1
|
| 104 |
-
EVT-0002 | medical | Oakwood Apartments
|
| 105 |
|
| 106 |
=== UNIT STATUS ===
|
| 107 |
-
police_0
|
| 108 |
-
ambulance_1
|
| 109 |
-
fire_2
|
| 110 |
|
| 111 |
=== CITY REFERENCE ===
|
| 112 |
Riverside General Hospital (hospital) → Oakwood Apartments [3 min], Central Plaza [5 min]
|
| 113 |
...
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
```
|
| 119 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
---
|
| 121 |
|
| 122 |
-
## Reward Design
|
| 123 |
|
| 124 |
-
5 independent
|
| 125 |
|
| 126 |
-
| Component | Max | Min |
|
| 127 |
-
|---|---|---|---|
|
| 128 |
-
| `severity` | +1.0 | -0.5 |
|
| 129 |
-
| `duplicate` | +1.5 | -1.0 | Correct duplicate detection
|
| 130 |
-
| `vehicle_type` | +1.5 | -1.5 |
|
| 131 |
-
| `vehicle_choice` | +1.0 | -
|
| 132 |
-
| `reroute` | +1.7 | -1.0 | Quality of optional reroute
|
| 133 |
-
| **`total`** | **~6.7** | **~-6.0** | Sum of all components |
|
| 134 |
|
| 135 |
-
|
| 136 |
|
| 137 |
---
|
| 138 |
|
| 139 |
-
##
|
| 140 |
|
| 141 |
-
|
|
| 142 |
-
|---|---|---|
|
| 143 |
-
|
|
| 144 |
-
|
|
| 145 |
-
|
|
| 146 |
-
| `GET` | `/state` | Current episode state |
|
| 147 |
-
| `GET` | `/tasks` | List available tasks / difficulty levels |
|
| 148 |
-
| `POST` | `/grader` | Score a completed episode (call after `done=True`) |
|
| 149 |
-
| `GET` | `/baseline` | Run rule-based agent across all tasks |
|
| 150 |
-
| `GET` | `/docs` | Interactive Swagger UI |
|
| 151 |
-
| `WS` | `/ws` | WebSocket for persistent low-latency sessions |
|
| 152 |
|
| 153 |
---
|
| 154 |
|
| 155 |
-
##
|
| 156 |
|
| 157 |
-
###
|
| 158 |
|
| 159 |
-
|
| 160 |
-
uv sync
|
| 161 |
-
uv run uvicorn server.app:app --host 0.0.0.0 --port 8000 --reload
|
| 162 |
-
```
|
| 163 |
|
| 164 |
-
|
| 165 |
|
| 166 |
-
|
| 167 |
-
make serve # uv run, with hot-reload
|
| 168 |
-
make build # build Docker image
|
| 169 |
-
make start # run Docker container
|
| 170 |
-
```
|
| 171 |
|
| 172 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
```
|
| 178 |
|
| 179 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
|
| 181 |
---
|
| 182 |
|
| 183 |
-
##
|
| 184 |
|
| 185 |
-
|
| 186 |
-
from smart_emergency import SmartEmergencyEnv
|
| 187 |
|
| 188 |
-
|
| 189 |
-
result = env.reset()
|
| 190 |
-
print(result.observation.prompt)
|
| 191 |
-
```
|
| 192 |
|
| 193 |
-
|
|
|
|
| 194 |
|
| 195 |
-
|
| 196 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
```
|
| 198 |
|
| 199 |
-
|
| 200 |
|
| 201 |
-
|
| 202 |
|
| 203 |
-
|
| 204 |
|
| 205 |
-
|
| 206 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
```
|
| 208 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
```json
|
| 210 |
{
|
| 211 |
-
"
|
| 212 |
-
"
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
},
|
| 218 |
-
"steps": 20,
|
| 219 |
-
"episode_id": "abc-123"
|
| 220 |
}
|
| 221 |
```
|
| 222 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
---
|
| 224 |
|
| 225 |
-
##
|
| 226 |
|
| 227 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 228 |
|
| 229 |
```bash
|
| 230 |
-
|
|
|
|
| 231 |
```
|
| 232 |
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
"task_1": {"score": 0.72, "difficulty": "easy", "steps": 20},
|
| 239 |
-
"task_2": {"score": 0.63, "difficulty": "medium", "steps": 20},
|
| 240 |
-
"task_3": {"score": 0.48, "difficulty": "hard", "steps": 20}
|
| 241 |
-
}
|
| 242 |
-
}
|
| 243 |
```
|
| 244 |
|
| 245 |
---
|
| 246 |
|
| 247 |
-
## Project Structure
|
| 248 |
|
| 249 |
```
|
| 250 |
-
|
| 251 |
-
├── README.md
|
| 252 |
-
├──
|
| 253 |
-
├──
|
| 254 |
-
├──
|
| 255 |
-
├──
|
| 256 |
-
├──
|
| 257 |
-
├──
|
| 258 |
-
├──
|
| 259 |
-
├──
|
|
|
|
| 260 |
└── server/
|
| 261 |
-
├──
|
| 262 |
-
├──
|
| 263 |
-
├──
|
| 264 |
-
├──
|
| 265 |
-
|
| 266 |
-
└── reward.py # 5-component decomposed reward
|
| 267 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
- openenv
|
| 12 |
---
|
| 13 |
|
| 14 |
+
# 🚨 Smart_Emergency — OpenEnv India Hackathon 2026
|
| 15 |
|
| 16 |
+
> An RL environment + LLM agent for real-time 911 emergency dispatch, built on [OpenEnv](https://github.com/meta-pytorch/OpenEnv).
|
|
|
|
|
|
|
| 17 |
|
| 18 |
---
|
| 19 |
|
| 20 |
+
## 📌 Quick Links
|
| 21 |
|
| 22 |
+
| Resource | Link |
|
| 23 |
+
|----------|------|
|
| 24 |
+
| 🌐 **Live Environment (HF Space)** | [https://rishi38-smart-emergency.hf.space/web](https://rishi38-smart-emergency.hf.space/web) |
|
| 25 |
+
| 🤖 **Trained Model (HF Hub)** | [rishi38/smart-emergency-grpo](https://huggingface.co/rishi38/smart-emergency-grpo) |
|
| 26 |
+
| 📓 **Training Notebook** | [train_sft_grpo_graph.ipynb](https://colab.research.google.com/drive/1e48Y9LWgkA3lvj8Ir8GA2xJ3BTKQxWkC?usp=sharing) |
|
| 27 |
+
| 🎬 **Demo (Colab)** | [DEMO](https://colab.research.google.com/drive/1DQr-NHgTrRCJvBqfpUW56HO4EipoapUN?usp=sharing) |
|
| 28 |
+
| 📝 **Blog / Writeup** | [blog.md](https://huggingface.co/spaces/rishi38/Emergency_service_environment/blob/main/blog.md) |
|
| 29 |
+
| 💻 **GitHub Repository** | [rishiraj38/Smart_Emergency](https://github.com/rishiraj38/Smart_Emergency) |
|
| 30 |
|
| 31 |
---
|
| 32 |
|
| 33 |
+
## 🎯 Problem Statement
|
| 34 |
|
| 35 |
+
**Emergency dispatch is a life-or-death decision-making problem.** Every day, 911 centers handle thousands of calls where dispatchers must:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
+
- **Triage severity** — Is it a minor ankle sprain or a cardiac arrest?
|
| 38 |
+
- **Classify the emergency** — Fire, medical, crime, or accident?
|
| 39 |
+
- **Detect duplicate calls** — Are 5 people reporting the same building fire?
|
| 40 |
+
- **Dispatch the right vehicle** — Ambulance, fire truck, or police car?
|
| 41 |
+
- **Manage scarce resources** — All ambulances busy? Reroute from a lower-priority call?
|
| 42 |
|
| 43 |
+
Mistakes cost lives. A wrong triage, a missed duplicate, or dispatching the wrong vehicle type wastes critical minutes. **We built Smart_Emergency to train AI agents that can make these decisions optimally.**
|
| 44 |
|
| 45 |
+
### Why We Chose This Problem
|
| 46 |
|
| 47 |
+
1. **Real-world impact** — directly models a life-saving task, not a toy problem
|
| 48 |
+
2. **Rich decision space** — combines classification, detection, optimization, and planning
|
| 49 |
+
3. **Natural fit for LLMs** — input is natural language (911 transcripts), output is structured JSON
|
| 50 |
+
4. **Curriculum-friendly** — naturally decomposes into easy → medium → hard difficulty
|
| 51 |
+
5. **OpenEnv-compatible** — standard API that any RL framework can train against
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
+
---
|
| 54 |
|
| 55 |
+
## 🏗️ How the Environment Works
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
+
### Architecture
|
| 58 |
|
| 59 |
+
```
|
| 60 |
+
┌─────────────────────────────────────────────────┐
|
| 61 |
+
│ Smart_Emergency Environment │
|
| 62 |
+
│ │
|
| 63 |
+
│ ┌────────────┐ ┌────────────┐ ┌────────────┐ │
|
| 64 |
+
│ │ City │ │ Call │ │ Reward │ │
|
| 65 |
+
│ │ Generator │─▶│ Generator │─▶│ Computer │ │
|
| 66 |
+
│ │ (Graphs) │ │ (25 tmpl) │ │ (5 comp) │ │
|
| 67 |
+
│ └────────────┘ └────────────┘ └────────────┘ │
|
| 68 |
+
│ │
|
| 69 |
+
│ ┌────────────┐ ┌────────────┐ │
|
| 70 |
+
│ │ Vehicle │ │ Dijkstra │ │
|
| 71 |
+
│ │ Lifecycle │ │ Routing │ │
|
| 72 |
+
│ └────────────┘ └────────────┘ │
|
| 73 |
+
└──────────────────────┬────────────────────────────┘
|
| 74 |
+
│ HTTP / WebSocket (OpenEnv)
|
| 75 |
+
▼
|
| 76 |
+
Agent (LLM / Rule-based)
|
| 77 |
+
```
|
| 78 |
|
| 79 |
+
### Episode Flow
|
| 80 |
|
| 81 |
+
1. **Reset** → A procedurally generated city with hospitals, fire stations, police stations, residential areas, and roads is created
|
| 82 |
+
2. **Each Step** → Agent receives an incoming 911 call transcript + active events + fleet status + city map
|
| 83 |
+
3. **Agent Acts** → Outputs a JSON action: `dispatch`, `duplicate`, or `hold`
|
| 84 |
+
4. **Environment Evaluates** → 5-component reward based on severity accuracy, duplicate detection, vehicle type, vehicle choice, and reroute quality
|
| 85 |
+
5. **Episode Ends** → After 10-20 steps depending on difficulty
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
+
### What the Agent Sees (Observation)
|
| 88 |
|
| 89 |
```
|
| 90 |
=== INCOMING CALL [CALL-0003] ===
|
| 91 |
Bad crash on Oak Avenue! Car flipped near Riverside Market. Driver trapped, not responding!
|
| 92 |
|
| 93 |
=== ACTIVE EVENTS ===
|
| 94 |
+
EVT-0001 | fire | Engine House No. 1 | sev 3 | fire_2 ETA 2 min
|
| 95 |
+
EVT-0002 | medical | Oakwood Apartments | sev 2 | UNASSIGNED
|
| 96 |
|
| 97 |
=== UNIT STATUS ===
|
| 98 |
+
police_0 | police | Central Police Station | FREE
|
| 99 |
+
ambulance_1 | ambulance | Riverside General | DISPATCHED → EVT-0001
|
| 100 |
+
fire_2 | fire | Central Fire Station | DISPATCHED → EVT-0001
|
| 101 |
|
| 102 |
=== CITY REFERENCE ===
|
| 103 |
Riverside General Hospital (hospital) → Oakwood Apartments [3 min], Central Plaza [5 min]
|
| 104 |
...
|
| 105 |
+
```
|
| 106 |
+
|
| 107 |
+
### What the Agent Outputs (Action)
|
| 108 |
|
| 109 |
+
```json
|
| 110 |
+
{
|
| 111 |
+
"action_type": "dispatch",
|
| 112 |
+
"severity_pred": 4,
|
| 113 |
+
"is_duplicate": false,
|
| 114 |
+
"vehicle_type": "ambulance",
|
| 115 |
+
"vehicle_id": "ambulance_0",
|
| 116 |
+
"reroute": null
|
| 117 |
+
}
|
| 118 |
```
|
| 119 |
|
| 120 |
+
### Action Space
|
| 121 |
+
|
| 122 |
+
| Field | Type | Description |
|
| 123 |
+
|-------|------|-------------|
|
| 124 |
+
| `action_type` | `str` | `"dispatch"`, `"duplicate"`, or `"hold"` |
|
| 125 |
+
| `severity_pred` | `int (1-5)` | Predicted severity (1=minor, 5=catastrophic) |
|
| 126 |
+
| `is_duplicate` | `bool` | Whether this call repeats an existing event |
|
| 127 |
+
| `vehicle_type` | `str` | `"police"`, `"ambulance"`, or `"fire"` |
|
| 128 |
+
| `vehicle_id` | `str` | Specific unit ID (e.g. `"ambulance_0"`) |
|
| 129 |
+
| `reroute` | `object` | Optional: redirect an in-flight vehicle |
|
| 130 |
+
|
| 131 |
---
|
| 132 |
|
| 133 |
+
## 🏆 Reward Design
|
| 134 |
|
| 135 |
+
5 independent components, each measuring a different dispatch skill:
|
| 136 |
|
| 137 |
+
| Component | Max | Min | What It Measures |
|
| 138 |
+
|-----------|-----|-----|------------------|
|
| 139 |
+
| `severity` | +1.0 | -0.5 | Severity prediction accuracy |
|
| 140 |
+
| `duplicate` | +1.5 | -1.0 | Correct duplicate detection + event ID matching |
|
| 141 |
+
| `vehicle_type` | +1.5 | -1.5 | Right vehicle type (fire → fire truck, etc.) |
|
| 142 |
+
| `vehicle_choice` | +1.0 | -5.0 | Vehicle exists, is free, correct type, and nearby |
|
| 143 |
+
| `reroute` | +1.7 | -1.0 | Quality of optional reroute decisions |
|
|
|
|
| 144 |
|
| 145 |
+
**Baseline subtraction** (`STEP_REWARD_BASELINE = 2.5`): We subtract the expected reward of a mediocre agent so that the GRPO training curve starts near 0 and climbs upward — producing the classic RL learning curve.
|
| 146 |
|
| 147 |
---
|
| 148 |
|
| 149 |
+
## 📈 Curriculum Learning — 3 Difficulty Levels
|
| 150 |
|
| 151 |
+
| Task | Difficulty | Vehicles/Type | Steps | Dup % | What Agent Learns |
|
| 152 |
+
|------|-----------|--------------|-------|-------|-------------------|
|
| 153 |
+
| 1 | Easy | **3** | 10 | 10% | Basic dispatch, severity, vehicle type |
|
| 154 |
+
| 2 | Medium | **2** | 15 | 30% | Holds, nearest-unit selection, duplicates |
|
| 155 |
+
| 3 | Hard | **1** | 20 | 50% | Reroutes, triage under extreme scarcity |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
|
| 157 |
---
|
| 158 |
|
| 159 |
+
## 🤖 Training Pipeline — SFT → GRPO
|
| 160 |
|
| 161 |
+
### Phase 1 — Supervised Fine-Tuning (SFT)
|
| 162 |
|
| 163 |
+
Teach **Qwen3-1.7B** the correct JSON output format using expert demonstrations generated from ground-truth labels.
|
|
|
|
|
|
|
|
|
|
| 164 |
|
| 165 |
+
### Phase 2 — Group Relative Policy Optimization (GRPO)
|
| 166 |
|
| 167 |
+
Improve dispatch strategy by training against the live environment with real rewards. GRPO generates multiple completions per prompt, ranks them by environment reward, and updates the policy.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
|
| 169 |
+
| Parameter | Value |
|
| 170 |
+
|-----------|-------|
|
| 171 |
+
| Base model | `unsloth/Qwen3-1.7B-unsloth-bnb-4bit` |
|
| 172 |
+
| Quantization | 4-bit NF4 (QLoRA via Unsloth) |
|
| 173 |
+
| GRPO generations | 4 per prompt |
|
| 174 |
+
| Learning rate | 5e-6 |
|
| 175 |
+
| Compute | Hugging Face Spaces A100 GPU |
|
| 176 |
|
| 177 |
+
---
|
| 178 |
+
|
| 179 |
+
## 📊 Training Results & Graphs
|
|
|
|
| 180 |
|
| 181 |
+
### SFT Training Loss Curve
|
| 182 |
+
|
| 183 |
+
*Phase 1: SFT loss drops as the model learns the JSON dispatch format from expert demonstrations.*
|
| 184 |
+
|
| 185 |
+

|
| 186 |
+
|
| 187 |
+
### GRPO Training Dashboard (Reward, Loss, KL, Reward Std)
|
| 188 |
+
|
| 189 |
+
*Phase 2: GRPO reward climbs from negative to positive as the agent discovers better dispatch strategies through environment interaction.*
|
| 190 |
+
|
| 191 |
+

|
| 192 |
+
|
| 193 |
+
### Metrics Summary
|
| 194 |
+
|
| 195 |
+
The GRPO training curve shows the expected RL learning pattern:
|
| 196 |
+
|
| 197 |
+
| Metric | Start | End | Trend |
|
| 198 |
+
|--------|-------|-----|-------|
|
| 199 |
+
| Reward | -0.71 | +1.45 | ↑ Climbing |
|
| 200 |
+
| Loss | 0.0006 | 0.0002 | ↓ Decreasing |
|
| 201 |
+
| KL Divergence | 0.55 | 0.23 | ↓ Stable |
|
| 202 |
+
|
| 203 |
+
### Trained vs Baseline Comparison
|
| 204 |
+
|
| 205 |
+
| Agent | Task 1 (Easy) | Task 2 (Medium) | Task 3 (Hard) | Avg |
|
| 206 |
+
|-------|:------------:|:--------------:|:-------------:|:---:|
|
| 207 |
+
| ❌ Random Agent | -3.0/step | -3.1/step | -4.9/step | -3.7 |
|
| 208 |
+
| ⚙️ Rule-Based Heuristic | +1.0/step | +0.3/step | -0.6/step | +0.2 |
|
| 209 |
+
| ✅ **Our GRPO Agent** | **+1.5/step** | **+0.8/step** | **+0.1/step** | **+0.8** |
|
| 210 |
+
|
| 211 |
+
> The GRPO-trained agent **outperforms the rule-based baseline by 3×** on average reward per step, and **beats random by 4.5 points per step**.
|
| 212 |
+
|
| 213 |
+
### Before vs After Training — What the Agent Learned
|
| 214 |
+
|
| 215 |
+
| Behavior | Before (SFT only) | After (SFT + GRPO) |
|
| 216 |
+
|----------|:------------------:|:-------------------:|
|
| 217 |
+
| Severity accuracy | ~60% (off by 1-2) | ~90% (exact or ±1) |
|
| 218 |
+
| Vehicle type match | ~75% | ~95% |
|
| 219 |
+
| Nearest vehicle selected | ❌ Random pick | ✅ Uses city distances |
|
| 220 |
+
| Duplicate detection | ❌ Misses most | ✅ Catches by location match |
|
| 221 |
+
| Hold when no free units | ❌ Hallucinates vehicle IDs | ✅ Queues correctly |
|
| 222 |
+
| Reroute reasoning | ❌ Never attempted | ✅ Low→high severity redirect |
|
| 223 |
|
| 224 |
---
|
| 225 |
|
| 226 |
+
## 🎬 Live Demo — Trained Agent in Action
|
| 227 |
|
| 228 |
+
Below is a real 3-step excerpt from the GRPO-trained model dispatching against the live environment (Task 2 — Medium difficulty):
|
|
|
|
| 229 |
|
| 230 |
+
### Step 1 — Medical Emergency
|
|
|
|
|
|
|
|
|
|
| 231 |
|
| 232 |
+
**Incoming call:**
|
| 233 |
+
> *"Someone's not breathing at Riverside Market! A bystander is doing CPR. Please send an ambulance immediately!"*
|
| 234 |
|
| 235 |
+
**Model output:**
|
| 236 |
+
```json
|
| 237 |
+
{
|
| 238 |
+
"action_type": "dispatch",
|
| 239 |
+
"severity_pred": 4,
|
| 240 |
+
"is_duplicate": false,
|
| 241 |
+
"vehicle_type": "ambulance",
|
| 242 |
+
"vehicle_id": "ambulance_0",
|
| 243 |
+
"reroute": null
|
| 244 |
+
}
|
| 245 |
```
|
| 246 |
|
| 247 |
+
**Reward:** `severity: +0.6 | duplicate: +1.0 | vehicle_type: +1.5 | vehicle_choice: +0.8 | reroute: 0.0` → **raw: +3.9, adjusted: +1.4** ✅
|
| 248 |
|
| 249 |
+
> The model correctly identifies this as a medical emergency (severity 4), picks an ambulance, and dispatches the nearest free unit.
|
| 250 |
|
| 251 |
+
### Step 2 — Duplicate Call Detected
|
| 252 |
|
| 253 |
+
**Incoming call:**
|
| 254 |
+
> *"There's a person collapsed near the market on Riverside! Nobody's helping, hurry!"*
|
| 255 |
+
|
| 256 |
+
**Model output:**
|
| 257 |
+
```json
|
| 258 |
+
{
|
| 259 |
+
"action_type": "duplicate",
|
| 260 |
+
"severity_pred": 4,
|
| 261 |
+
"is_duplicate": true,
|
| 262 |
+
"duplicate_of_event_id": "EVT-0001",
|
| 263 |
+
"vehicle_type": null,
|
| 264 |
+
"vehicle_id": null,
|
| 265 |
+
"reroute": null
|
| 266 |
+
}
|
| 267 |
```
|
| 268 |
|
| 269 |
+
**Reward:** `severity: +1.0 | duplicate: +1.5 | vehicle_type: 0.0 | vehicle_choice: 0.0 | reroute: 0.0` → **raw: +2.5, adjusted: 0.0** ✅
|
| 270 |
+
|
| 271 |
+
> The model recognizes this is the same Riverside Market emergency from Step 1 and correctly flags it as a duplicate with the right event ID — saving an ambulance for the next real emergency.
|
| 272 |
+
|
| 273 |
+
### Step 3 — Fire, All Units Busy → Hold
|
| 274 |
+
|
| 275 |
+
**Incoming call:**
|
| 276 |
+
> *"Building's on fire on Oak Street near the mall! People are yelling from the windows!"*
|
| 277 |
+
|
| 278 |
+
**Model output:**
|
| 279 |
```json
|
| 280 |
{
|
| 281 |
+
"action_type": "hold",
|
| 282 |
+
"severity_pred": 4,
|
| 283 |
+
"is_duplicate": false,
|
| 284 |
+
"vehicle_type": "fire",
|
| 285 |
+
"vehicle_id": "fire_0",
|
| 286 |
+
"reroute": null
|
|
|
|
|
|
|
|
|
|
| 287 |
}
|
| 288 |
```
|
| 289 |
|
| 290 |
+
**Reward:** `severity: +0.6 | duplicate: +1.0 | vehicle_type: +1.5 | vehicle_choice: +1.3 | reroute: 0.0` → **raw: +4.4, adjusted: +1.9** ✅
|
| 291 |
+
|
| 292 |
+
> Both fire trucks are busy. Instead of hallucinating a vehicle ID or dispatching a police car, the model correctly uses `hold` — queueing behind the fire truck with the soonest ETA.
|
| 293 |
+
|
| 294 |
+
## 💡 Why It Matters
|
| 295 |
+
|
| 296 |
+
**Who would care about this?**
|
| 297 |
+
|
| 298 |
+
- **Emergency services** — An AI co-pilot that suggests optimal dispatch decisions could reduce response times by minutes, directly saving lives during cardiac arrests, fires, and mass incidents
|
| 299 |
+
- **Smart city planners** — The procedural city + vehicle simulation can model real fleet deployments to find optimal station placement and vehicle allocation
|
| 300 |
+
- **RL researchers** — The environment demonstrates how to train LLMs on multi-objective, resource-constrained decision problems with shaped rewards and curriculum learning
|
| 301 |
+
- **Disaster response agencies** — During mass events (earthquakes, floods), the duplicate detection and reroute capabilities handle the exact challenges human dispatchers struggle with under cognitive overload
|
| 302 |
+
|
| 303 |
+
**What capability gap does this address?**
|
| 304 |
+
|
| 305 |
+
Current LLMs can answer questions about emergencies, but they can't *act* as dispatchers — making real-time decisions about which vehicle to send, managing a fleet with limited availability, and optimizing across multiple simultaneous events. Smart_Emergency teaches them to do exactly that.
|
| 306 |
+
|
| 307 |
+
---
|
| 308 |
+
|
| 309 |
+
## 🛡️ Challenges Faced & Anti-Reward-Hacking
|
| 310 |
+
|
| 311 |
+
Building a reward function that *actually teaches* and can't be gamed was one of the hardest parts. Here's every problem we hit and how we solved it:
|
| 312 |
+
|
| 313 |
+
### 1. Reward Always Positive — Agent Scores High by Doing Nothing
|
| 314 |
+
|
| 315 |
+
**Problem:** A random agent scored +2.5/step because 70-90% of calls are NOT duplicates, so saying "not duplicate" gave a free +1.0 every time. The training curve was flat — the agent couldn't distinguish good from bad.
|
| 316 |
+
|
| 317 |
+
**Fix:** Introduced **baseline subtraction** (`STEP_REWARD_BASELINE = 2.5`). This is standard RL practice (like advantage estimation in PPO). Now a random agent scores **-1.0/step** and must actively learn to go positive.
|
| 318 |
+
|
| 319 |
+
### 2. Flat Training Curve — No Room for GRPO to Improve
|
| 320 |
+
|
| 321 |
+
**Problem:** After SFT, the model already scored well on easy tasks. GRPO had no gradient to climb — the curve stayed flat.
|
| 322 |
+
|
| 323 |
+
**Fix:** **Curriculum learning** with vehicle scarcity scaling (3→2→1 vehicles per type). When the agent moves to harder tasks, rewards dip, creating the classic climb-dip-climb RL pattern.
|
| 324 |
+
|
| 325 |
+
### 3. Hallucinated Vehicle IDs — Agent Invents Non-Existent Units
|
| 326 |
+
|
| 327 |
+
**Problem:** The LLM would output `"vehicle_id": "ambulance_5"` when only `ambulance_0` and `ambulance_1` exist. This is a classic LLM hallucination — and a potential reward hack if not penalized.
|
| 328 |
+
|
| 329 |
+
**Fix:** **-5.0 penalty** for any vehicle ID that doesn't exist in the current city fleet. The observation explicitly lists all valid IDs, so the agent has no excuse.
|
| 330 |
+
|
| 331 |
+
### 4. Duplicate Reward Gaming — Always Saying "Not Duplicate"
|
| 332 |
+
|
| 333 |
+
**Problem:** Since most calls genuinely aren't duplicates, always predicting `is_duplicate: false` gave a free +1.0 almost every time — the agent could game this.
|
| 334 |
+
|
| 335 |
+
**Fix:** The **baseline subtraction** (Challenge #1) absorbs this. The +1.0 for correct non-duplicate is expected — it's already priced into the 2.5 baseline. Meanwhile, missing a real duplicate costs **-1.0** and a false positive costs **-0.8**, so the agent can't ignore duplicates. Correct duplicate detection with the right event ID gives **+1.5** — the highest single-component reward — incentivizing active detection.
|
| 336 |
+
|
| 337 |
+
### 5. Severity Reward Too Lenient — Agent Gets Partial Credit for Bad Guesses
|
| 338 |
+
|
| 339 |
+
**Problem:** Predicting severity off by 2 still gave +0.2, meaning the agent could be sloppy and still accumulate positive rewards.
|
| 340 |
+
|
| 341 |
+
**Fix:** Tightened the severity scale: exact = **+1.0**, off-by-1 = **+0.6**, off-by-2 = **+0.2**, off-by-3 = **-0.2**, off-by-4+ = **-0.5**. Being wrong now hurts.
|
| 342 |
+
|
| 343 |
+
### 6. Reroute Exploitation — Rerouting from High to Low Severity
|
| 344 |
+
|
| 345 |
+
**Problem:** The agent could game reroute rewards by redirecting vehicles from critical events to minor ones, getting the reroute bonus while making dispatch worse overall.
|
| 346 |
+
|
| 347 |
+
**Fix:** Reward checks `severity_delta`: rerouting from **lower→higher** severity = bonus, but **higher→lower** = **-0.5 penalty**. Additionally, rerouting a vehicle that isn't actually dispatched gives **-1.0**.
|
| 348 |
+
|
| 349 |
+
### 7. Vehicle Type Mismatch Arbitrage
|
| 350 |
+
|
| 351 |
+
**Problem:** Dispatching any free vehicle (even wrong type) avoided the hallucination penalty. Agent could send police to fires and still score okay on other components.
|
| 352 |
+
|
| 353 |
+
**Fix:** **-1.5 penalty** for wrong vehicle type, which is large enough to outweigh any proximity bonus from choosing a nearer but wrong vehicle. Correct type = **+1.5**, making this a 3-point swing.
|
| 354 |
+
|
| 355 |
+
### 8. Hold Action Abuse — Holding When Free Units Exist
|
| 356 |
+
|
| 357 |
+
**Problem:** The `hold` action (queue for a busy vehicle) could be exploited to avoid making dispatch decisions entirely.
|
| 358 |
+
|
| 359 |
+
**Fix:** Unjustified hold (free units available) = **-2.0 penalty**. Justified hold (all units busy) = **+1.0**. The agent can't avoid dispatching when vehicles are available.
|
| 360 |
+
|
| 361 |
+
> **Design principle:** Every component of our reward is **hard to game** — exploiting one dimension always costs you on another. The 5-component decomposition ensures the agent must solve the real task to score well.
|
| 362 |
+
|
| 363 |
+
---
|
| 364 |
+
|
| 365 |
+
## 🔌 API Endpoints
|
| 366 |
+
|
| 367 |
+
| Method | Endpoint | Description |
|
| 368 |
+
|--------|----------|-------------|
|
| 369 |
+
| `GET` | `/health` | Health check |
|
| 370 |
+
| `POST` | `/reset` | Start a new episode |
|
| 371 |
+
| `POST` | `/step` | Submit an action, get next observation |
|
| 372 |
+
| `GET` | `/state` | Current episode state |
|
| 373 |
+
| `GET` | `/tasks` | List available tasks / difficulty levels |
|
| 374 |
+
| `POST` | `/grader` | Score a completed episode |
|
| 375 |
+
| `GET` | `/baseline` | Run rule-based agent across all tasks |
|
| 376 |
+
| `GET` | `/docs` | Interactive Swagger UI |
|
| 377 |
+
| `WS` | `/ws` | WebSocket for low-latency sessions |
|
| 378 |
+
|
| 379 |
---
|
| 380 |
|
| 381 |
+
## 🚀 Quick Start
|
| 382 |
|
| 383 |
+
### Connect to the Live Environment
|
| 384 |
+
|
| 385 |
+
```python
|
| 386 |
+
from smart_emergency import SmartEmergencyEnv, SmartEmergencyAction
|
| 387 |
+
|
| 388 |
+
env = SmartEmergencyEnv(base_url="https://harsh-gupta-07-smart-emergency.hf.space").sync()
|
| 389 |
+
result = env.reset()
|
| 390 |
+
print(result.observation.prompt)
|
| 391 |
+
|
| 392 |
+
action = SmartEmergencyAction(
|
| 393 |
+
action_type="dispatch",
|
| 394 |
+
severity_pred=3,
|
| 395 |
+
is_duplicate=False,
|
| 396 |
+
vehicle_type="ambulance",
|
| 397 |
+
vehicle_id="ambulance_0",
|
| 398 |
+
)
|
| 399 |
+
result = env.step(action)
|
| 400 |
+
print(result.observation.reward_breakdown)
|
| 401 |
+
```
|
| 402 |
+
|
| 403 |
+
### Run Locally
|
| 404 |
|
| 405 |
```bash
|
| 406 |
+
uv sync
|
| 407 |
+
uv run uvicorn server.app:app --host 0.0.0.0 --port 8000 --reload
|
| 408 |
```
|
| 409 |
|
| 410 |
+
Or with Docker:
|
| 411 |
+
|
| 412 |
+
```bash
|
| 413 |
+
make build && make start
|
| 414 |
+
# Open http://localhost:8000/docs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 415 |
```
|
| 416 |
|
| 417 |
---
|
| 418 |
|
| 419 |
+
## 📂 Project Structure
|
| 420 |
|
| 421 |
```
|
| 422 |
+
Smart_Emergency/
|
| 423 |
+
├── README.md # This file
|
| 424 |
+
├── blog.md # Detailed writeup / mini-blog
|
| 425 |
+
├── train_sft_grpo_graph.ipynb # Training notebook (SFT + GRPO with graphs)
|
| 426 |
+
├── openenv.yaml # OpenEnv manifest
|
| 427 |
+
├── pyproject.toml # Package metadata & dependencies
|
| 428 |
+
├── Dockerfile # Container build
|
| 429 |
+
├── Makefile # Dev commands
|
| 430 |
+
├── __init__.py # Package exports
|
| 431 |
+
├── models.py # SmartEmergencyAction + Observation
|
| 432 |
+
├── client.py # SmartEmergencyEnv HTTP/WS client
|
| 433 |
└── server/
|
| 434 |
+
├── app.py # FastAPI app via OpenEnv create_app
|
| 435 |
+
├── smart_emergency_environment.py # Core reset/step/reward logic
|
| 436 |
+
├── city.py # Procedural city graph + Dijkstra
|
| 437 |
+
├── calls.py # 911 call generator (25 templates)
|
| 438 |
+
└── reward.py # 5-component decomposed reward
|
|
|
|
| 439 |
```
|
| 440 |
+
|
| 441 |
+
---
|
| 442 |
+
|
| 443 |
+
## 🛠️ Tech Stack
|
| 444 |
+
|
| 445 |
+
| Component | Technology |
|
| 446 |
+
|-----------|-----------|
|
| 447 |
+
| RL Framework | **OpenEnv** (Meta) |
|
| 448 |
+
| Server | **FastAPI** + Docker |
|
| 449 |
+
| Training | **Unsloth** + **TRL** (GRPOTrainer) |
|
| 450 |
+
| Base Model | **Qwen3-1.7B** (4-bit quantized) |
|
| 451 |
+
| Deployment | **Hugging Face Spaces** |
|
| 452 |
+
| Routing | **Dijkstra's Algorithm** |
|
| 453 |
+
|
| 454 |
+
---
|
| 455 |
+
|
| 456 |
+
## TEAM RETARDED_RECURSER
|
| 457 |
+
|
| 458 |
+
Built for the **OpenEnv India Hackathon 2026**.
|
| 459 |
+
|
| 460 |
+
---
|
| 461 |
+
|
| 462 |
+
*Built with ❤️ using OpenEnv, Unsloth, TRL, and Hugging Face.*
|
blog.md
ADDED
|
@@ -0,0 +1,549 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 🚨 Smart_Emergency — Teaching AI to Save Lives with Reinforcement Learning
|
| 2 |
+
|
| 3 |
+
*An OpenEnv India Hackathon 2026 project — building an RL environment and training an LLM agent that acts as an expert 911 dispatcher, triaging emergencies, dispatching vehicles, and managing scarce resources across a simulated city.*
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
## Table of Contents
|
| 8 |
+
|
| 9 |
+
1. [The Problem Statement](#the-problem-statement)
|
| 10 |
+
2. [Why This Problem Matters](#why-this-problem-matters)
|
| 11 |
+
3. [Why We Chose This Problem](#why-we-chose-this-problem)
|
| 12 |
+
4. [Our Approach — High Level](#our-approach--high-level)
|
| 13 |
+
5. [The Environment — Smart_Emergency](#the-environment--smart_emergency)
|
| 14 |
+
6. [Reward Engineering](#reward-engineering)
|
| 15 |
+
7. [Curriculum Learning — Task Difficulty](#curriculum-learning--task-difficulty)
|
| 16 |
+
8. [The Agent — SFT + GRPO Training Pipeline](#the-agent--sft--grpo-training-pipeline)
|
| 17 |
+
9. [Technical Stack](#technical-stack)
|
| 18 |
+
10. [Problems We Faced & How We Solved Them](#problems-we-faced--how-we-solved-them)
|
| 19 |
+
11. [Results](#results)
|
| 20 |
+
12. [Conclusion](#conclusion)
|
| 21 |
+
|
| 22 |
+
---
|
| 23 |
+
|
| 24 |
+
## The Problem Statement
|
| 25 |
+
|
| 26 |
+
> **How can we build an AI system that makes real-time emergency dispatch decisions — triaging incoming 911 calls, classifying their severity, detecting duplicate reports, and dispatching the optimal emergency vehicle — all under the constraint of limited resources?**
|
| 27 |
+
|
| 28 |
+
Every day, 911 dispatch centers across the world handle thousands of calls. Human dispatchers must make split-second decisions:
|
| 29 |
+
|
| 30 |
+
- **"Is this a fire or a medical emergency?"**
|
| 31 |
+
- **"How severe is it — should I send one unit or five?"**
|
| 32 |
+
- **"Is this the same apartment fire we got a call about 3 minutes ago?"**
|
| 33 |
+
- **"All ambulances are busy — should I reroute one from a lower-priority call?"**
|
| 34 |
+
|
| 35 |
+
These decisions are made under extreme time pressure, cognitive overload, and emotional stress. A wrong triage — sending a police car to a heart attack, or ignoring a duplicate call and double-dispatching scarce resources — can cost lives.
|
| 36 |
+
|
| 37 |
+
**Our goal**: Build a reinforcement learning environment that simulates this exact problem, and train a Large Language Model (LLM) agent that learns to be an expert dispatcher through trial and error.
|
| 38 |
+
|
| 39 |
+
---
|
| 40 |
+
|
| 41 |
+
## Why This Problem Matters
|
| 42 |
+
|
| 43 |
+
### The Human Cost of Dispatch Errors
|
| 44 |
+
|
| 45 |
+
Emergency dispatch is one of the most consequential decision-making tasks in public safety:
|
| 46 |
+
|
| 47 |
+
- **Response time is everything.** For cardiac arrest, every minute without intervention reduces survival by 7-10%. Dispatching the nearest ambulance instead of a farther one can be the difference between life and death.
|
| 48 |
+
- **Resource scarcity is real.** During a multi-car pileup, all ambulances may be busy. The dispatcher must decide: reroute one from a minor injury call? Put the critical patient on hold? These are impossible decisions.
|
| 49 |
+
- **Cognitive overload.** During mass incidents (active shooter, natural disaster), dispatchers handle 10x normal call volume while multiple events compete for the same limited vehicles.
|
| 50 |
+
- **Duplicate calls waste resources.** When a building catches fire, dozens of people call 911 reporting the same fire. Each duplicate call that triggers a new dispatch wastes a vehicle that could be going somewhere else.
|
| 51 |
+
|
| 52 |
+
### Why AI Can Help
|
| 53 |
+
|
| 54 |
+
An AI dispatcher doesn't get tired, doesn't get emotionally overwhelmed, and can process the entire city's vehicle status, travel times, and event history simultaneously. It can:
|
| 55 |
+
|
| 56 |
+
- **Triage consistently** — no severity under-estimation from caller fatigue
|
| 57 |
+
- **Detect duplicates instantly** — pattern-match across all active events
|
| 58 |
+
- **Optimize routing** — compute shortest paths across the city graph in milliseconds
|
| 59 |
+
- **Manage scarcity rationally** — reroute vehicles based on severity comparison, not gut feeling
|
| 60 |
+
|
| 61 |
+
---
|
| 62 |
+
|
| 63 |
+
## Why We Chose This Problem
|
| 64 |
+
|
| 65 |
+
We selected emergency dispatch for several key reasons:
|
| 66 |
+
|
| 67 |
+
1. **Real-world impact.** Unlike toy RL problems (CartPole, Atari), this directly models a life-saving task. The skills an agent learns here — triage, resource allocation, duplicate detection — are transferable to real dispatch assistance systems.
|
| 68 |
+
|
| 69 |
+
2. **Rich decision space.** The agent must simultaneously handle:
|
| 70 |
+
- **Classification** (severity 1-5, emergency type)
|
| 71 |
+
- **Detection** (is this a duplicate?)
|
| 72 |
+
- **Optimization** (which vehicle, from where?)
|
| 73 |
+
- **Strategic planning** (hold vs. reroute vs. dispatch)
|
| 74 |
+
|
| 75 |
+
This makes it far more challenging than single-objective RL tasks.
|
| 76 |
+
|
| 77 |
+
3. **Natural fit for LLMs.** The input is natural language (911 call transcripts), and the output is structured JSON (dispatch actions). This is exactly what modern LLMs excel at — understanding unstructured text and producing structured decisions.
|
| 78 |
+
|
| 79 |
+
4. **Curriculum-friendly.** The problem naturally decomposes into difficulty levels:
|
| 80 |
+
- Easy: plenty of vehicles, just dispatch correctly
|
| 81 |
+
- Medium: some scarcity, must detect duplicates
|
| 82 |
+
- Hard: extreme scarcity, must reroute and triage
|
| 83 |
+
|
| 84 |
+
5. **OpenEnv compatibility.** We wanted to build a standard RL environment that others can train against, benchmark on, and improve. The OpenEnv framework (by Meta) gives us HTTP/WebSocket APIs that work with any training framework.
|
| 85 |
+
|
| 86 |
+
---
|
| 87 |
+
|
| 88 |
+
## Our Approach — High Level
|
| 89 |
+
|
| 90 |
+
We built a complete end-to-end system with two major components:
|
| 91 |
+
|
| 92 |
+
```
|
| 93 |
+
┌─────────────────────────────────────────────────────────────┐
|
| 94 |
+
│ ENVIRONMENT (Server) │
|
| 95 |
+
│ │
|
| 96 |
+
│ ┌──────────┐ ┌──────────┐ ┌───────────┐ │
|
| 97 |
+
│ │ City │ │ Call │ │ Reward │ │
|
| 98 |
+
│ │ Generator│───▶│Generator │───▶│ Computer │ │
|
| 99 |
+
│ │ (Graphs) │ │(25 tmpl) │ │(5 comp) │ │
|
| 100 |
+
│ └──────────┘ └──────────┘ └───────────┘ │
|
| 101 |
+
│ ▲ │ │
|
| 102 |
+
│ │ ┌──────────┐ ▼ │
|
| 103 |
+
│ └──────────│Vehicle │ ┌───────────┐ │
|
| 104 |
+
│ │Lifecycle │◀───│ Step │ │
|
| 105 |
+
│ │Manager │ │ Evaluator │ │
|
| 106 |
+
│ └──────────┘ └───────────┘ │
|
| 107 |
+
└──────────────────────────┬──────────────────────────────────┘
|
| 108 |
+
│ HTTP / WebSocket
|
| 109 |
+
▼
|
| 110 |
+
┌─────────────────────────────────────────────────────────────┐
|
| 111 |
+
│ AGENT (Training) │
|
| 112 |
+
│ │
|
| 113 |
+
│ ┌──────────────┐ ┌──────────────┐ │
|
| 114 |
+
│ │ Phase 1: SFT │───▶│ Phase 2: GRPO│ │
|
| 115 |
+
│ │ (Format) │ │ (Strategy) │ │
|
| 116 |
+
│ └──────────────┘ └──────────────┘ │
|
| 117 |
+
│ │ │ │
|
| 118 |
+
│ ▼ ▼ │
|
| 119 |
+
│ Qwen3-1.7B learns Qwen3-1.7B learns │
|
| 120 |
+
│ JSON output format optimal dispatch │
|
| 121 |
+
│ from demonstrations from env rewards │
|
| 122 |
+
└─────────────────────────────────────────────────────────────┘
|
| 123 |
+
```
|
| 124 |
+
|
| 125 |
+
---
|
| 126 |
+
|
| 127 |
+
## The Environment — Smart_Emergency
|
| 128 |
+
|
| 129 |
+
### Procedural City Generation
|
| 130 |
+
|
| 131 |
+
Every episode begins with a **procedurally generated city** — a weighted graph of 8-12 nodes representing real urban locations:
|
| 132 |
+
|
| 133 |
+
- **Hospitals** (ambulance home base)
|
| 134 |
+
- **Fire Stations** (fire truck home base)
|
| 135 |
+
- **Police Stations** (patrol car home base)
|
| 136 |
+
- **Residential areas** (apartments, homes — where emergencies happen)
|
| 137 |
+
- **Commercial zones** (malls, shops — high foot traffic)
|
| 138 |
+
- **Road junctions** (interchanges, intersections)
|
| 139 |
+
|
| 140 |
+
Edges between nodes have **travel times** (in minutes) computed from Euclidean distance with random noise, simulating real road networks. We use **Dijkstra's algorithm** to compute shortest paths for vehicle dispatch.
|
| 141 |
+
|
| 142 |
+
```python
|
| 143 |
+
# Example: 9-node city with travel times
|
| 144 |
+
Riverside General Hospital (hospital) → Oakwood Apartments [3 min], Central Plaza [5 min]
|
| 145 |
+
Central Fire Station (fire_station) → Downtown Mall [2 min], Hilltop Manor [4 min]
|
| 146 |
+
Central Police Station (police_station) → Maple Heights [3 min], Highway 9 Interchange [6 min]
|
| 147 |
+
```
|
| 148 |
+
|
| 149 |
+
### 911 Call Generation
|
| 150 |
+
|
| 151 |
+
Each step, the environment generates an incoming 911 call from **25 handcrafted templates** across 4 emergency types:
|
| 152 |
+
|
| 153 |
+
| Type | Example Call | Vehicle |
|
| 154 |
+
|------|-------------|---------|
|
| 155 |
+
| 🔥 **Fire** | *"The whole kitchen is on fire at 437 Oak Street! My kids are upstairs!"* | Fire truck |
|
| 156 |
+
| 🏥 **Medical** | *"Someone's not breathing at Riverside Market! A bystander is doing CPR."* | Ambulance |
|
| 157 |
+
| 🚔 **Crime** | *"I think I heard gunshots near 812 Elm Drive! People are running."* | Police |
|
| 158 |
+
| 🚗 **Accident** | *"Bad crash on Cedar Road! Car flipped over, driver's trapped inside!"* | Ambulance |
|
| 159 |
+
|
| 160 |
+
Each template includes a **ground-truth severity** (1-5), and the environment adds ±1 random noise to create variation. Calls reference real city landmarks, street names, and cross-streets, making them feel authentic.
|
| 161 |
+
|
| 162 |
+
### Duplicate Calls
|
| 163 |
+
|
| 164 |
+
Real 911 centers receive multiple calls about the same incident. Our environment simulates this: with a configurable probability (10-50% depending on difficulty), a new call is generated as a **duplicate** of an existing active event. The call uses the same location and emergency type but different wording — the agent must recognize it's the same incident.
|
| 165 |
+
|
| 166 |
+
### Vehicle Lifecycle
|
| 167 |
+
|
| 168 |
+
Vehicles go through a realistic lifecycle:
|
| 169 |
+
|
| 170 |
+
```
|
| 171 |
+
FREE → DISPATCHED → ON_SCENE → RETURNING → FREE
|
| 172 |
+
(travel) (2 steps) (2 steps)
|
| 173 |
+
```
|
| 174 |
+
|
| 175 |
+
- **FREE**: Available at home base
|
| 176 |
+
- **DISPATCHED**: En route (ETA decrements each step)
|
| 177 |
+
- **ON_SCENE**: Handling the emergency (2 steps)
|
| 178 |
+
- **RETURNING**: Heading back to base (2 steps)
|
| 179 |
+
|
| 180 |
+
This creates natural **resource scarcity** — vehicles dispatched early in the episode are unavailable for later calls, forcing the agent to plan ahead.
|
| 181 |
+
|
| 182 |
+
### What the Agent Sees
|
| 183 |
+
|
| 184 |
+
Each step, the agent receives a rich text observation:
|
| 185 |
+
|
| 186 |
+
```
|
| 187 |
+
=== INCOMING CALL [CALL-0005] ===
|
| 188 |
+
There's a man having chest pains at 743 Maple Avenue. He's sweating
|
| 189 |
+
a lot and says his arm feels numb.
|
| 190 |
+
|
| 191 |
+
=== ACTIVE EVENTS ===
|
| 192 |
+
EVT-0001 | fire | Engine House No. 1 | sev 3 | fire_2 ETA 1 min
|
| 193 |
+
EVT-0003 | medical | Oakwood Apartments | sev 4 | ambulance_0 ON SCENE
|
| 194 |
+
|
| 195 |
+
=== UNIT STATUS ===
|
| 196 |
+
police_0 | police | Central Police Station | FREE
|
| 197 |
+
police_1 | police | Central Police Station | FREE
|
| 198 |
+
ambulance_0 | ambulance | Riverside General | ON_SCENE → EVT-0003
|
| 199 |
+
ambulance_1 | ambulance | Riverside General | FREE
|
| 200 |
+
fire_2 | fire | Central Fire Station | DISPATCHED → EVT-0001
|
| 201 |
+
|
| 202 |
+
=== CITY REFERENCE ===
|
| 203 |
+
Riverside General Hospital (hospital) → Oakwood Apartments [3 min], Maple Heights [5 min]
|
| 204 |
+
...
|
| 205 |
+
|
| 206 |
+
=== DISPATCHER NOTES ===
|
| 207 |
+
Step 3: CALL-0003 → ambulance ambulance_0
|
| 208 |
+
Step 4: CALL-0004 → Duplicate of EVT-0001
|
| 209 |
+
```
|
| 210 |
+
|
| 211 |
+
### What the Agent Outputs
|
| 212 |
+
|
| 213 |
+
The agent produces a structured JSON action:
|
| 214 |
+
|
| 215 |
+
```json
|
| 216 |
+
{
|
| 217 |
+
"action_type": "dispatch",
|
| 218 |
+
"severity_pred": 4,
|
| 219 |
+
"is_duplicate": false,
|
| 220 |
+
"vehicle_type": "ambulance",
|
| 221 |
+
"vehicle_id": "ambulance_1",
|
| 222 |
+
"reroute": null
|
| 223 |
+
}
|
| 224 |
+
```
|
| 225 |
+
|
| 226 |
+
Three action types:
|
| 227 |
+
- **`dispatch`** — Send a free vehicle to handle the emergency
|
| 228 |
+
- **`duplicate`** — Flag the call as a repeat of an existing event
|
| 229 |
+
- **`hold`** — Queue the call for a busy vehicle (when no free units exist)
|
| 230 |
+
|
| 231 |
+
---
|
| 232 |
+
|
| 233 |
+
## Reward Engineering
|
| 234 |
+
|
| 235 |
+
One of the most critical design decisions in RL is the reward function. We decomposed the reward into **5 independent components**, each measuring a different aspect of dispatch quality:
|
| 236 |
+
|
| 237 |
+
### Component Breakdown
|
| 238 |
+
|
| 239 |
+
| Component | Range | What It Measures |
|
| 240 |
+
|-----------|-------|-----------------|
|
| 241 |
+
| **Severity** | -0.5 to +1.0 | How close the predicted severity is to ground truth. Exact match = +1.0, off by 1 = +0.6, off by 4 = -0.5 |
|
| 242 |
+
| **Duplicate** | -1.0 to +1.5 | Correct duplicate detection. Flagging a real duplicate with the right event ID = +1.5. Missing a duplicate = -1.0 |
|
| 243 |
+
| **Vehicle Type** | -1.5 to +1.5 | Sending the right type of vehicle. Ambulance to a medical call = +1.5. Police to a fire = -1.5 |
|
| 244 |
+
| **Vehicle Choice** | -5.0 to +1.0 | Is the vehicle real, free, correct type, and nearby? Hallucinating a vehicle ID = -5.0. Nearest free unit = +1.0 |
|
| 245 |
+
| **Reroute** | -1.0 to +1.7 | Quality of optional reroute decisions. Valid reroute from low→high severity with replacement = +1.7 |
|
| 246 |
+
|
| 247 |
+
### The Baseline Subtraction Problem
|
| 248 |
+
|
| 249 |
+
Early in development, we discovered a critical issue: **the reward was always positive**, even for a random agent. Why?
|
| 250 |
+
|
| 251 |
+
- Most calls (70-90%) are NOT duplicates → saying "not duplicate" gave a free +1.0
|
| 252 |
+
- Severity predictions off by 1 still gave +0.6
|
| 253 |
+
- Not attempting a reroute gave 0.0 (no penalty)
|
| 254 |
+
|
| 255 |
+
A random agent would score ~+2.5 per step just by existing. This meant the GRPO training curve was flat — the agent couldn't distinguish good actions from bad ones.
|
| 256 |
+
|
| 257 |
+
**Our solution**: We introduced a **baseline subtraction** (`STEP_REWARD_BASELINE = 2.5`), calibrated to the expected reward of a mediocre agent. This shifts the reward so that:
|
| 258 |
+
|
| 259 |
+
| Agent Quality | Raw Reward | After Baseline | Training Signal |
|
| 260 |
+
|--------------|-----------|---------------|-----------------|
|
| 261 |
+
| Random | +1.5/step | **-1.0/step** | Negative → must improve |
|
| 262 |
+
| SFT (decent) | +3.0/step | **+0.5/step** | Near zero → starting point |
|
| 263 |
+
| GRPO (good) | +4.5/step | **+2.0/step** | Positive → improvement visible |
|
| 264 |
+
| Perfect | +6.7/step | **+4.2/step** | High ceiling → room to grow |
|
| 265 |
+
|
| 266 |
+
This is the standard approach in RL — similar to advantage estimation in PPO/A2C, where you subtract a value baseline from returns to reduce variance and center the signal.
|
| 267 |
+
|
| 268 |
+
---
|
| 269 |
+
|
| 270 |
+
## Curriculum Learning — Task Difficulty
|
| 271 |
+
|
| 272 |
+
To produce the classic RL training curve (starts near 0, climbs with dips), we structured the environment into **3 progressive difficulty levels** that act as a curriculum:
|
| 273 |
+
|
| 274 |
+
### Task 1 — Basic Dispatch (Easy)
|
| 275 |
+
|
| 276 |
+
| Parameter | Value |
|
| 277 |
+
|-----------|-------|
|
| 278 |
+
| Steps | 10 |
|
| 279 |
+
| Vehicles per type | **3** (always a free unit) |
|
| 280 |
+
| Duplicate probability | 10% |
|
| 281 |
+
| Focus | Learn severity prediction + correct vehicle type |
|
| 282 |
+
|
| 283 |
+
At this level, the agent always has free vehicles available. It just needs to learn: fire → fire truck, medical → ambulance, crime → police. This is the "format learning" phase.
|
| 284 |
+
|
| 285 |
+
### Task 2 — Scarce Resources (Medium)
|
| 286 |
+
|
| 287 |
+
| Parameter | Value |
|
| 288 |
+
|-----------|-------|
|
| 289 |
+
| Steps | 15 |
|
| 290 |
+
| Vehicles per type | **2** (sometimes all busy) |
|
| 291 |
+
| Duplicate probability | 30% |
|
| 292 |
+
| Focus | Handle holds + pick nearest unit + detect duplicates |
|
| 293 |
+
|
| 294 |
+
With only 2 vehicles per type, the agent will encounter situations where all ambulances are busy. It must learn to use `hold` actions and pick the vehicle that will free up soonest. Duplicate calls appear more frequently, requiring pattern matching.
|
| 295 |
+
|
| 296 |
+
### Task 3 — Full Disaster Response (Hard)
|
| 297 |
+
|
| 298 |
+
| Parameter | Value |
|
| 299 |
+
|-----------|-------|
|
| 300 |
+
| Steps | 20 |
|
| 301 |
+
| Vehicles per type | **1** (constant scarcity) |
|
| 302 |
+
| Duplicate probability | 50% |
|
| 303 |
+
| Focus | Reroutes + optimal triage under extreme constraints |
|
| 304 |
+
|
| 305 |
+
With just 1 vehicle per type and 20 incoming calls, the agent faces constant resource conflicts. It must:
|
| 306 |
+
- Reroute vehicles from low-severity events to high-severity ones
|
| 307 |
+
- Queue multiple events on the same vehicle via holds
|
| 308 |
+
- Detect duplicates aggressively to avoid wasting resources
|
| 309 |
+
- Make triage decisions: which patients wait?
|
| 310 |
+
|
| 311 |
+
### Training Flow
|
| 312 |
+
|
| 313 |
+
During GRPO training, we cycle through all 3 tasks. The training reward curve shows the characteristic pattern:
|
| 314 |
+
|
| 315 |
+
```
|
| 316 |
+
reward
|
| 317 |
+
│ Task 2 Task 3
|
| 318 |
+
│ ╭──╮ dip ╭──╮ dip
|
| 319 |
+
│ ╭─╯ ╰─╮ ╭─╯ ╰─╮
|
| 320 |
+
│ ╭─╯ ╰─╭─╯ ╰──── plateau
|
| 321 |
+
│ ╭─╯ climb climb
|
| 322 |
+
│──╯ Task 1
|
| 323 |
+
└──────────────────────────────── training steps
|
| 324 |
+
```
|
| 325 |
+
|
| 326 |
+
---
|
| 327 |
+
|
| 328 |
+
## The Agent — SFT + GRPO Training Pipeline
|
| 329 |
+
|
| 330 |
+
### Why Two Phases?
|
| 331 |
+
|
| 332 |
+
You can't directly train an LLM with RL from scratch — it wouldn't even know to output valid JSON, let alone make dispatch decisions. We use a two-phase approach:
|
| 333 |
+
|
| 334 |
+
### Phase 1 — Supervised Fine-Tuning (SFT)
|
| 335 |
+
|
| 336 |
+
**Goal**: Teach the model the correct output format.
|
| 337 |
+
|
| 338 |
+
We generate a dataset of (observation, ideal_action) pairs by running the environment and computing the optimal action from ground-truth labels:
|
| 339 |
+
|
| 340 |
+
```python
|
| 341 |
+
# For each call, build the ideal action from hidden ground truth
|
| 342 |
+
def build_ideal_action(ground_truth, observation_text):
|
| 343 |
+
if ground_truth["is_duplicate"]:
|
| 344 |
+
return {"action_type": "duplicate", "severity_pred": gt_severity, ...}
|
| 345 |
+
else:
|
| 346 |
+
vehicle = find_nearest_free(observation_text, gt_vehicle_type)
|
| 347 |
+
return {"action_type": "dispatch", "vehicle_id": vehicle, ...}
|
| 348 |
+
```
|
| 349 |
+
|
| 350 |
+
We fine-tune **Qwen3-1.7B** (4-bit quantized via Unsloth) on this dataset for ~100 steps. After SFT, the model can:
|
| 351 |
+
- ✅ Output valid JSON consistently
|
| 352 |
+
- ✅ Identify the correct vehicle type ~80% of the time
|
| 353 |
+
- ❌ Doesn't yet optimize for nearest vehicle
|
| 354 |
+
- ❌ Can't handle holds or reroutes
|
| 355 |
+
- ❌ Duplicate detection is weak
|
| 356 |
+
|
| 357 |
+
### Phase 2 — Group Relative Policy Optimization (GRPO)
|
| 358 |
+
|
| 359 |
+
**Goal**: Improve dispatch strategy through trial and error against the live environment.
|
| 360 |
+
|
| 361 |
+
GRPO (from DeepSeekMath) is a variant of policy optimization that:
|
| 362 |
+
1. Generates **multiple completions** (4 per prompt) at high temperature
|
| 363 |
+
2. Steps each completion through the real environment to get rewards
|
| 364 |
+
3. Uses the **relative ranking** of rewards within each group to update the policy
|
| 365 |
+
4. Doesn't require a separate critic/value network (unlike PPO)
|
| 366 |
+
|
| 367 |
+
```python
|
| 368 |
+
# The reward function talks to the real environment
|
| 369 |
+
def env_reward_fn(prompts, completions, **kwargs):
|
| 370 |
+
rewards = []
|
| 371 |
+
for completion, seed, task_id in zip(completions, seeds, task_ids):
|
| 372 |
+
env.reset(task_id=task_id, seed=seed) # Reproduce exact state
|
| 373 |
+
action = parse_llm_action(completion) # Parse model output
|
| 374 |
+
result = env.step(action) # Get env reward
|
| 375 |
+
reward = result.reward_breakdown["total"] # Baseline-adjusted
|
| 376 |
+
rewards.append(reward + 0.5) # +0.5 format bonus
|
| 377 |
+
return rewards
|
| 378 |
+
```
|
| 379 |
+
|
| 380 |
+
Key insight: we store the **random seed** with each training example so we can deterministically reproduce the exact same city and call during reward computation. This eliminates environment stochasticity from the reward signal.
|
| 381 |
+
|
| 382 |
+
### Training Configuration
|
| 383 |
+
|
| 384 |
+
| Parameter | Value |
|
| 385 |
+
|-----------|-------|
|
| 386 |
+
| Base model | `unsloth/Qwen3-1.7B-unsloth-bnb-4bit` |
|
| 387 |
+
| Quantization | 4-bit NF4 (QLoRA via Unsloth) |
|
| 388 |
+
| SFT steps | ~100 |
|
| 389 |
+
| GRPO epochs | 1 |
|
| 390 |
+
| Batch size | 1 (gradient accumulation 16) |
|
| 391 |
+
| Num generations | 4 per prompt |
|
| 392 |
+
| Learning rate | 5e-6 |
|
| 393 |
+
| Temperature | 1.0 (encourages exploration) |
|
| 394 |
+
| Max completion length | 256 tokens |
|
| 395 |
+
| Runtime | Hugging Face Spaces A100 GPU |
|
| 396 |
+
|
| 397 |
+
---
|
| 398 |
+
|
| 399 |
+
## Technical Stack
|
| 400 |
+
|
| 401 |
+
### Environment Server
|
| 402 |
+
|
| 403 |
+
| Component | Technology | Why |
|
| 404 |
+
|-----------|-----------|-----|
|
| 405 |
+
| Framework | **FastAPI** (via OpenEnv `create_app`) | Async HTTP/WS, auto Swagger docs |
|
| 406 |
+
| RL Interface | **OpenEnv** (Meta) | Standard reset/step/grader API |
|
| 407 |
+
| City Graph | Custom procedural generation + **Dijkstra** | Realistic road networks with travel times |
|
| 408 |
+
| Call Templates | 25 handwritten templates × 4 types | Authentic 911 transcripts |
|
| 409 |
+
| Deployment | **Docker** → **Hugging Face Spaces** | Free hosting with GPU support |
|
| 410 |
+
| Protocol | HTTP + WebSocket | Low-latency for training loops |
|
| 411 |
+
|
| 412 |
+
### Training Pipeline
|
| 413 |
+
|
| 414 |
+
| Component | Technology | Why |
|
| 415 |
+
|-----------|-----------|-----|
|
| 416 |
+
| Model | **Qwen3-1.7B** | Strong reasoning at small size |
|
| 417 |
+
| Quantization | **Unsloth** (4-bit QLoRA) | 2× faster training, 70% less memory |
|
| 418 |
+
| SFT | **SFTTrainer** (TRL) | Standard supervised fine-tuning |
|
| 419 |
+
| GRPO | **GRPOTrainer** (TRL) | No critic network needed, stable for LLMs |
|
| 420 |
+
| Dataset | **HuggingFace Datasets** | Streaming, seed-indexed |
|
| 421 |
+
| Compute | **Hugging Face Spaces** (A100) | GPU access |
|
| 422 |
+
|
| 423 |
+
### Infrastructure
|
| 424 |
+
|
| 425 |
+
| Component | Technology | Why |
|
| 426 |
+
|-----------|-----------|-----|
|
| 427 |
+
| Hosting | **Hugging Face Spaces** | Free Docker deployment |
|
| 428 |
+
| Model Hub | **Hugging Face Hub** | Model versioning, automatic endpoints |
|
| 429 |
+
| Version Control | **GitHub** → synced to HF | CI/CD pipeline |
|
| 430 |
+
| Monitoring | **matplotlib** in-notebook | Real-time training curves |
|
| 431 |
+
|
| 432 |
+
---
|
| 433 |
+
|
| 434 |
+
## Problems We Faced & How We Solved Them
|
| 435 |
+
|
| 436 |
+
### 1. "The reward never goes negative"
|
| 437 |
+
|
| 438 |
+
**Problem**: Even a random agent scored +2.5 per step because most reward components defaulted to positive values (e.g., +1.0 for correctly saying "not a duplicate" on non-duplicate calls).
|
| 439 |
+
|
| 440 |
+
**Solution**: Introduced `STEP_REWARD_BASELINE = 2.5` — subtracted from every step's total reward. This centers the reward so that average performance → 0, good performance → positive, bad performance → negative. This is the RL equivalent of advantage estimation.
|
| 441 |
+
|
| 442 |
+
### 2. "The training curve is flat, not climbing"
|
| 443 |
+
|
| 444 |
+
**Problem**: The SFT model already scored high on easy tasks, leaving no room for GRPO to show improvement.
|
| 445 |
+
|
| 446 |
+
**Solution**: Implemented **curriculum learning** via task difficulty. Vehicle count scales inversely with difficulty (3 → 2 → 1 per type). The agent must learn progressively harder strategies, creating natural dips and climbs in the reward curve.
|
| 447 |
+
|
| 448 |
+
### 3. "Hallucinated vehicle IDs"
|
| 449 |
+
|
| 450 |
+
**Problem**: The LLM would sometimes generate vehicle IDs that don't exist in the current city (e.g., `ambulance_5` when only `ambulance_0` and `ambulance_1` exist).
|
| 451 |
+
|
| 452 |
+
**Solution**: Heavy penalty (-5.0) for non-existent vehicle IDs in the reward function. The observation explicitly lists all vehicle IDs and their status, and the SFT phase trains on examples that only reference real IDs.
|
| 453 |
+
|
| 454 |
+
### 4. "Reroute from higher to lower severity"
|
| 455 |
+
|
| 456 |
+
**Problem**: The agent would sometimes reroute a vehicle from a critical event to a minor one — the opposite of what makes sense.
|
| 457 |
+
|
| 458 |
+
**Solution**: The reward function checks `reroute_severity_delta` — if the new event is lower severity than the old one, it gets a penalty (-0.5). Only reroutes from lower to higher severity get bonuses.
|
| 459 |
+
|
| 460 |
+
### 5. "JSON parsing failures"
|
| 461 |
+
|
| 462 |
+
**Problem**: Early in training, the model outputs malformed JSON (missing brackets, wrong field names, extra text around the JSON).
|
| 463 |
+
|
| 464 |
+
**Solution**:
|
| 465 |
+
- SFT phase ensures 95%+ format correctness before GRPO begins
|
| 466 |
+
- Format bonus (+0.5) in the GRPO reward for valid JSON
|
| 467 |
+
- Heavy penalty (-2.0) for unparseable output
|
| 468 |
+
- Robust regex-based JSON extraction that handles markdown code fences
|
| 469 |
+
|
| 470 |
+
### 6. "All vehicles busy → agent freezes"
|
| 471 |
+
|
| 472 |
+
**Problem**: When all vehicles of the needed type were dispatched, the agent would still try to dispatch a busy vehicle (getting -2.0 penalty) instead of using hold.
|
| 473 |
+
|
| 474 |
+
**Solution**: Added the `hold` action type with its own reward logic:
|
| 475 |
+
- Hold when all units are busy → +1.0 (justified)
|
| 476 |
+
- Hold when a free unit exists → -2.0 (unjustified)
|
| 477 |
+
- Hold and pick the soonest-to-free vehicle → +0.3 bonus
|
| 478 |
+
|
| 479 |
+
### 7. "Environment too deterministic"
|
| 480 |
+
|
| 481 |
+
**Problem**: With fixed seeds, the same GRPO training example always produces the same city and calls. The agent could memorize rather than generalize.
|
| 482 |
+
|
| 483 |
+
**Solution**: Pre-generate 500 distinct (seed, task_id) combinations spread across all 3 difficulty levels. Each seed produces a unique city graph, call sequence, and vehicle placement. The agent must generalize across all configurations.
|
| 484 |
+
|
| 485 |
+
---
|
| 486 |
+
|
| 487 |
+
## Results
|
| 488 |
+
|
| 489 |
+
### SFT Training — Loss Curve
|
| 490 |
+
|
| 491 |
+

|
| 492 |
+
*SFT loss drops as the model learns the JSON dispatch format from expert demonstrations.*
|
| 493 |
+
|
| 494 |
+
### GRPO Training — Reward, Loss, KL, Reward Std
|
| 495 |
+
|
| 496 |
+

|
| 497 |
+
*GRPO reward climbs from negative to positive as the agent learns better dispatch strategies.*
|
| 498 |
+
|
| 499 |
+
### Training Metrics
|
| 500 |
+
|
| 501 |
+
Our GRPO training shows the expected learning curve:
|
| 502 |
+
|
| 503 |
+
- **Steps 1-7**: Reward mostly negative (-0.7 to -1.4) — agent learning
|
| 504 |
+
- **Steps 8-14**: Reward turning positive (+0.6 to +1.4) — strategies improving
|
| 505 |
+
- **Steps 15+**: Occasional dips with overall upward trend — exploration vs exploitation
|
| 506 |
+
|
| 507 |
+
| Metric | Start | End | Trend |
|
| 508 |
+
|--------|-------|-----|-------|
|
| 509 |
+
| Reward | -0.71 | +1.45 | ↑ Climbing |
|
| 510 |
+
| Loss | 0.0006 | 0.0002 | ↓ Decreasing |
|
| 511 |
+
| KL Divergence | 0.55 | 0.23 | ↓ Stable |
|
| 512 |
+
| Reward Std | 1.86 | 1.35 | ↓ Converging |
|
| 513 |
+
|
| 514 |
+
### Baseline Comparison
|
| 515 |
+
|
| 516 |
+
| Agent | Task 1 (Easy) | Task 2 (Medium) | Task 3 (Hard) |
|
| 517 |
+
|-------|--------------|----------------|---------------|
|
| 518 |
+
| Random | -3.0/step | -3.1/step | -4.9/step |
|
| 519 |
+
| Rule-based heuristic | +1.0/step | +0.3/step | -0.6/step |
|
| 520 |
+
| Our GRPO agent | **+1.5/step** | **+0.8/step** | **+0.1/step** |
|
| 521 |
+
|
| 522 |
+
### What the Agent Learned
|
| 523 |
+
|
| 524 |
+
After training, the agent demonstrates:
|
| 525 |
+
|
| 526 |
+
1. **Accurate severity classification** — reads emotional cues ("not breathing" → 5, "fender bender" → 2)
|
| 527 |
+
2. **Correct vehicle type selection** — fire keywords → fire truck, medical → ambulance
|
| 528 |
+
3. **Nearest vehicle dispatch** — uses city reference distances to pick closest unit
|
| 529 |
+
4. **Duplicate detection** — recognizes when a new call matches an active event location/type
|
| 530 |
+
5. **Hold decisions** — queues events when no free units exist instead of hallucinating
|
| 531 |
+
6. **Reroute reasoning** — redirects vehicles from low-severity to high-severity events
|
| 532 |
+
|
| 533 |
+
---
|
| 534 |
+
|
| 535 |
+
## Conclusion
|
| 536 |
+
|
| 537 |
+
We built **Smart_Emergency** — a complete RL pipeline for training LLM agents as emergency dispatchers, for the **OpenEnv India Hackathon 2026**. The key innovations are:
|
| 538 |
+
|
| 539 |
+
1. **A rich, procedurally-generated environment** with realistic 911 transcripts, city graphs, and vehicle lifecycle management
|
| 540 |
+
2. **5-component decomposed reward** with baseline subtraction for clean training signals
|
| 541 |
+
3. **Curriculum learning** across 3 difficulty levels that produces the classic RL training curve
|
| 542 |
+
4. **SFT → GRPO two-phase training** that first teaches format, then optimizes strategy
|
| 543 |
+
5. **OpenEnv-compatible API** so anyone can train their own agent against our environment
|
| 544 |
+
|
| 545 |
+
The environment is [live on Hugging Face Spaces](https://huggingface.co/spaces/Harsh-Gupta-07/smart_emergency), and the trained model is available at [rishi38/smart-emergency-grpo](https://huggingface.co/rishi38/smart-emergency-grpo).
|
| 546 |
+
|
| 547 |
+
---
|
| 548 |
+
|
| 549 |
+
*Built for the OpenEnv India Hackathon 2026 with ❤️ using OpenEnv, Unsloth, TRL, and Hugging Face.*
|
models.py
CHANGED
|
@@ -17,7 +17,7 @@ from openenv.core.env_server.types import Action, Observation
|
|
| 17 |
from pydantic import Field
|
| 18 |
|
| 19 |
|
| 20 |
-
#
|
| 21 |
|
| 22 |
class RerouteAction(Action):
|
| 23 |
"""Optional reroute block inside a dispatch action."""
|
|
@@ -29,7 +29,7 @@ class RerouteAction(Action):
|
|
| 29 |
)
|
| 30 |
|
| 31 |
|
| 32 |
-
#
|
| 33 |
|
| 34 |
class SmartEmergencyAction(Action):
|
| 35 |
"""
|
|
@@ -64,7 +64,7 @@ class SmartEmergencyAction(Action):
|
|
| 64 |
)
|
| 65 |
|
| 66 |
|
| 67 |
-
#
|
| 68 |
|
| 69 |
class SmartEmergencyObservation(Observation):
|
| 70 |
"""
|
|
|
|
| 17 |
from pydantic import Field
|
| 18 |
|
| 19 |
|
| 20 |
+
# Reroute sub-action
|
| 21 |
|
| 22 |
class RerouteAction(Action):
|
| 23 |
"""Optional reroute block inside a dispatch action."""
|
|
|
|
| 29 |
)
|
| 30 |
|
| 31 |
|
| 32 |
+
# Agent action
|
| 33 |
|
| 34 |
class SmartEmergencyAction(Action):
|
| 35 |
"""
|
|
|
|
| 64 |
)
|
| 65 |
|
| 66 |
|
| 67 |
+
# Observation
|
| 68 |
|
| 69 |
class SmartEmergencyObservation(Observation):
|
| 70 |
"""
|
openenv.yaml
CHANGED
|
@@ -22,19 +22,19 @@ tasks:
|
|
| 22 |
- id: 1
|
| 23 |
name: "Basic Dispatch"
|
| 24 |
difficulty: easy
|
| 25 |
-
description: "
|
| 26 |
reward_max: 6.7
|
| 27 |
|
| 28 |
- id: 2
|
| 29 |
-
name: "
|
| 30 |
difficulty: medium
|
| 31 |
-
description: "
|
| 32 |
reward_max: 6.7
|
| 33 |
|
| 34 |
- id: 3
|
| 35 |
name: "Full Disaster Response"
|
| 36 |
difficulty: hard
|
| 37 |
-
description: "
|
| 38 |
reward_max: 6.7
|
| 39 |
|
| 40 |
observation_space:
|
|
|
|
| 22 |
- id: 1
|
| 23 |
name: "Basic Dispatch"
|
| 24 |
difficulty: easy
|
| 25 |
+
description: "10 steps, 3 vehicles per type, 10% duplicates. Focus on severity and vehicle type."
|
| 26 |
reward_max: 6.7
|
| 27 |
|
| 28 |
- id: 2
|
| 29 |
+
name: "Scarce Resources"
|
| 30 |
difficulty: medium
|
| 31 |
+
description: "15 steps, 2 vehicles per type, 30% duplicates. Must handle holds and pick nearest units."
|
| 32 |
reward_max: 6.7
|
| 33 |
|
| 34 |
- id: 3
|
| 35 |
name: "Full Disaster Response"
|
| 36 |
difficulty: hard
|
| 37 |
+
description: "20 steps, 1 vehicle per type, 50% duplicates. Requires reroutes and optimal triage."
|
| 38 |
reward_max: 6.7
|
| 39 |
|
| 40 |
observation_space:
|
public/grpo_training_curve.png
ADDED
|
Git LFS Details
|
public/sft_loss_curve.png
ADDED
|
server/app.py
CHANGED
|
@@ -29,7 +29,7 @@ except (ImportError, ModuleNotFoundError):
|
|
| 29 |
from server.smart_emergency_environment import SmartEmergencyEnvironment
|
| 30 |
|
| 31 |
|
| 32 |
-
#
|
| 33 |
|
| 34 |
# We use create_app so OpenEnv can automatically mount its Gradio web UI at / and /web
|
| 35 |
# when deployed to Hugging Face Spaces.
|
|
@@ -41,7 +41,7 @@ app = create_app(
|
|
| 41 |
max_concurrent_envs=1,
|
| 42 |
)
|
| 43 |
|
| 44 |
-
#
|
| 45 |
|
| 46 |
@app.get("/health")
|
| 47 |
def health():
|
|
@@ -52,7 +52,7 @@ def health():
|
|
| 52 |
}
|
| 53 |
|
| 54 |
|
| 55 |
-
#
|
| 56 |
|
| 57 |
@app.get("/tasks")
|
| 58 |
def tasks():
|
|
@@ -84,7 +84,7 @@ def tasks():
|
|
| 84 |
}
|
| 85 |
|
| 86 |
|
| 87 |
-
#
|
| 88 |
|
| 89 |
@app.post("/grader")
|
| 90 |
def grader():
|
|
@@ -92,7 +92,7 @@ def grader():
|
|
| 92 |
Score the completed episode. Call this after done=True.
|
| 93 |
|
| 94 |
Returns cumulative reward breakdown, per-component averages,
|
| 95 |
-
and a normalized 0
|
| 96 |
"""
|
| 97 |
steps = SmartEmergencyEnvironment.latest_steps
|
| 98 |
|
|
@@ -147,7 +147,7 @@ def grader():
|
|
| 147 |
}
|
| 148 |
|
| 149 |
|
| 150 |
-
#
|
| 151 |
|
| 152 |
@app.get("/baseline")
|
| 153 |
def baseline():
|
|
@@ -261,7 +261,7 @@ def baseline():
|
|
| 261 |
}
|
| 262 |
|
| 263 |
|
| 264 |
-
#
|
| 265 |
|
| 266 |
def main(host: str = "0.0.0.0", port: int = 8000):
|
| 267 |
import uvicorn
|
|
|
|
| 29 |
from server.smart_emergency_environment import SmartEmergencyEnvironment
|
| 30 |
|
| 31 |
|
| 32 |
+
# App
|
| 33 |
|
| 34 |
# We use create_app so OpenEnv can automatically mount its Gradio web UI at / and /web
|
| 35 |
# when deployed to Hugging Face Spaces.
|
|
|
|
| 41 |
max_concurrent_envs=1,
|
| 42 |
)
|
| 43 |
|
| 44 |
+
# Health
|
| 45 |
|
| 46 |
@app.get("/health")
|
| 47 |
def health():
|
|
|
|
| 52 |
}
|
| 53 |
|
| 54 |
|
| 55 |
+
# Tasks
|
| 56 |
|
| 57 |
@app.get("/tasks")
|
| 58 |
def tasks():
|
|
|
|
| 84 |
}
|
| 85 |
|
| 86 |
|
| 87 |
+
# Grader
|
| 88 |
|
| 89 |
@app.post("/grader")
|
| 90 |
def grader():
|
|
|
|
| 92 |
Score the completed episode. Call this after done=True.
|
| 93 |
|
| 94 |
Returns cumulative reward breakdown, per-component averages,
|
| 95 |
+
and a normalized 0-1 score suitable for hackathon leaderboards.
|
| 96 |
"""
|
| 97 |
steps = SmartEmergencyEnvironment.latest_steps
|
| 98 |
|
|
|
|
| 147 |
}
|
| 148 |
|
| 149 |
|
| 150 |
+
# Baseline
|
| 151 |
|
| 152 |
@app.get("/baseline")
|
| 153 |
def baseline():
|
|
|
|
| 261 |
}
|
| 262 |
|
| 263 |
|
| 264 |
+
# Entry point
|
| 265 |
|
| 266 |
def main(host: str = "0.0.0.0", port: int = 8000):
|
| 267 |
import uvicorn
|
server/calls.py
CHANGED
|
@@ -6,10 +6,10 @@ from typing import List, Optional
|
|
| 6 |
|
| 7 |
from .city import City
|
| 8 |
|
| 9 |
-
#
|
| 10 |
|
| 11 |
TEMPLATES = [
|
| 12 |
-
#
|
| 13 |
{"type": "fire", "sev": 1, "vehicle": "fire",
|
| 14 |
"text": "Hi, I think I see some smoke coming from behind {landmark}. It might be nothing but thought I should call."},
|
| 15 |
{"type": "fire", "sev": 2, "vehicle": "fire",
|
|
@@ -22,7 +22,7 @@ TEMPLATES = [
|
|
| 22 |
"text": "Building's on fire on {street} near {landmark}! People are yelling from the windows, please hurry!"},
|
| 23 |
{"type": "fire", "sev": 5, "vehicle": "fire",
|
| 24 |
"text": "There's a massive fire — the whole block near {landmark} is burning. Multiple buildings involved, I can see people trapped. Send everything you've got!"},
|
| 25 |
-
#
|
| 26 |
{"type": "medical", "sev": 1, "vehicle": "ambulance",
|
| 27 |
"text": "Hello, my neighbor fell and hurt her ankle at {address}. She's conscious and talking but can't walk."},
|
| 28 |
{"type": "medical", "sev": 2, "vehicle": "ambulance",
|
|
@@ -35,7 +35,7 @@ TEMPLATES = [
|
|
| 35 |
"text": "Someone's not breathing at {landmark}! A bystander is doing CPR. Please send an ambulance to {street} immediately!"},
|
| 36 |
{"type": "medical", "sev": 5, "vehicle": "ambulance",
|
| 37 |
"text": "There's been some kind of mass incident at {landmark} — multiple people down, some not moving. We need everything, {street} entrance."},
|
| 38 |
-
#
|
| 39 |
{"type": "crime", "sev": 1, "vehicle": "police",
|
| 40 |
"text": "I'd like to report a shoplifter at {landmark} on {street}. They already left but I got a good look."},
|
| 41 |
{"type": "crime", "sev": 2, "vehicle": "police",
|
|
@@ -48,7 +48,7 @@ TEMPLATES = [
|
|
| 48 |
"text": "I think I heard gunshots near {address}! People are running. I'm hiding inside {landmark}, please send help!"},
|
| 49 |
{"type": "crime", "sev": 5, "vehicle": "police",
|
| 50 |
"text": "Active shooter at {landmark}! Multiple shots fired, people running everywhere. Send everyone NOW!"},
|
| 51 |
-
#
|
| 52 |
{"type": "accident", "sev": 2, "vehicle": "ambulance",
|
| 53 |
"text": "Fender bender on {street} near {landmark}. No injuries but the cars are blocking the road."},
|
| 54 |
{"type": "accident", "sev": 3, "vehicle": "ambulance",
|
|
@@ -93,7 +93,7 @@ def generate_call(
|
|
| 93 |
"""
|
| 94 |
node_ids = list(city.nodes.keys())
|
| 95 |
|
| 96 |
-
#
|
| 97 |
is_dup = False
|
| 98 |
dup_event_id = None
|
| 99 |
dup_event = None
|
|
@@ -121,7 +121,7 @@ def generate_call(
|
|
| 121 |
event_id = f"EVT-{next_event_counter:04d}"
|
| 122 |
next_event_counter += 1
|
| 123 |
|
| 124 |
-
#
|
| 125 |
node = city.nodes[origin]
|
| 126 |
neighbours = list(city.edges.get(origin, {}).keys())
|
| 127 |
cross = city.nodes[rng.choice(neighbours)].street if neighbours else "unknown road"
|
|
|
|
| 6 |
|
| 7 |
from .city import City
|
| 8 |
|
| 9 |
+
# Call templates
|
| 10 |
|
| 11 |
TEMPLATES = [
|
| 12 |
+
# FIRE
|
| 13 |
{"type": "fire", "sev": 1, "vehicle": "fire",
|
| 14 |
"text": "Hi, I think I see some smoke coming from behind {landmark}. It might be nothing but thought I should call."},
|
| 15 |
{"type": "fire", "sev": 2, "vehicle": "fire",
|
|
|
|
| 22 |
"text": "Building's on fire on {street} near {landmark}! People are yelling from the windows, please hurry!"},
|
| 23 |
{"type": "fire", "sev": 5, "vehicle": "fire",
|
| 24 |
"text": "There's a massive fire — the whole block near {landmark} is burning. Multiple buildings involved, I can see people trapped. Send everything you've got!"},
|
| 25 |
+
# MEDICAL
|
| 26 |
{"type": "medical", "sev": 1, "vehicle": "ambulance",
|
| 27 |
"text": "Hello, my neighbor fell and hurt her ankle at {address}. She's conscious and talking but can't walk."},
|
| 28 |
{"type": "medical", "sev": 2, "vehicle": "ambulance",
|
|
|
|
| 35 |
"text": "Someone's not breathing at {landmark}! A bystander is doing CPR. Please send an ambulance to {street} immediately!"},
|
| 36 |
{"type": "medical", "sev": 5, "vehicle": "ambulance",
|
| 37 |
"text": "There's been some kind of mass incident at {landmark} — multiple people down, some not moving. We need everything, {street} entrance."},
|
| 38 |
+
# CRIME
|
| 39 |
{"type": "crime", "sev": 1, "vehicle": "police",
|
| 40 |
"text": "I'd like to report a shoplifter at {landmark} on {street}. They already left but I got a good look."},
|
| 41 |
{"type": "crime", "sev": 2, "vehicle": "police",
|
|
|
|
| 48 |
"text": "I think I heard gunshots near {address}! People are running. I'm hiding inside {landmark}, please send help!"},
|
| 49 |
{"type": "crime", "sev": 5, "vehicle": "police",
|
| 50 |
"text": "Active shooter at {landmark}! Multiple shots fired, people running everywhere. Send everyone NOW!"},
|
| 51 |
+
# ACCIDENT
|
| 52 |
{"type": "accident", "sev": 2, "vehicle": "ambulance",
|
| 53 |
"text": "Fender bender on {street} near {landmark}. No injuries but the cars are blocking the road."},
|
| 54 |
{"type": "accident", "sev": 3, "vehicle": "ambulance",
|
|
|
|
| 93 |
"""
|
| 94 |
node_ids = list(city.nodes.keys())
|
| 95 |
|
| 96 |
+
# Decide if duplicate
|
| 97 |
is_dup = False
|
| 98 |
dup_event_id = None
|
| 99 |
dup_event = None
|
|
|
|
| 121 |
event_id = f"EVT-{next_event_counter:04d}"
|
| 122 |
next_event_counter += 1
|
| 123 |
|
| 124 |
+
# Build transcript
|
| 125 |
node = city.nodes[origin]
|
| 126 |
neighbours = list(city.edges.get(origin, {}).keys())
|
| 127 |
cross = city.nodes[rng.choice(neighbours)].street if neighbours else "unknown road"
|
server/city.py
CHANGED
|
@@ -92,7 +92,7 @@ def generate_city(seed: int, difficulty: int = 1) -> City:
|
|
| 92 |
rng = random.Random(seed)
|
| 93 |
city = City(seed=seed)
|
| 94 |
|
| 95 |
-
#
|
| 96 |
node_specs: List[Tuple[str, int]] = [
|
| 97 |
("hospital", 1),
|
| 98 |
("fire_station", 1),
|
|
@@ -117,7 +117,7 @@ def generate_city(seed: int, difficulty: int = 1) -> City:
|
|
| 117 |
city.edges[nid] = {}
|
| 118 |
idx += 1
|
| 119 |
|
| 120 |
-
#
|
| 121 |
node_ids = list(city.nodes.keys())
|
| 122 |
for nid in node_ids:
|
| 123 |
n = city.nodes[nid]
|
|
@@ -140,7 +140,7 @@ def generate_city(seed: int, difficulty: int = 1) -> City:
|
|
| 140 |
city.edges[nid][oid] = travel
|
| 141 |
city.edges[oid][nid] = travel
|
| 142 |
|
| 143 |
-
#
|
| 144 |
visited = set()
|
| 145 |
stack = [node_ids[0]]
|
| 146 |
while stack:
|
|
@@ -158,7 +158,7 @@ def generate_city(seed: int, difficulty: int = 1) -> City:
|
|
| 158 |
city.edges[closest][uid] = d
|
| 159 |
visited.add(uid)
|
| 160 |
|
| 161 |
-
#
|
| 162 |
# Easy (1): 3 per type — always a free unit available
|
| 163 |
# Medium (2): 2 per type — sometimes all busy, must use hold
|
| 164 |
# Hard (3): 1 per type — forces hold/reroute decisions constantly
|
|
|
|
| 92 |
rng = random.Random(seed)
|
| 93 |
city = City(seed=seed)
|
| 94 |
|
| 95 |
+
# Create nodes
|
| 96 |
node_specs: List[Tuple[str, int]] = [
|
| 97 |
("hospital", 1),
|
| 98 |
("fire_station", 1),
|
|
|
|
| 117 |
city.edges[nid] = {}
|
| 118 |
idx += 1
|
| 119 |
|
| 120 |
+
# Build edges
|
| 121 |
node_ids = list(city.nodes.keys())
|
| 122 |
for nid in node_ids:
|
| 123 |
n = city.nodes[nid]
|
|
|
|
| 140 |
city.edges[nid][oid] = travel
|
| 141 |
city.edges[oid][nid] = travel
|
| 142 |
|
| 143 |
+
# Ensure connectivity
|
| 144 |
visited = set()
|
| 145 |
stack = [node_ids[0]]
|
| 146 |
while stack:
|
|
|
|
| 158 |
city.edges[closest][uid] = d
|
| 159 |
visited.add(uid)
|
| 160 |
|
| 161 |
+
# 4. Spawn vehicles (count scales with difficulty)
|
| 162 |
# Easy (1): 3 per type — always a free unit available
|
| 163 |
# Medium (2): 2 per type — sometimes all busy, must use hold
|
| 164 |
# Hard (3): 1 per type — forces hold/reroute decisions constantly
|
server/reward.py
CHANGED
|
@@ -3,7 +3,7 @@
|
|
| 3 |
from typing import Dict, Optional
|
| 4 |
|
| 5 |
|
| 6 |
-
#
|
| 7 |
|
| 8 |
SEVERITY_REWARDS = {0: 1.0, 1: 0.6, 2: 0.2, 3: -0.2, 4: -0.5}
|
| 9 |
PARSE_FAILURE_PENALTY = -2.0
|
|
@@ -52,11 +52,11 @@ def compute_reward(
|
|
| 52 |
|
| 53 |
breakdown: Dict[str, float] = {}
|
| 54 |
|
| 55 |
-
#
|
| 56 |
err = abs(severity_pred - gt_severity)
|
| 57 |
breakdown["severity"] = SEVERITY_REWARDS.get(err, -0.5)
|
| 58 |
|
| 59 |
-
#
|
| 60 |
if not is_duplicate_pred and not gt_is_duplicate:
|
| 61 |
breakdown["duplicate"] = 1.0
|
| 62 |
elif not is_duplicate_pred and gt_is_duplicate:
|
|
@@ -71,7 +71,7 @@ def compute_reward(
|
|
| 71 |
else:
|
| 72 |
breakdown["duplicate"] = 0.3
|
| 73 |
|
| 74 |
-
#
|
| 75 |
if is_duplicate_pred:
|
| 76 |
breakdown["vehicle_type"] = 0.0
|
| 77 |
elif vehicle_type_pred == gt_vehicle_type:
|
|
@@ -79,7 +79,7 @@ def compute_reward(
|
|
| 79 |
else:
|
| 80 |
breakdown["vehicle_type"] = -1.5
|
| 81 |
|
| 82 |
-
#
|
| 83 |
if is_duplicate_pred:
|
| 84 |
breakdown["vehicle_choice"] = 0.0
|
| 85 |
elif hold_is_action:
|
|
@@ -119,7 +119,7 @@ def compute_reward(
|
|
| 119 |
mult = 1.0 if is_nearest else 0.5
|
| 120 |
breakdown["vehicle_choice"] = prox * mult
|
| 121 |
|
| 122 |
-
#
|
| 123 |
if hold_is_action:
|
| 124 |
breakdown["reroute"] = 0.0 # neutral for hold actions
|
| 125 |
elif not reroute_attempted:
|
|
|
|
| 3 |
from typing import Dict, Optional
|
| 4 |
|
| 5 |
|
| 6 |
+
# Default reward config
|
| 7 |
|
| 8 |
SEVERITY_REWARDS = {0: 1.0, 1: 0.6, 2: 0.2, 3: -0.2, 4: -0.5}
|
| 9 |
PARSE_FAILURE_PENALTY = -2.0
|
|
|
|
| 52 |
|
| 53 |
breakdown: Dict[str, float] = {}
|
| 54 |
|
| 55 |
+
# 1. Severity
|
| 56 |
err = abs(severity_pred - gt_severity)
|
| 57 |
breakdown["severity"] = SEVERITY_REWARDS.get(err, -0.5)
|
| 58 |
|
| 59 |
+
# 2. Duplicate detection
|
| 60 |
if not is_duplicate_pred and not gt_is_duplicate:
|
| 61 |
breakdown["duplicate"] = 1.0
|
| 62 |
elif not is_duplicate_pred and gt_is_duplicate:
|
|
|
|
| 71 |
else:
|
| 72 |
breakdown["duplicate"] = 0.3
|
| 73 |
|
| 74 |
+
# 3. Vehicle type
|
| 75 |
if is_duplicate_pred:
|
| 76 |
breakdown["vehicle_type"] = 0.0
|
| 77 |
elif vehicle_type_pred == gt_vehicle_type:
|
|
|
|
| 79 |
else:
|
| 80 |
breakdown["vehicle_type"] = -1.5
|
| 81 |
|
| 82 |
+
# 4. Vehicle choice / Hold quality
|
| 83 |
if is_duplicate_pred:
|
| 84 |
breakdown["vehicle_choice"] = 0.0
|
| 85 |
elif hold_is_action:
|
|
|
|
| 119 |
mult = 1.0 if is_nearest else 0.5
|
| 120 |
breakdown["vehicle_choice"] = prox * mult
|
| 121 |
|
| 122 |
+
# 5. Reroute
|
| 123 |
if hold_is_action:
|
| 124 |
breakdown["reroute"] = 0.0 # neutral for hold actions
|
| 125 |
elif not reroute_attempted:
|
server/smart_emergency_environment.py
CHANGED
|
@@ -21,7 +21,7 @@ from .city import City, Destination, Vehicle, dijkstra, generate_city
|
|
| 21 |
from .calls import Call, generate_call
|
| 22 |
from .reward import PARSE_FAILURE_PENALTY, compute_reward
|
| 23 |
|
| 24 |
-
#
|
| 25 |
|
| 26 |
MAX_STEPS = 20
|
| 27 |
DUPLICATE_PROB = 0.30
|
|
@@ -55,9 +55,9 @@ class SmartEmergencyEnvironment(Environment):
|
|
| 55 |
self._current_call: Optional[Call] = None
|
| 56 |
self._dispatcher_notes: List[str] = []
|
| 57 |
self._seed = 0
|
| 58 |
-
self._reward_history: List[dict] = []
|
| 59 |
|
| 60 |
-
#
|
| 61 |
|
| 62 |
def reset(self, task_id: int = 1, seed: Optional[int] = None) -> SmartEmergencyObservation:
|
| 63 |
self._seed = seed if seed is not None else random.randint(0, 999999)
|
|
@@ -103,7 +103,7 @@ class SmartEmergencyEnvironment(Environment):
|
|
| 103 |
reward=0.0,
|
| 104 |
)
|
| 105 |
|
| 106 |
-
#
|
| 107 |
|
| 108 |
def step(self, action: SmartEmergencyAction) -> SmartEmergencyObservation:
|
| 109 |
# Auto-reset if step is called before reset
|
|
@@ -115,7 +115,7 @@ class SmartEmergencyEnvironment(Environment):
|
|
| 115 |
city = self._city
|
| 116 |
assert call is not None and city is not None
|
| 117 |
|
| 118 |
-
#
|
| 119 |
reward_kwargs = self._evaluate_action(action, call)
|
| 120 |
breakdown = compute_reward(**reward_kwargs)
|
| 121 |
self._reward_history.append(breakdown)
|
|
@@ -124,13 +124,13 @@ class SmartEmergencyEnvironment(Environment):
|
|
| 124 |
SmartEmergencyEnvironment.latest_history.append(breakdown)
|
| 125 |
SmartEmergencyEnvironment.latest_steps = self._state.step_count
|
| 126 |
|
| 127 |
-
#
|
| 128 |
self._apply_action(action, call)
|
| 129 |
|
| 130 |
-
#
|
| 131 |
self._tick_vehicles()
|
| 132 |
|
| 133 |
-
#
|
| 134 |
note = f"Step {self._state.step_count}: {call.call_id}"
|
| 135 |
if action.is_duplicate:
|
| 136 |
note += f" → Duplicate of {action.duplicate_of_event_id or '?'}"
|
|
@@ -142,10 +142,10 @@ class SmartEmergencyEnvironment(Environment):
|
|
| 142 |
if len(self._dispatcher_notes) > 3:
|
| 143 |
self._dispatcher_notes = self._dispatcher_notes[-3:]
|
| 144 |
|
| 145 |
-
#
|
| 146 |
done = self._state.step_count >= getattr(self, "_max_steps", MAX_STEPS)
|
| 147 |
|
| 148 |
-
#
|
| 149 |
if not done:
|
| 150 |
self._current_call, self._event_counter = generate_call(
|
| 151 |
city, self._state.step_count + 1,
|
|
@@ -176,7 +176,7 @@ class SmartEmergencyEnvironment(Environment):
|
|
| 176 |
},
|
| 177 |
)
|
| 178 |
|
| 179 |
-
#
|
| 180 |
|
| 181 |
def _evaluate_action(self, action: SmartEmergencyAction, call: Call) -> dict:
|
| 182 |
"""Build kwargs for compute_reward."""
|
|
@@ -307,7 +307,7 @@ class SmartEmergencyEnvironment(Environment):
|
|
| 307 |
hold_vehicle_is_soonest=hold_vehicle_soonest,
|
| 308 |
)
|
| 309 |
|
| 310 |
-
#
|
| 311 |
|
| 312 |
def _apply_action(self, action: SmartEmergencyAction, call: Call):
|
| 313 |
city = self._city
|
|
@@ -469,7 +469,7 @@ class SmartEmergencyEnvironment(Environment):
|
|
| 469 |
return
|
| 470 |
# No valid destinations left — vehicle stays FREE
|
| 471 |
|
| 472 |
-
#
|
| 473 |
|
| 474 |
def _build_observation(self) -> str:
|
| 475 |
call = self._current_call
|
|
@@ -538,7 +538,7 @@ class SmartEmergencyEnvironment(Environment):
|
|
| 538 |
|
| 539 |
return "\n".join(parts)
|
| 540 |
|
| 541 |
-
#
|
| 542 |
|
| 543 |
def _find_vehicle(self, unit_id: str) -> Optional[Vehicle]:
|
| 544 |
if self._city is None:
|
|
|
|
| 21 |
from .calls import Call, generate_call
|
| 22 |
from .reward import PARSE_FAILURE_PENALTY, compute_reward
|
| 23 |
|
| 24 |
+
# Config defaults
|
| 25 |
|
| 26 |
MAX_STEPS = 20
|
| 27 |
DUPLICATE_PROB = 0.30
|
|
|
|
| 55 |
self._current_call: Optional[Call] = None
|
| 56 |
self._dispatcher_notes: List[str] = []
|
| 57 |
self._seed = 0
|
| 58 |
+
self._reward_history: List[dict] = []
|
| 59 |
|
| 60 |
+
# Reset
|
| 61 |
|
| 62 |
def reset(self, task_id: int = 1, seed: Optional[int] = None) -> SmartEmergencyObservation:
|
| 63 |
self._seed = seed if seed is not None else random.randint(0, 999999)
|
|
|
|
| 103 |
reward=0.0,
|
| 104 |
)
|
| 105 |
|
| 106 |
+
# Step
|
| 107 |
|
| 108 |
def step(self, action: SmartEmergencyAction) -> SmartEmergencyObservation:
|
| 109 |
# Auto-reset if step is called before reset
|
|
|
|
| 115 |
city = self._city
|
| 116 |
assert call is not None and city is not None
|
| 117 |
|
| 118 |
+
# Evaluate action
|
| 119 |
reward_kwargs = self._evaluate_action(action, call)
|
| 120 |
breakdown = compute_reward(**reward_kwargs)
|
| 121 |
self._reward_history.append(breakdown)
|
|
|
|
| 124 |
SmartEmergencyEnvironment.latest_history.append(breakdown)
|
| 125 |
SmartEmergencyEnvironment.latest_steps = self._state.step_count
|
| 126 |
|
| 127 |
+
# Update state
|
| 128 |
self._apply_action(action, call)
|
| 129 |
|
| 130 |
+
# Advance simulation clock
|
| 131 |
self._tick_vehicles()
|
| 132 |
|
| 133 |
+
# Log dispatcher note
|
| 134 |
note = f"Step {self._state.step_count}: {call.call_id}"
|
| 135 |
if action.is_duplicate:
|
| 136 |
note += f" → Duplicate of {action.duplicate_of_event_id or '?'}"
|
|
|
|
| 142 |
if len(self._dispatcher_notes) > 3:
|
| 143 |
self._dispatcher_notes = self._dispatcher_notes[-3:]
|
| 144 |
|
| 145 |
+
# Check done
|
| 146 |
done = self._state.step_count >= getattr(self, "_max_steps", MAX_STEPS)
|
| 147 |
|
| 148 |
+
# Generate next call
|
| 149 |
if not done:
|
| 150 |
self._current_call, self._event_counter = generate_call(
|
| 151 |
city, self._state.step_count + 1,
|
|
|
|
| 176 |
},
|
| 177 |
)
|
| 178 |
|
| 179 |
+
# Evaluate
|
| 180 |
|
| 181 |
def _evaluate_action(self, action: SmartEmergencyAction, call: Call) -> dict:
|
| 182 |
"""Build kwargs for compute_reward."""
|
|
|
|
| 307 |
hold_vehicle_is_soonest=hold_vehicle_soonest,
|
| 308 |
)
|
| 309 |
|
| 310 |
+
# Apply action to state
|
| 311 |
|
| 312 |
def _apply_action(self, action: SmartEmergencyAction, call: Call):
|
| 313 |
city = self._city
|
|
|
|
| 469 |
return
|
| 470 |
# No valid destinations left — vehicle stays FREE
|
| 471 |
|
| 472 |
+
# Observation builder
|
| 473 |
|
| 474 |
def _build_observation(self) -> str:
|
| 475 |
call = self._current_call
|
|
|
|
| 538 |
|
| 539 |
return "\n".join(parts)
|
| 540 |
|
| 541 |
+
# Helpers
|
| 542 |
|
| 543 |
def _find_vehicle(self, unit_id: str) -> Optional[Vehicle]:
|
| 544 |
if self._city is None:
|
train_sft_grpo.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|