Spaces:
Sleeping
Sleeping
Upload 17 files
Browse files
- Dockerfile +20 -0
- RULES.md +551 -0
- __pycache__/app.cpython-314.pyc +0 -0
- app.py +775 -0
- environment.py +469 -0
- graders.py +296 -0
- inference.py +377 -0
- models.py +62 -0
- openenv.yaml +27 -0
- pyproject.toml +20 -0
- requirements.txt +4 -0
- server.py +775 -0
- server/__pycache__/app.cpython-314.pyc +0 -0
- server/app.py +775 -0
- tasks.py +748 -0
- uv.lock +8 -0
- validate-submission.sh +198 -0
Dockerfile
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
ENV PYTHONDONTWRITEBYTECODE=1
|
| 4 |
+
ENV PYTHONUNBUFFERED=1
|
| 5 |
+
|
| 6 |
+
WORKDIR /app
|
| 7 |
+
|
| 8 |
+
COPY requirements.txt /app/requirements.txt
|
| 9 |
+
RUN pip install --no-cache-dir -r /app/requirements.txt
|
| 10 |
+
|
| 11 |
+
COPY . /app
|
| 12 |
+
|
| 13 |
+
RUN adduser --disabled-password --gecos "" appuser \
|
| 14 |
+
&& chown -R appuser:appuser /app
|
| 15 |
+
|
| 16 |
+
USER appuser
|
| 17 |
+
|
| 18 |
+
EXPOSE 7860
|
| 19 |
+
|
| 20 |
+
CMD ["gunicorn", "--bind", "0.0.0.0:7860", "server:app"]
|
RULES.md
ADDED
|
@@ -0,0 +1,551 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# RULES.md - Project Constitution & AI Guardrails
|
| 2 |
+
# OpenEnv Email Triage Environment
|
| 3 |
+
|
| 4 |
+
EVERY AI agent, copilot, or assistant working on this project MUST read and obey this file before generating ANY code.
|
| 5 |
+
|
| 6 |
+
REVISION 2: Updated based on sample inference.py analysis.
|
| 7 |
+
Where submission rules conflict with the original brief, SUBMISSION RULES WIN.
|
| 8 |
+
Where the sample script reveals patterns, MATCH THE PATTERNS.
|
| 9 |
+
|
| 10 |
+
## 0. GOLDEN RULE
|
| 11 |
+
|
| 12 |
+
> Do NOT generate code that you cannot explain line by line.
|
| 13 |
+
> Do NOT add features not listed in this document.
|
| 14 |
+
> Do NOT deviate from the file map, naming conventions, or interfaces defined here.
|
| 15 |
+
> When in doubt, do LESS, not more.
|
| 16 |
+
|
| 17 |
+
---
|
| 18 |
+
|
| 19 |
+
## 1. SCOPE - What This Project Is
|
| 20 |
+
|
| 21 |
+
- An OpenEnv-compliant AI agent training environment
|
| 22 |
+
- Domain: Email Triage (classify, prioritise, route emails)
|
| 23 |
+
- Deployed as a Docker-based Hugging Face Space
|
| 24 |
+
- Evaluated by inference.py using OpenAI Client with configurable endpoint
|
| 25 |
+
|
| 26 |
+
### What this project is NOT
|
| 27 |
+
|
| 28 |
+
- A chatbot
|
| 29 |
+
- A web app with a UI
|
| 30 |
+
- A game or toy problem
|
| 31 |
+
- A fine-tuning pipeline
|
| 32 |
+
- A multi-agent system
|
| 33 |
+
- An LLM wrapper with extra features
|
| 34 |
+
- A BrowserGym environment (the sample uses BrowserGym - we do NOT)
|
| 35 |
+
|
| 36 |
+
---
|
| 37 |
+
|
| 38 |
+
## 2. SUBMISSION CHECKLIST - DISQUALIFICATION CRITERIA
|
| 39 |
+
|
| 40 |
+
These are automated checks. Failing ANY ONE means disqualification.
|
| 41 |
+
|
| 42 |
+
| # | Check | What the validator does |
|
| 43 |
+
|---|---|---|
|
| 44 |
+
| 1 | HF Space deploys | Pings Space URL - must return HTTP 200 and respond to reset() |
|
| 45 |
+
| 2 | OpenEnv spec compliance | Validates openenv.yaml, typed models, /step, /reset, /state |
|
| 46 |
+
| 3 | Dockerfile builds | Runs docker build on the submitted repo - must succeed |
|
| 47 |
+
| 4 | Inference reproduces | Runs inference.py - must complete without error and produce scores |
|
| 48 |
+
| 5 | 3+ tasks with graders | Enumerates tasks, runs each grader, verifies scores in [0.0, 1.0] |
|
| 49 |
+
| 6 | Pre-validation script | Runs `./validate-submission.sh <ping_url> .` and expects all 3 checks to pass |
|
| 50 |
+
|
| 51 |
+
### 2.1 Mandatory pre-submit validation
|
| 52 |
+
|
| 53 |
+
- Before claiming "submission ready", run `./validate-submission.sh <ping_url> .` from repo root.
|
| 54 |
+
- If `<ping_url>` is unavailable, request it and block readiness claims until provided.
|
| 55 |
+
- Any AI assistant working on this repo must treat validator failure as a hard stop.
|
| 56 |
+
|
| 57 |
+
### Infrastructure constraints
|
| 58 |
+
|
| 59 |
+
| Constraint | Limit |
|
| 60 |
+
|---|---|
|
| 61 |
+
| vCPU | 2 |
|
| 62 |
+
| Memory | 8 GB |
|
| 63 |
+
| Inference runtime | < 20 minutes |
|
| 64 |
+
|
| 65 |
+
---
|
| 66 |
+
|
| 67 |
+
## 3. ENVIRONMENT VARIABLES - Mandatory
|
| 68 |
+
|
| 69 |
+
```python
|
| 70 |
+
import os
|
| 71 |
+
|
| 72 |
+
API_BASE_URL = os.getenv("API_BASE_URL")
|
| 73 |
+
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
|
| 74 |
+
MODEL_NAME = os.getenv("MODEL_NAME")
|
| 75 |
+
```
|
| 76 |
+
|
| 77 |
+
How to use in code (EXACT PATTERN - matches sample):
|
| 78 |
+
|
| 79 |
+
```python
|
| 80 |
+
from openai import OpenAI
|
| 81 |
+
|
| 82 |
+
client = OpenAI(
|
| 83 |
+
base_url=API_BASE_URL,
|
| 84 |
+
api_key=API_KEY,
|
| 85 |
+
)
|
| 86 |
+
|
| 87 |
+
completion = client.chat.completions.create(
|
| 88 |
+
model=MODEL_NAME,
|
| 89 |
+
messages=[...],
|
| 90 |
+
temperature=0.2,
|
| 91 |
+
max_tokens=200,
|
| 92 |
+
stream=False,
|
| 93 |
+
)
|
| 94 |
+
|
| 95 |
+
response_text = completion.choices[0].message.content or ""
|
| 96 |
+
```
|
| 97 |
+
|
| 98 |
+
Rules:
|
| 99 |
+
|
| 100 |
+
- NEVER hard-code any of these values
|
| 101 |
+
- NEVER use `os.environ["VAR"]`; use `os.getenv("VAR")` instead (matches sample)
|
| 102 |
+
- NEVER use any LLM client other than openai.OpenAI
|
| 103 |
+
- Support both `HF_TOKEN` and `API_KEY`, using an `or` fallback: `os.getenv("HF_TOKEN") or os.getenv("API_KEY")` (matches sample)
|
| 104 |
+
|
| 105 |
+
---
|
| 106 |
+
|
| 107 |
+
## 4. FILE MAP - Strict Build Order
|
| 108 |
+
|
| 109 |
+
| Order | File | Purpose | May import from |
|
| 110 |
+
|---|---|---|---|
|
| 111 |
+
| 1st | models.py | Pydantic models + StepResult wrapper | stdlib, pydantic only |
|
| 112 |
+
| 2nd | tasks.py | Task definitions + hard-coded email data | models.py only |
|
| 113 |
+
| 3rd | graders.py | Deterministic grader functions | models.py, tasks.py only |
|
| 114 |
+
| 4th | environment.py | Core env class: step, reset, state | models, tasks, graders |
|
| 115 |
+
| 5th | server.py | Flask HTTP wrapper: /reset, /step, /state | environment.py, models.py |
|
| 116 |
+
| 6th | inference.py | OpenAI Client inference script | models.py, environment.py |
|
| 117 |
+
| 7th | openenv.yaml | Spec metadata | N/A (data file) |
|
| 118 |
+
| 8th | Dockerfile | Container build | N/A (config file) |
|
| 119 |
+
| 8th | requirements.txt | Pinned dependencies | N/A (config file) |
|
| 120 |
+
| 9th | README.md | Full documentation | N/A (documentation) |
|
| 121 |
+
| 10th | validate-submission.sh | Pre-submission validator script | N/A (shell script) |
|
| 122 |
+
|
| 123 |
+
### Rules about files
|
| 124 |
+
|
| 125 |
+
- Do NOT create files not listed above. No utils.py, helpers.py, or config.py.
|
| 126 |
+
- Do NOT merge files. Each file has one responsibility.
|
| 127 |
+
- Do NOT create subdirectories. All files live in the project root.
|
| 128 |
+
- Do NOT add `__init__.py`. This is not a package.
|
| 129 |
+
|
| 130 |
+
---
|
| 131 |
+
|
| 132 |
+
## 5. DEPENDENCY RULES
|
| 133 |
+
|
| 134 |
+
### Allowed dependencies
|
| 135 |
+
|
| 136 |
+
```txt
|
| 137 |
+
pydantic>=2.0,<3.0
|
| 138 |
+
flask>=3.0,<4.0
|
| 139 |
+
openai>=1.0,<2.0
|
| 140 |
+
gunicorn>=21.0,<23.0
|
| 141 |
+
```
|
| 142 |
+
|
| 143 |
+
### Conditionally allowed (only if needed)
|
| 144 |
+
|
| 145 |
+
```txt
|
| 146 |
+
numpy
|
| 147 |
+
Pillow
|
| 148 |
+
```
|
| 149 |
+
|
| 150 |
+
### Forbidden
|
| 151 |
+
|
| 152 |
+
- No LangChain, LlamaIndex, or any agent framework
|
| 153 |
+
- No pandas or scipy
|
| 154 |
+
- No database libraries
|
| 155 |
+
- No async frameworks (FastAPI, aiohttp) - use Flask
|
| 156 |
+
- No frontend frameworks (Streamlit, Gradio)
|
| 157 |
+
- No ML libraries (torch, transformers, sklearn)
|
| 158 |
+
|
| 159 |
+
---
|
| 160 |
+
|
| 161 |
+
## 6. PYDANTIC MODEL RULES
|
| 162 |
+
|
| 163 |
+
### models.py constraints
|
| 164 |
+
|
| 165 |
+
- ALL models MUST inherit from pydantic.BaseModel
|
| 166 |
+
- ALL fields MUST have explicit type annotations
|
| 167 |
+
- ALL Literal types MUST use typing.Literal with exhaustive values
|
| 168 |
+
- NO methods on models (except StepResult and ResetResult wrappers)
|
| 169 |
+
- NO validators that call external services
|
| 170 |
+
- NO default_factory that uses randomness
|
| 171 |
+
- Field names MUST be snake_case
|
| 172 |
+
- NO nested models deeper than 2 levels
|
| 173 |
+
|
| 174 |
+
### Required models (exact names)
|
| 175 |
+
|
| 176 |
+
```python
|
| 177 |
+
class EmailObservation(BaseModel): ...
|
| 178 |
+
class TriageAction(BaseModel): ...
|
| 179 |
+
class RewardResult(BaseModel): ...
|
| 180 |
+
class EnvironmentState(BaseModel): ...
|
| 181 |
+
class StepResult(BaseModel): ...
|
| 182 |
+
class ResetResult(BaseModel): ...
|
| 183 |
+
```
|
| 184 |
+
|
| 185 |
+
### StepResult and ResetResult interface (mandatory)
|
| 186 |
+
|
| 187 |
+
```python
|
| 188 |
+
class StepResult(BaseModel):
|
| 189 |
+
observation: EmailObservation
|
| 190 |
+
reward: float
|
| 191 |
+
done: bool
|
| 192 |
+
info: dict[str, str | int | float | bool]
|
| 193 |
+
|
| 194 |
+
class ResetResult(BaseModel):
|
| 195 |
+
observation: EmailObservation
|
| 196 |
+
info: dict[str, str | int | float | bool]
|
| 197 |
+
```
|
| 198 |
+
|
| 199 |
+
### EmailObservation required fields
|
| 200 |
+
|
| 201 |
+
| Field | Type | Required |
|
| 202 |
+
|---|---|---|
|
| 203 |
+
| email_id | str | Yes |
|
| 204 |
+
| subject | str | Yes |
|
| 205 |
+
| body | str | Yes |
|
| 206 |
+
| sender | str | Yes |
|
| 207 |
+
| timestamp | str | Yes |
|
| 208 |
+
| thread_history | list[str] | Yes |
|
| 209 |
+
| task_id | str | Yes |
|
| 210 |
+
| step_number | int | Yes |
|
| 211 |
+
| total_emails | int | Yes |
|
| 212 |
+
|
| 213 |
+
### TriageAction required fields
|
| 214 |
+
|
| 215 |
+
| Field | Type | Required |
|
| 216 |
+
|---|---|---|
|
| 217 |
+
| label | Literal["urgent", "normal", "spam", "archive"] | Yes |
|
| 218 |
+
| summary | str | Yes |
|
| 219 |
+
| route_to | str | Yes |
|
| 220 |
+
|
| 221 |
+
### RewardResult required fields
|
| 222 |
+
|
| 223 |
+
| Field | Type | Required |
|
| 224 |
+
|---|---|---|
|
| 225 |
+
| score | float | Yes |
|
| 226 |
+
| breakdown | dict[str, float] | Yes |
|
| 227 |
+
| feedback | str | Yes |
|
| 228 |
+
|
| 229 |
+
### EnvironmentState required fields
|
| 230 |
+
|
| 231 |
+
| Field | Type | Required |
|
| 232 |
+
|---|---|---|
|
| 233 |
+
| task_id | str | Yes |
|
| 234 |
+
| current_step | int | Yes |
|
| 235 |
+
| total_steps | int | Yes |
|
| 236 |
+
| done | bool | Yes |
|
| 237 |
+
| action_history | list | Yes |
|
| 238 |
+
| reward_history | list | Yes |
|
| 239 |
+
|
| 240 |
+
---
|
| 241 |
+
|
| 242 |
+
## 7. ENVIRONMENT CLASS RULES
|
| 243 |
+
|
| 244 |
+
- Class name: EmailTriageEnv
|
| 245 |
+
- Constructor: `__init__(self, task_id: str)`
|
| 246 |
+
- MUST accept a task_id string
|
| 247 |
+
- MUST NOT call any external API
|
| 248 |
+
- MUST NOT use randomness
|
| 249 |
+
|
| 250 |
+
### reset() -> ResetResult
|
| 251 |
+
|
| 252 |
+
- MUST return a ResetResult object (not a bare observation)
|
| 253 |
+
- result.observation must contain the first email
|
| 254 |
+
- MUST reset all internal state
|
| 255 |
+
- MUST be callable multiple times without side effects
|
| 256 |
+
- HF Space validator will call /reset and expect HTTP 200 + valid JSON
|
| 257 |
+
|
| 258 |
+
### step(action: TriageAction) -> StepResult
|
| 259 |
+
|
| 260 |
+
- MUST return a StepResult object (not a tuple)
|
| 261 |
+
- result.observation: next email or terminal observation
|
| 262 |
+
- result.reward: float score for this step
|
| 263 |
+
- result.done: bool indicating episode end
|
| 264 |
+
- result.info: metadata dict
|
| 265 |
+
- MUST never raise an exception from bad agent input
|
| 266 |
+
- If action validation fails: return StepResult with reward=0.0 and continue
|
| 267 |
+
- MUST increment step counter
|
| 268 |
+
- MUST set done=True when all emails processed or max_steps hit
|
| 269 |
+
|
| 270 |
+
### state() -> EnvironmentState
|
| 271 |
+
|
| 272 |
+
- MUST return the full current internal state
|
| 273 |
+
- MUST be read-only
|
| 274 |
+
|
| 275 |
+
### Hard rules for environment.py
|
| 276 |
+
|
| 277 |
+
- NO randomness
|
| 278 |
+
- NO API calls
|
| 279 |
+
- NO file I/O during step/reset/state
|
| 280 |
+
- NO global mutable state
|
| 281 |
+
- NO threading or async
|
| 282 |
+
- NO print statements
|
| 283 |
+
|
| 284 |
+
---
|
| 285 |
+
|
| 286 |
+
## 8. TASK DATA RULES
|
| 287 |
+
|
| 288 |
+
Unchanged from previous version.
|
| 289 |
+
|
| 290 |
+
- All email data MUST be hard-coded
|
| 291 |
+
- NO loading from external files, URLs, or databases
|
| 292 |
+
- Task IDs: task_easy, task_medium, task_hard
|
| 293 |
+
- Each task defines: task_id, description, emails, ground_truth
|
| 294 |
+
- Ground truth MUST NOT be in observations (no answer leakage)
|
| 295 |
+
- Realistic professional email content
|
| 296 |
+
- NO offensive or NSFW content
|
| 297 |
+
|
| 298 |
+
---
|
| 299 |
+
|
| 300 |
+
## 9. GRADER RULES
|
| 301 |
+
|
| 302 |
+
Unchanged from previous version.
|
| 303 |
+
|
| 304 |
+
- Pure functions
|
| 305 |
+
- Deterministic
|
| 306 |
+
- Partial credit
|
| 307 |
+
- Scores in [0.0, 1.0]
|
| 308 |
+
|
| 309 |
+
---
|
| 310 |
+
|
| 311 |
+
## 10. REWARD FUNCTION RULES
|
| 312 |
+
|
| 313 |
+
Unchanged from previous version.
|
| 314 |
+
|
| 315 |
+
```text
|
| 316 |
+
final_reward = base_score - (step_count * 0.01) + trajectory_bonus - penalties
|
| 317 |
+
```
|
| 318 |
+
|
| 319 |
+
Final reward is clipped to [-1.0, 1.0].
|
| 320 |
+
|
| 321 |
+
---
|
| 322 |
+
|
| 323 |
+
## 11. SERVER RULES
|
| 324 |
+
|
| 325 |
+
### server.py constraints
|
| 326 |
+
|
| 327 |
+
- MUST use Flask
|
| 328 |
+
- Exactly THREE routes:
|
| 329 |
+
- POST /reset: accepts {"task_id": str}, returns ResetResult JSON
|
| 330 |
+
- POST /step: accepts TriageAction JSON, returns StepResult JSON
|
| 331 |
+
- POST /state: returns EnvironmentState JSON
|
| 332 |
+
- MUST listen on port 7860
|
| 333 |
+
- MUST handle malformed JSON gracefully (return 400)
|
| 334 |
+
- All responses must include Content-Type: application/json
|
| 335 |
+
- Validator will ping and call /reset, which must return HTTP 200
|
| 336 |
+
|
| 337 |
+
### /step response format
|
| 338 |
+
|
| 339 |
+
```json
|
| 340 |
+
{
|
| 341 |
+
"observation": {},
|
| 342 |
+
"reward": 0.85,
|
| 343 |
+
"done": false,
|
| 344 |
+
"info": {"step": 1, "task_id": "task_easy"}
|
| 345 |
+
}
|
| 346 |
+
```
|
| 347 |
+
|
| 348 |
+
### /reset response format
|
| 349 |
+
|
| 350 |
+
```json
|
| 351 |
+
{
|
| 352 |
+
"observation": {},
|
| 353 |
+
"info": {"task_id": "task_easy"}
|
| 354 |
+
}
|
| 355 |
+
```
|
| 356 |
+
|
| 357 |
+
---
|
| 358 |
+
|
| 359 |
+
## 12. INFERENCE SCRIPT RULES
|
| 360 |
+
|
| 361 |
+
CRITICAL PATTERNS FROM SAMPLE - MUST FOLLOW
|
| 362 |
+
|
| 363 |
+
### Architecture (matches sample)
|
| 364 |
+
|
| 365 |
+
```text
|
| 366 |
+
1. Initialize OpenAI client with env vars
|
| 367 |
+
2. Create environment instance
|
| 368 |
+
3. Call reset(), get initial observation
|
| 369 |
+
4. Loop up to MAX_STEPS:
|
| 370 |
+
a. Build prompt from observation + history
|
| 371 |
+
b. Call LLM
|
| 372 |
+
c. Parse response into action (with fallback)
|
| 373 |
+
d. Call step(action)
|
| 374 |
+
e. Record history
|
| 375 |
+
f. Check done flag
|
| 376 |
+
5. Print results
|
| 377 |
+
```
|
| 378 |
+
|
| 379 |
+
### Mandatory constants
|
| 380 |
+
|
| 381 |
+
```python
|
| 382 |
+
MAX_STEPS = 10
|
| 383 |
+
TEMPERATURE = 0.2
|
| 384 |
+
MAX_TOKENS = 200
|
| 385 |
+
FALLBACK_ACTION = ...
|
| 386 |
+
```
|
| 387 |
+
|
| 388 |
+
### Response parsing rules
|
| 389 |
+
|
| 390 |
+
- Do NOT rely only on response_format={"type": "json_object"}
|
| 391 |
+
- Parse free-text responses with regex or string matching
|
| 392 |
+
- If parsing fails, use a fallback action
|
| 393 |
+
- Strip prefixes like `action:` or `next action:` before parsing
|
| 394 |
+
- Regex parsing with fallback is preferred
|
| 395 |
+
|
| 396 |
+
### History tracking
|
| 397 |
+
|
| 398 |
+
```python
|
| 399 |
+
history: list[str] = []
|
| 400 |
+
history_line = f"Step {step}: {action} -> reward {reward:+.2f}"
|
| 401 |
+
history.append(history_line)
|
| 402 |
+
```
|
| 403 |
+
|
| 404 |
+
### Error handling
|
| 405 |
+
|
| 406 |
+
```python
|
| 407 |
+
try:
|
| 408 |
+
completion = client.chat.completions.create(...)
|
| 409 |
+
response_text = completion.choices[0].message.content or ""
|
| 410 |
+
except Exception as exc:
|
| 411 |
+
print(f"Model request failed ({exc}). Using fallback action.")
|
| 412 |
+
response_text = ""
|
| 413 |
+
```
|
| 414 |
+
|
| 415 |
+
### Output format
|
| 416 |
+
|
| 417 |
+
```text
|
| 418 |
+
Episode: task_easy
|
| 419 |
+
Step 1: label=urgent, route=safety -> reward +0.85
|
| 420 |
+
Final score: 0.85
|
| 421 |
+
|
| 422 |
+
=== SCORE TABLE ===
|
| 423 |
+
Task Score Steps
|
| 424 |
+
task_easy 0.85 1
|
| 425 |
+
task_medium 0.62 5
|
| 426 |
+
task_hard 0.45 2
|
| 427 |
+
Mean 0.64
|
| 428 |
+
```
|
| 429 |
+
|
| 430 |
+
### File naming and location
|
| 431 |
+
|
| 432 |
+
- File MUST be named inference.py
|
| 433 |
+
- MUST be in the project root directory
|
| 434 |
+
- MUST be runnable with python inference.py
|
| 435 |
+
- MUST complete in under 20 minutes
|
| 436 |
+
|
| 437 |
+
---
|
| 438 |
+
|
| 439 |
+
## 13. DOCKERFILE RULES
|
| 440 |
+
|
| 441 |
+
- Base image: python:3.11-slim
|
| 442 |
+
- WORKDIR: /app
|
| 443 |
+
- Copy requirements.txt first, pip install, then copy source
|
| 444 |
+
- EXPOSE 7860
|
| 445 |
+
- Create non-root user
|
| 446 |
+
- CMD starts the server
|
| 447 |
+
- Must build with --platform linux/amd64
|
| 448 |
+
- Must run within 2 vCPU / 8 GB memory
|
| 449 |
+
- No unnecessary system packages
|
| 450 |
+
- No CUDA/GPU dependencies
|
| 451 |
+
|
| 452 |
+
---
|
| 453 |
+
|
| 454 |
+
## 14. CODE STYLE RULES
|
| 455 |
+
|
| 456 |
+
- Python 3.11+
|
| 457 |
+
- Type hints on ALL function signatures
|
| 458 |
+
- Docstrings on ALL public functions (Google style)
|
| 459 |
+
- No single-letter variable names, except `i` as a loop index
|
| 460 |
+
- Comments explain WHY, not WHAT
|
| 461 |
+
- Max line length: 100 characters
|
| 462 |
+
- f-strings only
|
| 463 |
+
- No wildcard imports
|
| 464 |
+
- Import order: stdlib -> third-party -> local
|
| 465 |
+
|
| 466 |
+
---
|
| 467 |
+
|
| 468 |
+
## 15. WHAT AI MUST NEVER DO
|
| 469 |
+
|
| 470 |
+
- Never add features not in this spec
|
| 471 |
+
- Never use an LLM inside a grader
|
| 472 |
+
- Never generate fake scores
|
| 473 |
+
- Never create a UI
|
| 474 |
+
- Never use randomness in the environment
|
| 475 |
+
- Never store API keys in code
|
| 476 |
+
- Never skip error handling in step()
|
| 477 |
+
- Never use bare dicts where Pydantic models are specified
|
| 478 |
+
- Never name the inference script baseline.py
|
| 479 |
+
- Never use OPENAI_API_KEY; use HF_TOKEN/API_KEY
|
| 480 |
+
- Never use response_format={"type": "json_object"} without text-parsing fallback
|
| 481 |
+
- Never return tuples from step/reset; use StepResult/ResetResult objects
|
| 482 |
+
- Never skip the fallback action pattern
|
| 483 |
+
- Never skip history tracking in inference
|
| 484 |
+
|
| 485 |
+
---
|
| 486 |
+
|
| 487 |
+
## 16. DEFINITION OF DONE - Per Phase Checklist
|
| 488 |
+
|
| 489 |
+
### Phase 1 complete when
|
| 490 |
+
|
| 491 |
+
- models.py exists with all 6 models (including StepResult, ResetResult)
|
| 492 |
+
- All fields match this document
|
| 493 |
+
- Models instantiate with sample data without errors
|
| 494 |
+
- StepResult has observation, reward, done, info attributes
|
| 495 |
+
|
| 496 |
+
### Phase 2 complete when
|
| 497 |
+
|
| 498 |
+
- tasks.py exists with 3 tasks
|
| 499 |
+
- All email data is realistic and hard-coded
|
| 500 |
+
- Ground truth exists for every email
|
| 501 |
+
- No answer leakage
|
| 502 |
+
|
| 503 |
+
### Phase 3 complete when
|
| 504 |
+
|
| 505 |
+
- graders.py has 3 pure grader functions
|
| 506 |
+
- Partial credit works
|
| 507 |
+
- All scores in [0.0, 1.0]
|
| 508 |
+
|
| 509 |
+
### Phase 4 complete when
|
| 510 |
+
|
| 511 |
+
- environment.py has EmailTriageEnv class
|
| 512 |
+
- reset() returns ResetResult
|
| 513 |
+
- step() returns StepResult
|
| 514 |
+
- step() handles invalid input without crashing
|
| 515 |
+
- Full episode runs to completion
|
| 516 |
+
|
| 517 |
+
### Phase 5 complete when
|
| 518 |
+
|
| 519 |
+
- server.py has /reset, /step, /state routes
|
| 520 |
+
- /reset returns {"observation": ..., "info": ...}
|
| 521 |
+
- /step returns {"observation": ..., "reward": ..., "done": ..., "info": ...}
|
| 522 |
+
- Malformed requests return 400
|
| 523 |
+
- Port 7860
|
| 524 |
+
|
| 525 |
+
### Phase 6 complete when
|
| 526 |
+
|
| 527 |
+
- inference.py follows sample architecture
|
| 528 |
+
- Uses os.getenv() for API_BASE_URL, HF_TOKEN/API_KEY, MODEL_NAME
|
| 529 |
+
- Has MAX_STEPS, TEMPERATURE, MAX_TOKENS, FALLBACK constants
|
| 530 |
+
- Has history tracking
|
| 531 |
+
- Has response parsing with fallback
|
| 532 |
+
- Has try/except around API calls
|
| 533 |
+
- Prints score table
|
| 534 |
+
- Completes in under 20 minutes
|
| 535 |
+
|
| 536 |
+
### Phase 7-9
|
| 537 |
+
|
| 538 |
+
Unchanged from previous version.
|
| 539 |
+
|
| 540 |
+
---
|
| 541 |
+
|
| 542 |
+
## 17. WHEN IN DOUBT
|
| 543 |
+
|
| 544 |
+
- Re-read this file
|
| 545 |
+
- Re-read the project briefing
|
| 546 |
+
- Re-read the sample inference.py
|
| 547 |
+
- Match the sample patterns
|
| 548 |
+
- Choose the simpler option
|
| 549 |
+
- Ask the human, do not guess
|
| 550 |
+
|
| 551 |
+
This file is the law. Code that violates it gets deleted.
|
__pycache__/app.cpython-314.pyc
ADDED
|
Binary file (28.4 kB). View file
|
|
|
app.py
ADDED
|
@@ -0,0 +1,775 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Auxiliary server entrypoint required by OpenEnv local validation checks."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
|
| 5 |
+
from flask import Flask, Response, jsonify, request
|
| 6 |
+
|
| 7 |
+
from environment import EmailTriageEnv
|
| 8 |
+
from tasks import get_task_scenario_count, list_task_ids
|
| 9 |
+
|
| 10 |
+
FRONTEND_HTML = """<!doctype html>
|
| 11 |
+
<html lang="en">
|
| 12 |
+
<head>
|
| 13 |
+
<meta charset="utf-8" />
|
| 14 |
+
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
| 15 |
+
<title>Inbox Helper Practice</title>
|
| 16 |
+
<style>
|
| 17 |
+
@import url('https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@400;600;700&family=IBM+Plex+Mono:wght@400;500&display=swap');
|
| 18 |
+
|
| 19 |
+
:root {
|
| 20 |
+
--bg: #f5f1e9;
|
| 21 |
+
--paper: #fffaf2;
|
| 22 |
+
--ink: #102433;
|
| 23 |
+
--accent: #ea6a2a;
|
| 24 |
+
--accent-soft: #ffd6bf;
|
| 25 |
+
--line: #d7cabb;
|
| 26 |
+
--ok: #0f7b6c;
|
| 27 |
+
--warn: #9a3a12;
|
| 28 |
+
--radius: 14px;
|
| 29 |
+
}
|
| 30 |
+
|
| 31 |
+
* { box-sizing: border-box; }
|
| 32 |
+
|
| 33 |
+
body {
|
| 34 |
+
margin: 0;
|
| 35 |
+
font-family: 'Space Grotesk', sans-serif;
|
| 36 |
+
color: var(--ink);
|
| 37 |
+
background:
|
| 38 |
+
radial-gradient(1100px 460px at -10% -20%, #f2bc9f 0%, transparent 60%),
|
| 39 |
+
radial-gradient(1100px 520px at 120% 115%, #b8d7cf 0%, transparent 62%),
|
| 40 |
+
var(--bg);
|
| 41 |
+
min-height: 100vh;
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
.wrap {
|
| 45 |
+
max-width: 1100px;
|
| 46 |
+
margin: 28px auto;
|
| 47 |
+
padding: 0 16px;
|
| 48 |
+
animation: reveal .45s ease-out;
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
@keyframes reveal {
|
| 52 |
+
from { opacity: 0; transform: translateY(10px); }
|
| 53 |
+
to { opacity: 1; transform: translateY(0); }
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
.title {
|
| 57 |
+
display: flex;
|
| 58 |
+
justify-content: space-between;
|
| 59 |
+
align-items: baseline;
|
| 60 |
+
gap: 14px;
|
| 61 |
+
margin-bottom: 14px;
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
h1 {
|
| 65 |
+
margin: 0;
|
| 66 |
+
font-size: clamp(1.5rem, 2vw, 2.2rem);
|
| 67 |
+
letter-spacing: .4px;
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
.subtitle {
|
| 71 |
+
margin: 6px 0 0;
|
| 72 |
+
font-size: .95rem;
|
| 73 |
+
opacity: .8;
|
| 74 |
+
}
|
| 75 |
+
|
| 76 |
+
.badge {
|
| 77 |
+
background: var(--accent-soft);
|
| 78 |
+
border: 1px solid #f2b693;
|
| 79 |
+
color: #7f2e0b;
|
| 80 |
+
padding: 6px 10px;
|
| 81 |
+
border-radius: 999px;
|
| 82 |
+
font-size: .85rem;
|
| 83 |
+
font-weight: 600;
|
| 84 |
+
}
|
| 85 |
+
|
| 86 |
+
.grid {
|
| 87 |
+
display: grid;
|
| 88 |
+
grid-template-columns: 1fr;
|
| 89 |
+
gap: 14px;
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
+
@media (min-width: 900px) {
|
| 93 |
+
.grid { grid-template-columns: 1fr 1fr; }
|
| 94 |
+
.wide { grid-column: span 2; }
|
| 95 |
+
}
|
| 96 |
+
|
| 97 |
+
.card {
|
| 98 |
+
background: var(--paper);
|
| 99 |
+
border: 1px solid var(--line);
|
| 100 |
+
border-radius: var(--radius);
|
| 101 |
+
padding: 14px;
|
| 102 |
+
box-shadow: 0 8px 28px rgba(16, 36, 51, 0.08);
|
| 103 |
+
}
|
| 104 |
+
|
| 105 |
+
.card h2 {
|
| 106 |
+
margin: 0 0 10px;
|
| 107 |
+
font-size: 1rem;
|
| 108 |
+
text-transform: uppercase;
|
| 109 |
+
letter-spacing: .08em;
|
| 110 |
+
opacity: .86;
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
.row {
|
| 114 |
+
display: flex;
|
| 115 |
+
flex-wrap: wrap;
|
| 116 |
+
gap: 8px;
|
| 117 |
+
align-items: center;
|
| 118 |
+
margin-bottom: 10px;
|
| 119 |
+
}
|
| 120 |
+
|
| 121 |
+
select, input, textarea, button {
|
| 122 |
+
font-family: inherit;
|
| 123 |
+
font-size: .95rem;
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
select, input, textarea {
|
| 127 |
+
width: 100%;
|
| 128 |
+
border: 1px solid #cdbba6;
|
| 129 |
+
border-radius: 10px;
|
| 130 |
+
padding: 9px 10px;
|
| 131 |
+
background: #fff;
|
| 132 |
+
color: var(--ink);
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
textarea {
|
| 136 |
+
min-height: 92px;
|
| 137 |
+
resize: vertical;
|
| 138 |
+
}
|
| 139 |
+
|
| 140 |
+
button {
|
| 141 |
+
border: 0;
|
| 142 |
+
border-radius: 10px;
|
| 143 |
+
padding: 9px 12px;
|
| 144 |
+
font-weight: 700;
|
| 145 |
+
background: var(--ink);
|
| 146 |
+
color: #fff;
|
| 147 |
+
cursor: pointer;
|
| 148 |
+
transition: transform .12s ease, opacity .12s ease;
|
| 149 |
+
}
|
| 150 |
+
|
| 151 |
+
button.secondary {
|
| 152 |
+
background: #285066;
|
| 153 |
+
}
|
| 154 |
+
|
| 155 |
+
button.accent {
|
| 156 |
+
background: var(--accent);
|
| 157 |
+
}
|
| 158 |
+
|
| 159 |
+
button:hover { transform: translateY(-1px); }
|
| 160 |
+
button:active { transform: translateY(0); opacity: .92; }
|
| 161 |
+
|
| 162 |
+
.status {
|
| 163 |
+
padding: 8px 10px;
|
| 164 |
+
border-radius: 10px;
|
| 165 |
+
background: #eef7f5;
|
| 166 |
+
border: 1px solid #c7e4de;
|
| 167 |
+
color: var(--ok);
|
| 168 |
+
font-weight: 600;
|
| 169 |
+
min-height: 40px;
|
| 170 |
+
display: flex;
|
| 171 |
+
align-items: center;
|
| 172 |
+
}
|
| 173 |
+
|
| 174 |
+
.status.error {
|
| 175 |
+
background: #fff1ea;
|
| 176 |
+
border-color: #ffc8ae;
|
| 177 |
+
color: var(--warn);
|
| 178 |
+
}
|
| 179 |
+
|
| 180 |
+
pre {
|
| 181 |
+
margin: 0;
|
| 182 |
+
white-space: pre-wrap;
|
| 183 |
+
background: #0f1b24;
|
| 184 |
+
color: #d9efe9;
|
| 185 |
+
border-radius: 10px;
|
| 186 |
+
padding: 12px;
|
| 187 |
+
max-height: 340px;
|
| 188 |
+
overflow: auto;
|
| 189 |
+
font-family: 'IBM Plex Mono', monospace;
|
| 190 |
+
font-size: .85rem;
|
| 191 |
+
border: 1px solid #21313f;
|
| 192 |
+
}
|
| 193 |
+
|
| 194 |
+
.email-block {
|
| 195 |
+
background: #fff;
|
| 196 |
+
border: 1px solid #d9ccbc;
|
| 197 |
+
border-radius: 10px;
|
| 198 |
+
padding: 12px;
|
| 199 |
+
}
|
| 200 |
+
|
| 201 |
+
.email-row {
|
| 202 |
+
margin-bottom: 8px;
|
| 203 |
+
font-size: .95rem;
|
| 204 |
+
line-height: 1.35;
|
| 205 |
+
}
|
| 206 |
+
|
| 207 |
+
.email-row strong {
|
| 208 |
+
display: inline-block;
|
| 209 |
+
min-width: 66px;
|
| 210 |
+
}
|
| 211 |
+
|
| 212 |
+
.help {
|
| 213 |
+
margin: 0 0 10px;
|
| 214 |
+
font-size: .9rem;
|
| 215 |
+
opacity: .8;
|
| 216 |
+
}
|
| 217 |
+
|
| 218 |
+
.metric {
|
| 219 |
+
display: flex;
|
| 220 |
+
justify-content: space-between;
|
| 221 |
+
align-items: center;
|
| 222 |
+
margin-bottom: 8px;
|
| 223 |
+
padding-bottom: 6px;
|
| 224 |
+
border-bottom: 1px dashed #dbcfbe;
|
| 225 |
+
font-size: .95rem;
|
| 226 |
+
}
|
| 227 |
+
|
| 228 |
+
.metric strong {
|
| 229 |
+
font-weight: 700;
|
| 230 |
+
}
|
| 231 |
+
|
| 232 |
+
.coach {
|
| 233 |
+
background: #fff7ed;
|
| 234 |
+
border: 1px solid #f2caa9;
|
| 235 |
+
border-radius: 10px;
|
| 236 |
+
padding: 10px;
|
| 237 |
+
min-height: 74px;
|
| 238 |
+
line-height: 1.4;
|
| 239 |
+
font-size: .92rem;
|
| 240 |
+
}
|
| 241 |
+
|
| 242 |
+
.chip-row {
|
| 243 |
+
display: flex;
|
| 244 |
+
flex-wrap: wrap;
|
| 245 |
+
gap: 8px;
|
| 246 |
+
margin-top: 10px;
|
| 247 |
+
}
|
| 248 |
+
|
| 249 |
+
.chip {
|
| 250 |
+
background: #eaf3ff;
|
| 251 |
+
border: 1px solid #b9d1ef;
|
| 252 |
+
color: #184469;
|
| 253 |
+
border-radius: 999px;
|
| 254 |
+
padding: 6px 10px;
|
| 255 |
+
font-size: .84rem;
|
| 256 |
+
cursor: pointer;
|
| 257 |
+
font-weight: 600;
|
| 258 |
+
}
|
| 259 |
+
</style>
|
| 260 |
+
</head>
|
| 261 |
+
<body>
|
| 262 |
+
<div class="wrap">
|
| 263 |
+
<div class="title">
|
| 264 |
+
<div>
|
| 265 |
+
<h1>Inbox Helper Practice</h1>
|
| 266 |
+
<p class="subtitle">Practice deciding priority, category, and who should handle each email.</p>
|
| 267 |
+
</div>
|
| 268 |
+
<span class="badge" id="badge">connecting...</span>
|
| 269 |
+
</div>
|
| 270 |
+
|
| 271 |
+
<div class="grid">
|
| 272 |
+
<section class="card">
|
| 273 |
+
<h2>Start a Scenario</h2>
|
| 274 |
+
<p class="help">Pick a difficulty, then click Start.</p>
|
| 275 |
+
<div class="row">
|
| 276 |
+
<select id="taskId">
|
| 277 |
+
<option value="task_easy">Easy: one clear email</option>
|
| 278 |
+
<option value="task_medium">Medium: mixed inbox</option>
|
| 279 |
+
<option value="task_hard">Hard: high-risk complaint</option>
|
| 280 |
+
<option value="task_production">Production: full inbox simulator</option>
|
| 281 |
+
</select>
|
| 282 |
+
</div>
|
| 283 |
+
<div id="productionControls" style="display:none;">
|
| 284 |
+
<div class="row">
|
| 285 |
+
<select id="productionProfile">
|
| 286 |
+
<option value="light">Workload: Light</option>
|
| 287 |
+
<option value="standard" selected>Workload: Standard</option>
|
| 288 |
+
<option value="heavy">Workload: Heavy</option>
|
| 289 |
+
</select>
|
| 290 |
+
</div>
|
| 291 |
+
<div class="row">
|
| 292 |
+
<select id="businessHoursMode">
|
| 293 |
+
<option value="false" selected>Time Profile: 24x7 inbox</option>
|
| 294 |
+
<option value="true">Time Profile: business hours focus</option>
|
| 295 |
+
</select>
|
| 296 |
+
</div>
|
| 297 |
+
<div class="row">
|
| 298 |
+
<select id="escalationMode">
|
| 299 |
+
<option value="low">Escalation: Low</option>
|
| 300 |
+
<option value="normal" selected>Escalation: Normal</option>
|
| 301 |
+
<option value="high">Escalation: High</option>
|
| 302 |
+
</select>
|
| 303 |
+
</div>
|
| 304 |
+
</div>
|
| 305 |
+
<div class="row">
|
| 306 |
+
<button class="accent" id="btnReset">Start</button>
|
| 307 |
+
<button class="secondary" id="btnState">Check Progress</button>
|
| 308 |
+
</div>
|
| 309 |
+
<div class="status" id="status">Ready. Start a scenario.</div>
|
| 310 |
+
</section>
|
| 311 |
+
|
| 312 |
+
<section class="card">
|
| 313 |
+
<h2>Your Decision</h2>
|
| 314 |
+
<p class="help">Choose priority, who should handle it, and a short reason.</p>
|
| 315 |
+
<div class="row">
|
| 316 |
+
<select id="label">
|
| 317 |
+
<option value="urgent">Urgent</option>
|
| 318 |
+
<option value="normal" selected>Normal</option>
|
| 319 |
+
<option value="spam">Spam</option>
|
| 320 |
+
<option value="archive">Archive</option>
|
| 321 |
+
</select>
|
| 322 |
+
</div>
|
| 323 |
+
<div class="row">
|
| 324 |
+
<input id="routeTo" placeholder="Who should handle this? (billing, safety, engineering, support)" value="general" />
|
| 325 |
+
</div>
|
| 326 |
+
<div class="row">
|
| 327 |
+
<textarea id="summary" placeholder="Write one clear sentence with key clues from the email.">Needs review.</textarea>
|
| 328 |
+
</div>
|
| 329 |
+
<div class="row">
|
| 330 |
+
<button id="btnStep">Send Decision</button>
|
| 331 |
+
</div>
|
| 332 |
+
</section>
|
| 333 |
+
|
| 334 |
+
<section class="card wide">
|
| 335 |
+
<h2>Current Email</h2>
|
| 336 |
+
<div class="email-block">
|
| 337 |
+
<div class="email-row"><strong>Subject:</strong> <span id="mailSubject">No email loaded yet.</span></div>
|
| 338 |
+
<div class="email-row"><strong>From:</strong> <span id="mailSender">-</span></div>
|
| 339 |
+
<div class="email-row"><strong>Message:</strong> <span id="mailBody">Start a scenario to load an email.</span></div>
|
| 340 |
+
</div>
|
| 341 |
+
</section>
|
| 342 |
+
|
| 343 |
+
<section class="card">
|
| 344 |
+
<h2>Live Progress</h2>
|
| 345 |
+
<div class="metric"><span>Task</span><strong id="insightTask">-</strong></div>
|
| 346 |
+
<div class="metric"><span>Scenario</span><strong id="insightScenario">-</strong></div>
|
| 347 |
+
<div class="metric"><span>Progress</span><strong id="insightProgress">0/0</strong></div>
|
| 348 |
+
<div class="metric"><span>Last Reward</span><strong id="insightReward">-</strong></div>
|
| 349 |
+
<div class="metric"><span>Base Score</span><strong id="insightBase">-</strong></div>
|
| 350 |
+
</section>
|
| 351 |
+
|
| 352 |
+
<section class="card">
|
| 353 |
+
<h2>Coach Notes</h2>
|
| 354 |
+
<p class="help">Use this to improve your next triage action.</p>
|
| 355 |
+
<div class="coach" id="coachNotes">Start a scenario and submit one decision to get feedback.</div>
|
| 356 |
+
<div class="chip-row">
|
| 357 |
+
<button class="chip" id="chipSafety">Quick Fill: Urgent + Safety</button>
|
| 358 |
+
<button class="chip" id="chipBilling">Quick Fill: Normal + Billing</button>
|
| 359 |
+
<button class="chip" id="chipSpam">Quick Fill: Spam + General</button>
|
| 360 |
+
</div>
|
| 361 |
+
</section>
|
| 362 |
+
|
| 363 |
+
<section class="card wide">
|
| 364 |
+
<h2>Details (Advanced)</h2>
|
| 365 |
+
<pre id="output">Waiting for your first action...</pre>
|
| 366 |
+
</section>
|
| 367 |
+
</div>
|
| 368 |
+
</div>
|
| 369 |
+
|
| 370 |
+
<script>
|
| 371 |
+
const statusEl = document.getElementById('status');
|
| 372 |
+
const badgeEl = document.getElementById('badge');
|
| 373 |
+
const outEl = document.getElementById('output');
|
| 374 |
+
const mailSubjectEl = document.getElementById('mailSubject');
|
| 375 |
+
const mailSenderEl = document.getElementById('mailSender');
|
| 376 |
+
const mailBodyEl = document.getElementById('mailBody');
|
| 377 |
+
const taskIdEl = document.getElementById('taskId');
|
| 378 |
+
const productionControlsEl = document.getElementById('productionControls');
|
| 379 |
+
const insightTaskEl = document.getElementById('insightTask');
|
| 380 |
+
const insightScenarioEl = document.getElementById('insightScenario');
|
| 381 |
+
const insightProgressEl = document.getElementById('insightProgress');
|
| 382 |
+
const insightRewardEl = document.getElementById('insightReward');
|
| 383 |
+
const insightBaseEl = document.getElementById('insightBase');
|
| 384 |
+
const coachNotesEl = document.getElementById('coachNotes');
|
| 385 |
+
|
| 386 |
+
function setStatus(msg, isError = false) {
|
| 387 |
+
statusEl.textContent = msg;
|
| 388 |
+
statusEl.classList.toggle('error', isError);
|
| 389 |
+
}
|
| 390 |
+
|
| 391 |
+
function writeOutput(value) {
|
| 392 |
+
outEl.textContent = typeof value === 'string' ? value : JSON.stringify(value, null, 2);
|
| 393 |
+
}
|
| 394 |
+
|
| 395 |
+
function updateEmailPanel(data) {
|
| 396 |
+
if (!data || !data.observation) {
|
| 397 |
+
return;
|
| 398 |
+
}
|
| 399 |
+
const obs = data.observation;
|
| 400 |
+
mailSubjectEl.textContent = obs.subject || 'No subject';
|
| 401 |
+
mailSenderEl.textContent = obs.sender || '-';
|
| 402 |
+
mailBodyEl.textContent = obs.body || '';
|
| 403 |
+
}
|
| 404 |
+
|
| 405 |
+
function updateProductionControlsVisibility() {
|
| 406 |
+
const isProduction = taskIdEl.value === 'task_production';
|
| 407 |
+
productionControlsEl.style.display = isProduction ? 'block' : 'none';
|
| 408 |
+
}
|
| 409 |
+
|
| 410 |
+
function safeNumber(value) {
|
| 411 |
+
return typeof value === 'number' && !Number.isNaN(value) ? value : null;
|
| 412 |
+
}
|
| 413 |
+
|
| 414 |
+
function updateInsights(data) {
|
| 415 |
+
const info = (data && data.info) ? data.info : {};
|
| 416 |
+
const taskValue = info.task_id || data.task_id || (data.observation && data.observation.task_id) || '-';
|
| 417 |
+
const scenarioValue = info.scenario_id || '-';
|
| 418 |
+
|
| 419 |
+
insightTaskEl.textContent = taskValue;
|
| 420 |
+
insightScenarioEl.textContent = scenarioValue;
|
| 421 |
+
|
| 422 |
+
const emailsProcessed = safeNumber(info.emails_processed);
|
| 423 |
+
const emailsTotal = safeNumber(info.emails_total);
|
| 424 |
+
if (emailsProcessed !== null && emailsTotal !== null) {
|
| 425 |
+
insightProgressEl.textContent = `${emailsProcessed}/${emailsTotal}`;
|
| 426 |
+
} else if (safeNumber(data.current_step) !== null && safeNumber(data.total_steps) !== null) {
|
| 427 |
+
insightProgressEl.textContent = `${data.current_step}/${data.total_steps}`;
|
| 428 |
+
}
|
| 429 |
+
|
| 430 |
+
const rewardValue = safeNumber(data.reward);
|
| 431 |
+
insightRewardEl.textContent = rewardValue !== null ? rewardValue.toFixed(2) : '-';
|
| 432 |
+
|
| 433 |
+
const baseScoreValue = safeNumber(info.base_score);
|
| 434 |
+
insightBaseEl.textContent = baseScoreValue !== null ? baseScoreValue.toFixed(2) : '-';
|
| 435 |
+
|
| 436 |
+
const tips = [];
|
| 437 |
+
if (info.validation_error) {
|
| 438 |
+
tips.push('Action format is invalid. Keep label/summary/route_to filled correctly.');
|
| 439 |
+
}
|
| 440 |
+
|
| 441 |
+
const routeNoise = safeNumber(info.grade_route_noise_penalty);
|
| 442 |
+
if (routeNoise !== null && routeNoise > 0.01) {
|
| 443 |
+
tips.push('Route to one best owner team. Avoid sending to many teams at once.');
|
| 444 |
+
}
|
| 445 |
+
|
| 446 |
+
const summaryMatch = safeNumber(info.grade_summary_match);
|
| 447 |
+
if (summaryMatch !== null && summaryMatch < 0.6) {
|
| 448 |
+
tips.push('Summary is weak. Include concrete clues from subject/body/thread.');
|
| 449 |
+
}
|
| 450 |
+
|
| 451 |
+
const labelMatch = safeNumber(info.grade_label_match);
|
| 452 |
+
if (labelMatch !== null && labelMatch < 1.0) {
|
| 453 |
+
tips.push('Priority label may be off. Re-check urgency and risk signals.');
|
| 454 |
+
}
|
| 455 |
+
|
| 456 |
+
const routeMatch = safeNumber(info.grade_route_match);
|
| 457 |
+
if (routeMatch !== null && routeMatch < 1.0) {
|
| 458 |
+
tips.push('Routing looks off. Pick the team that directly owns this issue.');
|
| 459 |
+
}
|
| 460 |
+
|
| 461 |
+
const urgencyComponent = safeNumber(info.grade_urgency_component);
|
| 462 |
+
if (urgencyComponent !== null && urgencyComponent < 0.2) {
|
| 463 |
+
tips.push('For high-risk complaints, mark urgent and route to safety first.');
|
| 464 |
+
}
|
| 465 |
+
|
| 466 |
+
if (!tips.length && typeof info.grading_feedback === 'string' && info.grading_feedback) {
|
| 467 |
+
tips.push(info.grading_feedback);
|
| 468 |
+
}
|
| 469 |
+
|
| 470 |
+
coachNotesEl.textContent = tips.length
|
| 471 |
+
? tips.join(' ')
|
| 472 |
+
: 'Looks good. Keep your next route precise and your summary evidence-based.';
|
| 473 |
+
}
|
| 474 |
+
|
| 475 |
+
function prefillAction(label, routeTo, summary) {
|
| 476 |
+
document.getElementById('label').value = label;
|
| 477 |
+
document.getElementById('routeTo').value = routeTo;
|
| 478 |
+
document.getElementById('summary').value = summary;
|
| 479 |
+
}
|
| 480 |
+
|
| 481 |
+
async function postJson(path, payload) {
|
| 482 |
+
const response = await fetch(path, {
|
| 483 |
+
method: 'POST',
|
| 484 |
+
headers: { 'Content-Type': 'application/json' },
|
| 485 |
+
body: JSON.stringify(payload || {}),
|
| 486 |
+
});
|
| 487 |
+
const text = await response.text();
|
| 488 |
+
let data = text;
|
| 489 |
+
try { data = JSON.parse(text); } catch (e) {}
|
| 490 |
+
if (!response.ok) {
|
| 491 |
+
throw new Error('HTTP ' + response.status + ' - ' + text);
|
| 492 |
+
}
|
| 493 |
+
return data;
|
| 494 |
+
}
|
| 495 |
+
|
| 496 |
+
async function warmup() {
|
| 497 |
+
try {
|
| 498 |
+
const res = await fetch('/meta');
|
| 499 |
+
const data = await res.json();
|
| 500 |
+
badgeEl.textContent = data.status === 'ok' ? 'ready' : 'check service';
|
| 501 |
+
} catch (e) {
|
| 502 |
+
badgeEl.textContent = 'offline';
|
| 503 |
+
}
|
| 504 |
+
}
|
| 505 |
+
|
| 506 |
+
document.getElementById('btnReset').addEventListener('click', async () => {
|
| 507 |
+
const taskId = taskIdEl.value;
|
| 508 |
+
setStatus('Starting a new scenario...');
|
| 509 |
+
try {
|
| 510 |
+
const payload = { task_id: taskId };
|
| 511 |
+
if (taskId === 'task_production') {
|
| 512 |
+
payload.production_profile = document.getElementById('productionProfile').value;
|
| 513 |
+
payload.business_hours_mode = document.getElementById('businessHoursMode').value === 'true';
|
| 514 |
+
payload.escalation_mode = document.getElementById('escalationMode').value;
|
| 515 |
+
}
|
| 516 |
+
const data = await postJson('/reset', payload);
|
| 517 |
+
setStatus('Scenario started. Read the email below.');
|
| 518 |
+
updateEmailPanel(data);
|
| 519 |
+
updateInsights(data);
|
| 520 |
+
writeOutput(data);
|
| 521 |
+
} catch (e) {
|
| 522 |
+
setStatus('Could not start scenario. See details below.', true);
|
| 523 |
+
writeOutput(String(e));
|
| 524 |
+
}
|
| 525 |
+
});
|
| 526 |
+
|
| 527 |
+
document.getElementById('btnState').addEventListener('click', async () => {
|
| 528 |
+
setStatus('Checking progress...');
|
| 529 |
+
try {
|
| 530 |
+
const data = await postJson('/state', {});
|
| 531 |
+
setStatus('Progress updated.');
|
| 532 |
+
updateInsights(data);
|
| 533 |
+
writeOutput(data);
|
| 534 |
+
} catch (e) {
|
| 535 |
+
setStatus('Could not fetch progress. See details below.', true);
|
| 536 |
+
writeOutput(String(e));
|
| 537 |
+
}
|
| 538 |
+
});
|
| 539 |
+
|
| 540 |
+
document.getElementById('btnStep').addEventListener('click', async () => {
|
| 541 |
+
const payload = {
|
| 542 |
+
label: document.getElementById('label').value,
|
| 543 |
+
summary: document.getElementById('summary').value,
|
| 544 |
+
route_to: document.getElementById('routeTo').value,
|
| 545 |
+
};
|
| 546 |
+
setStatus('Sending your decision...');
|
| 547 |
+
try {
|
| 548 |
+
const data = await postJson('/step', payload);
|
| 549 |
+
setStatus('Decision saved.');
|
| 550 |
+
updateEmailPanel(data);
|
| 551 |
+
updateInsights(data);
|
| 552 |
+
writeOutput(data);
|
| 553 |
+
} catch (e) {
|
| 554 |
+
setStatus('Could not submit decision. See details below.', true);
|
| 555 |
+
writeOutput(String(e));
|
| 556 |
+
}
|
| 557 |
+
});
|
| 558 |
+
|
| 559 |
+
document.getElementById('chipSafety').addEventListener('click', () => {
|
| 560 |
+
prefillAction('urgent', 'safety', 'Potential safety risk with immediate escalation needed.');
|
| 561 |
+
});
|
| 562 |
+
|
| 563 |
+
document.getElementById('chipBilling').addEventListener('click', () => {
|
| 564 |
+
prefillAction('normal', 'billing', 'Customer billing issue needs finance team review and response.');
|
| 565 |
+
});
|
| 566 |
+
|
| 567 |
+
document.getElementById('chipSpam').addEventListener('click', () => {
|
| 568 |
+
prefillAction('spam', 'general', 'Likely phishing or irrelevant message with suspicious external request.');
|
| 569 |
+
});
|
| 570 |
+
|
| 571 |
+
taskIdEl.addEventListener('change', updateProductionControlsVisibility);
|
| 572 |
+
|
| 573 |
+
updateProductionControlsVisibility();
|
| 574 |
+
warmup();
|
| 575 |
+
</script>
|
| 576 |
+
</body>
|
| 577 |
+
</html>
|
| 578 |
+
"""
|
| 579 |
+
|
| 580 |
+
# Flask application object that the route decorators below attach to.
app = Flask(__name__)

# Process-wide environment instance shared by every request; /reset rebinds it.
current_env = EmailTriageEnv(task_id="task_easy")

# Per-task round-robin cursor used by /reset when the client does not pin a
# scenario index. Every known task starts at scenario 0.
SCENARIO_COUNTERS = dict.fromkeys(list_task_ids(), 0)

# Scenario split used for resets unless client overrides are enabled.
DEFAULT_EVAL_SPLIT = os.environ.get("OPENENV_EVAL_SPLIT", "public")

# Clients may choose eval_split/scenario_index only when this env flag is the
# literal string "true" (case-insensitive, surrounding whitespace ignored).
ALLOW_CLIENT_EVAL_OVERRIDE = (
    "true"
    == os.environ.get("OPENENV_ALLOW_CLIENT_EVAL_OVERRIDE", "false").strip().lower()
)
|
| 587 |
+
|
| 588 |
+
|
| 589 |
+
@app.get("/")
def root_page():
    """Serve the embedded single-page frontend as an HTML document.

    Returns:
        Flask response whose body is FRONTEND_HTML with a text/html mimetype.
    """
    html_response = Response(FRONTEND_HTML, mimetype="text/html")
    return html_response
|
| 593 |
+
|
| 594 |
+
|
| 595 |
+
@app.get("/meta")
def root_endpoint():
    """Describe the service for health checks and machine clients.

    Returns:
        Flask JSON response with the service name, a static "ok" status, the
        POST endpoints, public scenario pool sizes per task, the default eval
        split, and the accepted production runtime control values.
    """
    # All three environment operations are exposed as POST routes.
    endpoint_table = {
        "reset": {"method": "POST", "path": "/reset"},
        "step": {"method": "POST", "path": "/step"},
        "state": {"method": "POST", "path": "/state"},
    }

    # Only the public split sizes are reported here.
    public_pool_sizes = {}
    for task_id in list_task_ids():
        public_pool_sizes[task_id] = get_task_scenario_count(task_id, "public")

    runtime_controls = {
        "production_profile": ["light", "standard", "heavy"],
        "business_hours_mode": [True, False],
        "escalation_mode": ["low", "normal", "high"],
    }

    metadata = {
        "name": "email-triage-env",
        "status": "ok",
        "endpoints": endpoint_table,
        "scenario_pools": {"public": public_pool_sizes},
        "eval_split": DEFAULT_EVAL_SPLIT,
        "production_runtime_controls": runtime_controls,
    }
    return jsonify(metadata)
|
| 621 |
+
|
| 622 |
+
|
| 623 |
+
@app.post("/reset")
def reset_endpoint():
    """Reset the environment with a selected task and return ResetResult JSON.

    The JSON body is optional. Recognised fields:
        task_id: Task identifier string (default "task_easy").
        production_profile / business_hours_mode / escalation_mode: Runtime
            controls, read only when task_id is "task_production".
        eval_split / scenario_index: Accepted only when
            ALLOW_CLIENT_EVAL_OVERRIDE is enabled; otherwise their presence
            is rejected with HTTP 400.

    When no scenario index is pinned, the per-task counter in
    SCENARIO_COUNTERS supplies the index and is advanced round-robin.

    Returns:
        Flask response containing reset payload, or a 400 JSON error for an
        invalid payload or a KeyError raised while building the environment.
    """
    # Only current_env is rebound; SCENARIO_COUNTERS is mutated in place and
    # therefore needs no global declaration.
    global current_env

    payload = request.get_json(silent=True)
    if payload is None:
        # A missing or unparseable body means "use defaults".
        payload = {}
    elif not isinstance(payload, dict):
        return jsonify({"error": "Malformed JSON payload."}), 400

    task_id = payload.get("task_id", "task_easy")
    if not isinstance(task_id, str):
        return jsonify({"error": "Field 'task_id' must be a string."}), 400

    runtime_options: dict[str, object] = {}
    if task_id == "task_production":
        parsed_options, option_error = _parse_production_options(payload)
        if option_error is not None:
            return option_error
        runtime_options = parsed_options

    if not ALLOW_CLIENT_EVAL_OVERRIDE and (
        "eval_split" in payload or "scenario_index" in payload
    ):
        return jsonify(
            {
                "error": (
                    "Client overrides for eval_split/scenario_index are disabled "
                    "by server policy."
                )
            }
        ), 400

    eval_split = DEFAULT_EVAL_SPLIT
    requested_index = None
    if ALLOW_CLIENT_EVAL_OVERRIDE:
        requested_split = payload.get("eval_split", DEFAULT_EVAL_SPLIT)
        if not isinstance(requested_split, str):
            return jsonify({"error": "Field 'eval_split' must be a string."}), 400
        eval_split = requested_split
        requested_index = payload.get("scenario_index")

    # bool is a subclass of int, so JSON true/false must be rejected
    # explicitly or they would be accepted as scenario 1/0.
    if requested_index is not None and (
        isinstance(requested_index, bool)
        or not isinstance(requested_index, int)
        or requested_index < 0
    ):
        return jsonify({"error": "Field 'scenario_index' must be a non-negative integer."}), 400

    try:
        scenario_count = get_task_scenario_count(task_id, eval_split)
        if requested_index is None:
            scenario_index = SCENARIO_COUNTERS.get(task_id, 0)
            if scenario_count > 0:
                # Advance the cursor so the next reset gets the next scenario.
                SCENARIO_COUNTERS[task_id] = (scenario_index + 1) % scenario_count
        else:
            scenario_index = requested_index

        current_env = EmailTriageEnv(
            task_id=task_id,
            scenario_index=scenario_index,
            split=eval_split,
            runtime_options=runtime_options,
        )
        reset_result = current_env.reset()
    except KeyError as error:
        return jsonify({"error": str(error)}), 400

    return jsonify(reset_result.model_dump())


def _parse_production_options(payload):
    """Validate the production-only runtime controls of a /reset payload.

    Args:
        payload: Decoded JSON object from the request body.

    Returns:
        A pair (options, error). On success, options is the runtime options
        dict and error is None. On failure, options is None and error is a
        (response, 400) tuple ready to be returned from the view.
    """
    production_profile = payload.get("production_profile", "standard")
    if not isinstance(production_profile, str) or production_profile not in {
        "light",
        "standard",
        "heavy",
    }:
        return None, (
            jsonify(
                {
                    "error": (
                        "Field 'production_profile' must be one of "
                        "light/standard/heavy."
                    )
                }
            ),
            400,
        )

    escalation_mode = payload.get("escalation_mode", "normal")
    if not isinstance(escalation_mode, str) or escalation_mode not in {
        "low",
        "normal",
        "high",
    }:
        return None, (
            jsonify(
                {
                    "error": (
                        "Field 'escalation_mode' must be one of "
                        "low/normal/high."
                    )
                }
            ),
            400,
        )

    business_hours_mode = payload.get("business_hours_mode", False)
    if isinstance(business_hours_mode, str):
        # String flags are accepted leniently; anything outside the truthy
        # spellings below is treated as False.
        business_hours_mode = business_hours_mode.strip().lower() in {
            "1",
            "true",
            "yes",
            "on",
        }
    elif not isinstance(business_hours_mode, bool):
        return None, (
            jsonify({"error": "Field 'business_hours_mode' must be boolean."}),
            400,
        )

    return {
        "production_profile": production_profile,
        "business_hours_mode": business_hours_mode,
        "escalation_mode": escalation_mode,
    }, None
|
| 741 |
+
|
| 742 |
+
|
| 743 |
+
@app.post("/step")
def step_endpoint():
    """Advance environment by one action and return StepResult JSON.

    The request body must be a JSON object; it is handed unchanged to
    ``current_env.step``.

    Returns:
        Flask response containing step payload, or a 400 JSON error when the
        body is missing, unparseable, or not a JSON object.
    """
    payload = request.get_json(silent=True)
    # None covers a missing/unparseable body; the dict check additionally
    # rejects valid JSON that is not an object (arrays, strings, numbers),
    # matching how /reset treats non-object payloads.
    if not isinstance(payload, dict):
        return jsonify({"error": "Malformed JSON payload."}), 400

    step_result = current_env.step(payload)
    return jsonify(step_result.model_dump())
|
| 756 |
+
|
| 757 |
+
|
| 758 |
+
@app.post("/state")
def state_endpoint():
    """Report the current environment state as JSON.

    The request body is not read; the snapshot comes from
    ``current_env.state()``.

    Returns:
        Flask response containing state payload.
    """
    return jsonify(current_env.state().model_dump())
|
| 767 |
+
|
| 768 |
+
|
| 769 |
+
def main() -> None:
    """Start Flask's built-in server on all interfaces, port 7860."""
    bind_settings = {"host": "0.0.0.0", "port": 7860}
    app.run(**bind_settings)


# Allow launching the server directly as a script.
if __name__ == "__main__":
    main()
|
environment.py
ADDED
|
@@ -0,0 +1,469 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Core OpenEnv email triage environment implementation."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
from typing import cast
|
| 5 |
+
|
| 6 |
+
from pydantic import ValidationError
|
| 7 |
+
|
| 8 |
+
from graders import grade_easy, grade_hard, grade_medium_step
|
| 9 |
+
from models import (
|
| 10 |
+
EmailObservation,
|
| 11 |
+
EnvironmentState,
|
| 12 |
+
ResetResult,
|
| 13 |
+
RewardResult,
|
| 14 |
+
StepResult,
|
| 15 |
+
TriageAction,
|
| 16 |
+
)
|
| 17 |
+
from tasks import get_task_definition
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class EmailTriageEnv:
|
| 21 |
+
"""Deterministic email triage environment implementing reset, step, and state."""
|
| 22 |
+
|
| 23 |
+
def __init__(
    self,
    task_id: str,
    scenario_index: int = 0,
    split: str | None = None,
    runtime_options: dict[str, object] | None = None,
) -> None:
    """Load the task definition and put the episode into its initial state.

    Args:
        task_id: Task identifier such as task_easy, task_medium, or task_hard.
        scenario_index: Scenario index within the task pool; negative
            values are clamped to 0.
        split: Scenario split name. When falsy, the OPENENV_EVAL_SPLIT
            environment variable is used, defaulting to "public".
        runtime_options: Optional runtime controls forwarded to task
            generation; a falsy value becomes an empty dict.
    """
    self.task_id = task_id
    self._episode_index = scenario_index if scenario_index > 0 else 0
    self.split = split if split else os.getenv("OPENENV_EVAL_SPLIT", "public")
    self.runtime_options = runtime_options if runtime_options else {}

    definition = get_task_definition(
        task_id,
        self._episode_index,
        self.split,
        self.runtime_options,
    )
    self._task_definition = definition
    self._scenario_id = str(definition.get("scenario_id", "unknown"))
    self._emails = cast(list[dict[str, object]], definition.get("emails", []))
    self._ground_truth = cast(
        list[dict[str, object]], definition.get("ground_truth", [])
    )

    # Episode cursors and termination flag.
    self._current_index = 0
    self._current_step = 0
    self._done = False
    # Step cap: five more than the number of emails, but never below 10.
    self._max_steps = max(10, len(self._emails) + 5)

    # Per-episode histories.
    self._action_history: list[TriageAction] = []
    self._reward_history: list[float] = []
    self._base_score_history: list[float] = []

    # Follow-up generation defaults, set before the runtime-control hook runs.
    self._generated_followups = 0
    self._max_generated_followups = 4
    self._followup_quality_threshold = 0.7
    self._configure_runtime_controls()
|
| 65 |
+
|
| 66 |
+
def reset(self) -> ResetResult:
|
| 67 |
+
"""Reset episode state and return the first observation.
|
| 68 |
+
|
| 69 |
+
Returns:
|
| 70 |
+
ResetResult containing first observation and metadata.
|
| 71 |
+
"""
|
| 72 |
+
self._task_definition = get_task_definition(
|
| 73 |
+
self.task_id,
|
| 74 |
+
self._episode_index,
|
| 75 |
+
self.split,
|
| 76 |
+
self.runtime_options,
|
| 77 |
+
)
|
| 78 |
+
self._scenario_id = str(self._task_definition.get("scenario_id", "unknown"))
|
| 79 |
+
self._emails = cast(list[dict[str, object]], self._task_definition.get("emails", []))
|
| 80 |
+
self._ground_truth = cast(
|
| 81 |
+
list[dict[str, object]], self._task_definition.get("ground_truth", [])
|
| 82 |
+
)
|
| 83 |
+
|
| 84 |
+
self._current_index = 0
|
| 85 |
+
self._current_step = 0
|
| 86 |
+
self._done = False
|
| 87 |
+
self._max_steps = max(10, len(self._emails) + 5)
|
| 88 |
+
self._action_history = []
|
| 89 |
+
self._reward_history = []
|
| 90 |
+
self._base_score_history = []
|
| 91 |
+
self._generated_followups = 0
|
| 92 |
+
self._configure_runtime_controls()
|
| 93 |
+
self._episode_index += 1
|
| 94 |
+
|
| 95 |
+
first_observation = self._build_observation(self._current_index)
|
| 96 |
+
return ResetResult(
|
| 97 |
+
observation=first_observation,
|
| 98 |
+
info={
|
| 99 |
+
"task_id": self.task_id,
|
| 100 |
+
"scenario_id": self._scenario_id,
|
| 101 |
+
"split": self.split,
|
| 102 |
+
"step": self._current_step,
|
| 103 |
+
"emails_total": len(self._emails),
|
| 104 |
+
"task_description": str(self._task_definition.get("description", "")),
|
| 105 |
+
},
|
| 106 |
+
)
|
| 107 |
+
|
| 108 |
+
def step(self, action: TriageAction) -> StepResult:
|
| 109 |
+
"""Apply an action and return StepResult.
|
| 110 |
+
|
| 111 |
+
Args:
|
| 112 |
+
action: Proposed triage action.
|
| 113 |
+
|
| 114 |
+
Returns:
|
| 115 |
+
StepResult with next observation, reward, done flag, and metadata.
|
| 116 |
+
"""
|
| 117 |
+
if self._done:
|
| 118 |
+
return StepResult(
|
| 119 |
+
observation=self._terminal_observation(),
|
| 120 |
+
reward=0.0,
|
| 121 |
+
done=True,
|
| 122 |
+
info={
|
| 123 |
+
"task_id": self.task_id,
|
| 124 |
+
"scenario_id": self._scenario_id,
|
| 125 |
+
"split": self.split,
|
| 126 |
+
"step": self._current_step,
|
| 127 |
+
"already_done": True,
|
| 128 |
+
},
|
| 129 |
+
)
|
| 130 |
+
|
| 131 |
+
try:
|
| 132 |
+
validated_action = TriageAction.model_validate(action)
|
| 133 |
+
except ValidationError as validation_error:
|
| 134 |
+
self._current_step += 1
|
| 135 |
+
self._reward_history.append(0.0)
|
| 136 |
+
self._done = self._current_step >= self._max_steps
|
| 137 |
+
return StepResult(
|
| 138 |
+
observation=self._build_observation(self._current_index),
|
| 139 |
+
reward=0.0,
|
| 140 |
+
done=self._done,
|
| 141 |
+
info={
|
| 142 |
+
"task_id": self.task_id,
|
| 143 |
+
"scenario_id": self._scenario_id,
|
| 144 |
+
"split": self.split,
|
| 145 |
+
"step": self._current_step,
|
| 146 |
+
"emails_total": len(self._emails),
|
| 147 |
+
"emails_processed": self._current_index,
|
| 148 |
+
"emails_remaining": max(len(self._emails) - self._current_index, 0),
|
| 149 |
+
"validation_error": str(validation_error),
|
| 150 |
+
},
|
| 151 |
+
)
|
| 152 |
+
|
| 153 |
+
base_result = self._grade_current_step(validated_action)
|
| 154 |
+
base_score = base_result.score
|
| 155 |
+
previous_base_score = self._base_score_history[-1] if self._base_score_history else None
|
| 156 |
+
progress_signal = self._compute_progress_signal(base_score, previous_base_score)
|
| 157 |
+
|
| 158 |
+
truth_for_step = (
|
| 159 |
+
self._ground_truth[min(self._current_index, len(self._ground_truth) - 1)]
|
| 160 |
+
if self._ground_truth
|
| 161 |
+
else {}
|
| 162 |
+
)
|
| 163 |
+
self._maybe_enqueue_follow_up(validated_action, truth_for_step, base_score)
|
| 164 |
+
|
| 165 |
+
self._action_history.append(validated_action)
|
| 166 |
+
self._base_score_history.append(base_score)
|
| 167 |
+
self._current_step += 1
|
| 168 |
+
|
| 169 |
+
penalties = self._compute_penalties(validated_action)
|
| 170 |
+
trajectory_bonus = self._compute_trajectory_bonus()
|
| 171 |
+
step_cost = self._compute_step_cost()
|
| 172 |
+
final_reward = self._clip_reward(
|
| 173 |
+
base_score + progress_signal + trajectory_bonus - penalties - step_cost
|
| 174 |
+
)
|
| 175 |
+
|
| 176 |
+
self._reward_history.append(final_reward)
|
| 177 |
+
|
| 178 |
+
if self._current_index < len(self._emails):
|
| 179 |
+
self._current_index += 1
|
| 180 |
+
|
| 181 |
+
all_emails_processed = self._current_index >= len(self._emails)
|
| 182 |
+
self._done = all_emails_processed or self._current_step >= self._max_steps
|
| 183 |
+
|
| 184 |
+
next_observation = (
|
| 185 |
+
self._terminal_observation()
|
| 186 |
+
if self._done
|
| 187 |
+
else self._build_observation(self._current_index)
|
| 188 |
+
)
|
| 189 |
+
|
| 190 |
+
info = {
|
| 191 |
+
"task_id": self.task_id,
|
| 192 |
+
"scenario_id": self._scenario_id,
|
| 193 |
+
"split": self.split,
|
| 194 |
+
"step": self._current_step,
|
| 195 |
+
"emails_total": len(self._emails),
|
| 196 |
+
"emails_processed": min(self._current_index, len(self._emails)),
|
| 197 |
+
"emails_remaining": max(len(self._emails) - self._current_index, 0),
|
| 198 |
+
"base_score": round(base_score, 4),
|
| 199 |
+
"progress_signal": round(progress_signal, 4),
|
| 200 |
+
"step_cost": round(step_cost, 4),
|
| 201 |
+
"penalties": round(penalties, 4),
|
| 202 |
+
"trajectory_bonus": round(trajectory_bonus, 4),
|
| 203 |
+
"grading_feedback": base_result.feedback,
|
| 204 |
+
}
|
| 205 |
+
for breakdown_key, breakdown_value in base_result.breakdown.items():
|
| 206 |
+
if isinstance(breakdown_value, (int, float)):
|
| 207 |
+
info[f"grade_{breakdown_key}"] = round(float(breakdown_value), 4)
|
| 208 |
+
|
| 209 |
+
return StepResult(
|
| 210 |
+
observation=next_observation,
|
| 211 |
+
reward=final_reward,
|
| 212 |
+
done=self._done,
|
| 213 |
+
info=info,
|
| 214 |
+
)
|
| 215 |
+
|
| 216 |
+
def _maybe_enqueue_follow_up(
|
| 217 |
+
self,
|
| 218 |
+
action: TriageAction,
|
| 219 |
+
truth: dict[str, object],
|
| 220 |
+
base_score: float,
|
| 221 |
+
) -> None:
|
| 222 |
+
"""Insert deterministic escalation follow-up emails for production mode."""
|
| 223 |
+
if self.task_id != "task_production":
|
| 224 |
+
return
|
| 225 |
+
if self._generated_followups >= self._max_generated_followups:
|
| 226 |
+
return
|
| 227 |
+
if not self._emails:
|
| 228 |
+
return
|
| 229 |
+
|
| 230 |
+
expected_label = str(truth.get("label", ""))
|
| 231 |
+
expected_route = str(truth.get("route_to", "general"))
|
| 232 |
+
is_missed_critical = (
|
| 233 |
+
expected_label == "urgent"
|
| 234 |
+
and (action.label != "urgent" or expected_route not in action.route_to.lower())
|
| 235 |
+
)
|
| 236 |
+
if not is_missed_critical and base_score >= self._followup_quality_threshold:
|
| 237 |
+
return
|
| 238 |
+
|
| 239 |
+
source_email = self._emails[min(self._current_index, len(self._emails) - 1)]
|
| 240 |
+
source_subject = str(source_email.get("subject", "Inbox incident"))
|
| 241 |
+
source_timestamp = str(source_email.get("timestamp", "2026-04-03T00:00:00Z"))
|
| 242 |
+
|
| 243 |
+
followup_email = {
|
| 244 |
+
"email_id": f"followup-{self._scenario_id}-{self._generated_followups + 1}",
|
| 245 |
+
"subject": f"Escalation follow-up: {source_subject}",
|
| 246 |
+
"body": (
|
| 247 |
+
"Automated escalation triggered because prior triage appears incomplete. "
|
| 248 |
+
"Please route to the responsible team and provide a clear summary now."
|
| 249 |
+
),
|
| 250 |
+
"sender": "incident-control@acme-enterprise.com",
|
| 251 |
+
"timestamp": source_timestamp,
|
| 252 |
+
"thread_history": [f"Previous message subject: {source_subject}"],
|
| 253 |
+
}
|
| 254 |
+
followup_truth = {
|
| 255 |
+
"label": "urgent",
|
| 256 |
+
"route_to": expected_route,
|
| 257 |
+
"priority_weight": min(max(float(truth.get("priority_weight", 1.5)) + 0.2, 1.5), 2.0),
|
| 258 |
+
"summary_keywords": ["escalation", "follow-up", expected_route],
|
| 259 |
+
}
|
| 260 |
+
|
| 261 |
+
insert_at = min(self._current_index + 1, len(self._emails))
|
| 262 |
+
self._emails.insert(insert_at, followup_email)
|
| 263 |
+
self._ground_truth.insert(insert_at, followup_truth)
|
| 264 |
+
self._generated_followups += 1
|
| 265 |
+
|
| 266 |
+
def _configure_runtime_controls(self) -> None:
|
| 267 |
+
"""Apply deterministic runtime control options for production simulator."""
|
| 268 |
+
if self.task_id != "task_production":
|
| 269 |
+
self._max_generated_followups = 4
|
| 270 |
+
self._followup_quality_threshold = 0.7
|
| 271 |
+
return
|
| 272 |
+
|
| 273 |
+
escalation_mode = str(self.runtime_options.get("escalation_mode", "normal")).lower()
|
| 274 |
+
escalation_map = {
|
| 275 |
+
"low": (2, 0.55),
|
| 276 |
+
"normal": (4, 0.7),
|
| 277 |
+
"high": (8, 0.85),
|
| 278 |
+
}
|
| 279 |
+
max_followups, threshold = escalation_map.get(escalation_mode, escalation_map["normal"])
|
| 280 |
+
self._max_generated_followups = max_followups
|
| 281 |
+
self._followup_quality_threshold = threshold
|
| 282 |
+
|
| 283 |
+
def state(self) -> EnvironmentState:
|
| 284 |
+
"""Return read-only snapshot of full internal state.
|
| 285 |
+
|
| 286 |
+
Returns:
|
| 287 |
+
EnvironmentState with progress and history.
|
| 288 |
+
"""
|
| 289 |
+
return EnvironmentState(
|
| 290 |
+
task_id=self.task_id,
|
| 291 |
+
current_step=self._current_step,
|
| 292 |
+
total_steps=self._max_steps,
|
| 293 |
+
done=self._done,
|
| 294 |
+
action_history=list(self._action_history),
|
| 295 |
+
reward_history=list(self._reward_history),
|
| 296 |
+
)
|
| 297 |
+
|
| 298 |
+
def _build_observation(self, email_index: int) -> EmailObservation:
|
| 299 |
+
"""Build observation for the email at a given index.
|
| 300 |
+
|
| 301 |
+
Args:
|
| 302 |
+
email_index: Zero-based email index.
|
| 303 |
+
|
| 304 |
+
Returns:
|
| 305 |
+
EmailObservation for the selected email or terminal placeholder.
|
| 306 |
+
"""
|
| 307 |
+
if not self._emails:
|
| 308 |
+
return self._terminal_observation()
|
| 309 |
+
|
| 310 |
+
safe_index = min(max(email_index, 0), len(self._emails) - 1)
|
| 311 |
+
email_payload = self._emails[safe_index]
|
| 312 |
+
|
| 313 |
+
return EmailObservation(
|
| 314 |
+
email_id=str(email_payload.get("email_id", "")),
|
| 315 |
+
subject=str(email_payload.get("subject", "")),
|
| 316 |
+
body=str(email_payload.get("body", "")),
|
| 317 |
+
sender=str(email_payload.get("sender", "")),
|
| 318 |
+
timestamp=str(email_payload.get("timestamp", "")),
|
| 319 |
+
thread_history=[str(item) for item in email_payload.get("thread_history", [])],
|
| 320 |
+
task_id=self.task_id,
|
| 321 |
+
step_number=self._current_step,
|
| 322 |
+
total_emails=len(self._emails),
|
| 323 |
+
)
|
| 324 |
+
|
| 325 |
+
def _terminal_observation(self) -> EmailObservation:
|
| 326 |
+
"""Build terminal observation returned when episode is complete.
|
| 327 |
+
|
| 328 |
+
Returns:
|
| 329 |
+
Terminal EmailObservation payload.
|
| 330 |
+
"""
|
| 331 |
+
return EmailObservation(
|
| 332 |
+
email_id="terminal",
|
| 333 |
+
subject="Episode complete",
|
| 334 |
+
body="No further emails remain for this task.",
|
| 335 |
+
sender="system",
|
| 336 |
+
timestamp="",
|
| 337 |
+
thread_history=[],
|
| 338 |
+
task_id=self.task_id,
|
| 339 |
+
step_number=self._current_step,
|
| 340 |
+
total_emails=len(self._emails),
|
| 341 |
+
)
|
| 342 |
+
|
| 343 |
+
def _grade_current_step(self, action: TriageAction) -> RewardResult:
|
| 344 |
+
"""Select deterministic grader based on task and current progress.
|
| 345 |
+
|
| 346 |
+
Args:
|
| 347 |
+
action: Validated action for the current step.
|
| 348 |
+
|
| 349 |
+
Returns:
|
| 350 |
+
RewardResult from task-specific grader.
|
| 351 |
+
"""
|
| 352 |
+
if not self._ground_truth:
|
| 353 |
+
return RewardResult(
|
| 354 |
+
score=0.0,
|
| 355 |
+
breakdown={"missing_ground_truth": 1.0},
|
| 356 |
+
feedback="Missing ground truth for task.",
|
| 357 |
+
)
|
| 358 |
+
|
| 359 |
+
if self.task_id == "task_easy":
|
| 360 |
+
truth = self._ground_truth[min(self._current_index, len(self._ground_truth) - 1)]
|
| 361 |
+
return grade_easy(action, truth)
|
| 362 |
+
|
| 363 |
+
if self.task_id == "task_medium":
|
| 364 |
+
truth = self._ground_truth[min(self._current_index, len(self._ground_truth) - 1)]
|
| 365 |
+
return grade_medium_step(action, truth)
|
| 366 |
+
|
| 367 |
+
truth = self._ground_truth[min(self._current_index, len(self._ground_truth) - 1)]
|
| 368 |
+
return grade_hard(action, truth)
|
| 369 |
+
|
| 370 |
+
def _compute_penalties(self, action: TriageAction) -> float:
|
| 371 |
+
"""Compute deterministic penalties according to reward policy.
|
| 372 |
+
|
| 373 |
+
Args:
|
| 374 |
+
action: Validated action for the step.
|
| 375 |
+
|
| 376 |
+
Returns:
|
| 377 |
+
Total penalty value for current step.
|
| 378 |
+
"""
|
| 379 |
+
penalty_total = 0.0
|
| 380 |
+
|
| 381 |
+
summary_too_short = len(action.summary.strip()) < 10
|
| 382 |
+
if action.label == "archive" and summary_too_short:
|
| 383 |
+
penalty_total += 0.5
|
| 384 |
+
|
| 385 |
+
if self._is_repeated_action_pattern(action):
|
| 386 |
+
penalty_total += 0.3
|
| 387 |
+
|
| 388 |
+
return penalty_total
|
| 389 |
+
|
| 390 |
+
def _compute_progress_signal(
|
| 391 |
+
self,
|
| 392 |
+
base_score: float,
|
| 393 |
+
previous_base_score: float | None,
|
| 394 |
+
) -> float:
|
| 395 |
+
"""Compute dense partial-progress reward independent of final completion.
|
| 396 |
+
|
| 397 |
+
Args:
|
| 398 |
+
base_score: Current-step base grade in [0.0, 1.0].
|
| 399 |
+
previous_base_score: Previous step base grade when available.
|
| 400 |
+
|
| 401 |
+
Returns:
|
| 402 |
+
Small positive/negative signal reflecting progress and quality trend.
|
| 403 |
+
"""
|
| 404 |
+
total_emails = max(len(self._emails), 1)
|
| 405 |
+
progress_ratio = min(1.0, (self._current_index + 1) / total_emails)
|
| 406 |
+
|
| 407 |
+
completion_signal = 0.05 * progress_ratio
|
| 408 |
+
quality_signal = 0.05 * self._clip_reward(base_score)
|
| 409 |
+
|
| 410 |
+
trend_signal = 0.0
|
| 411 |
+
if previous_base_score is not None:
|
| 412 |
+
delta = base_score - previous_base_score
|
| 413 |
+
trend_signal = max(-0.02, min(0.03, delta * 0.1))
|
| 414 |
+
|
| 415 |
+
return completion_signal + quality_signal + trend_signal
|
| 416 |
+
|
| 417 |
+
def _compute_step_cost(self) -> float:
|
| 418 |
+
"""Return a gentle efficiency cost that grows with episode length."""
|
| 419 |
+
normalized_step = self._current_step / max(self._max_steps, 1)
|
| 420 |
+
return 0.005 + (0.01 * normalized_step)
|
| 421 |
+
|
| 422 |
+
def _compute_trajectory_bonus(self) -> float:
|
| 423 |
+
"""Return trajectory bonus when episode completion quality is high.
|
| 424 |
+
|
| 425 |
+
Returns:
|
| 426 |
+
0.2 when mean base score is above threshold at completion, else 0.0.
|
| 427 |
+
"""
|
| 428 |
+
if not self._base_score_history:
|
| 429 |
+
return 0.0
|
| 430 |
+
|
| 431 |
+
all_emails_done_after_step = self._current_index + 1 >= len(self._emails)
|
| 432 |
+
if not all_emails_done_after_step:
|
| 433 |
+
return 0.0
|
| 434 |
+
|
| 435 |
+
mean_base = sum(self._base_score_history) / len(self._base_score_history)
|
| 436 |
+
return 0.2 if mean_base > 0.8 else 0.0
|
| 437 |
+
|
| 438 |
+
def _is_repeated_action_pattern(self, action: TriageAction) -> bool:
|
| 439 |
+
"""Detect whether same action appears three times consecutively.
|
| 440 |
+
|
| 441 |
+
Args:
|
| 442 |
+
action: Current action.
|
| 443 |
+
|
| 444 |
+
Returns:
|
| 445 |
+
True when repeated label and route occur three times in a row.
|
| 446 |
+
"""
|
| 447 |
+
if len(self._action_history) < 2:
|
| 448 |
+
return False
|
| 449 |
+
|
| 450 |
+
previous_action = self._action_history[-1]
|
| 451 |
+
older_action = self._action_history[-2]
|
| 452 |
+
|
| 453 |
+
return (
|
| 454 |
+
previous_action.label == older_action.label == action.label
|
| 455 |
+
and previous_action.route_to.strip().lower()
|
| 456 |
+
== older_action.route_to.strip().lower()
|
| 457 |
+
== action.route_to.strip().lower()
|
| 458 |
+
)
|
| 459 |
+
|
| 460 |
+
def _clip_reward(self, reward_value: float) -> float:
|
| 461 |
+
"""Clip reward to the inclusive range [-1.0, 1.0].
|
| 462 |
+
|
| 463 |
+
Args:
|
| 464 |
+
reward_value: Raw reward value.
|
| 465 |
+
|
| 466 |
+
Returns:
|
| 467 |
+
Clipped reward.
|
| 468 |
+
"""
|
| 469 |
+
return max(-1.0, min(1.0, reward_value))
|
graders.py
ADDED
|
@@ -0,0 +1,296 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deterministic graders for OpenEnv email triage tasks."""
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
|
| 5 |
+
from models import RewardResult, TriageAction
|
| 6 |
+
|
| 7 |
+
# Canonical route name -> lowercase aliases that count as that route when
# they appear inside the agent's free-form route text.
ROUTE_ALIAS_MAP = {
    "billing": ["billing", "finance", "payments", "accounts"],
    "safety": ["safety", "compliance", "risk"],
    "engineering": ["engineering", "eng", "sre", "platform", "on-call"],
    "support": ["support", "helpdesk", "customer support"],
    "general": ["general", "inbox", "operations"],
}
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def _clip_score(score_value: float) -> float:
|
| 17 |
+
"""Clip a score to the inclusive range [0.0, 1.0].
|
| 18 |
+
|
| 19 |
+
Args:
|
| 20 |
+
score_value: Raw score.
|
| 21 |
+
|
| 22 |
+
Returns:
|
| 23 |
+
Clipped score.
|
| 24 |
+
"""
|
| 25 |
+
return max(0.0, min(1.0, score_value))
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def _normalized_text(text_value: str) -> str:
|
| 29 |
+
"""Return normalized lowercase text for deterministic comparisons.
|
| 30 |
+
|
| 31 |
+
Args:
|
| 32 |
+
text_value: Input text.
|
| 33 |
+
|
| 34 |
+
Returns:
|
| 35 |
+
Normalized text.
|
| 36 |
+
"""
|
| 37 |
+
return text_value.strip().lower()
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _route_matches(action_route: str, expected_route: str) -> bool:
    """Tell whether the agent's route text covers the expected route.

    Args:
        action_route: Free-form route supplied by the agent.
        expected_route: Route named by the ground truth.

    Returns:
        True when the normalized expected route is non-empty and is among
        the canonical routes derived from the action route.
    """
    wanted_route = _normalized_text(expected_route)
    return bool(wanted_route) and wanted_route in _canonical_route_tokens(action_route)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def _canonical_route_tokens(action_route: str) -> set[str]:
    """Map free-form route text to canonical route categories.

    Every category whose alias occurs (as a substring) in the normalized
    route text is reported, whether the teams are separated by ``, ; / |``
    or joined by plain words such as "safety and billing".

    Args:
        action_route: Free-form route supplied by the agent.

    Returns:
        Set of canonical route names found in the text; empty when the text
        is blank or mentions no known alias.
    """
    normalized_action = _normalized_text(action_route)
    if not normalized_action:
        return set()

    # Scan the whole text for every category. Stopping at the first category
    # matched per fragment dropped additional teams named without a
    # separator, and a later whole-text fallback could never recover them
    # because it only ran when nothing had matched at all.
    return {
        route_name
        for route_name, aliases in ROUTE_ALIAS_MAP.items()
        if any(alias in normalized_action for alias in aliases)
    }
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def _route_noise_penalty(action_route: str) -> float:
    """Penalize routing one action to more than two teams.

    Args:
        action_route: Free-form route supplied by the agent.

    Returns:
        0.0 for up to two canonical routes, otherwise 0.08 per additional
        route, capped at 0.24.
    """
    route_count = len(_canonical_route_tokens(action_route))
    if route_count > 2:
        return min(0.24, 0.08 * (route_count - 2))
    return 0.0
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def _summary_keyword_score(summary_text: str, ground_truth: dict) -> float:
    """Rate a summary by how many expected keywords it contains.

    When the ground truth supplies no usable keyword list, the summary
    earns 1.0 if its stripped length is at least 10 characters, else 0.0.
    Otherwise the matched-keyword fraction is scaled down for very short
    summaries (under 4 words), for summaries longer than 40 words, and for
    comma-heavy summaries that look like keyword lists.

    Args:
        summary_text: Summary written by the agent.
        ground_truth: Ground-truth dict, optionally with "summary_keywords".

    Returns:
        Score in [0.0, 1.0].
    """

    def _length_only_score() -> float:
        # Used when there are no keywords to compare against.
        if len(summary_text.strip()) >= 10:
            return 1.0
        return 0.0

    raw_keywords = ground_truth.get("summary_keywords", [])
    if not isinstance(raw_keywords, list):
        return _length_only_score()

    keywords = []
    for raw_keyword in raw_keywords:
        cleaned = _normalized_text(str(raw_keyword))
        if cleaned:
            keywords.append(cleaned)
    if not keywords:
        return _length_only_score()

    normalized_summary = _normalized_text(summary_text)
    matches = sum(1 for keyword in keywords if keyword in normalized_summary)
    base_score = matches / len(keywords)

    # Discourage keyword stuffing and overly verbose summaries.
    word_count = len(re.findall(r"[a-z0-9'-]+", normalized_summary))
    if word_count > 40:
        brevity_factor = max(0.45, 1.0 - (word_count - 40) * 0.02)
    elif word_count >= 4:
        brevity_factor = 1.0
    else:
        brevity_factor = 0.6

    looks_like_list = normalized_summary.count(",") >= 6 and matches >= 3
    list_like_penalty = 0.85 if looks_like_list else 1.0
    return _clip_score(base_score * brevity_factor * list_like_penalty)
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def grade_easy(action: TriageAction, ground_truth: dict) -> RewardResult:
    """Score one easy-task action with partial credit.

    Weights: 0.6 for the correct label, 0.25 for a matching route and up to
    0.15 for summary quality, minus the route-noise penalty.

    Args:
        action: Agent action for a single email.
        ground_truth: Expected label, route and optional summary keywords.

    Returns:
        RewardResult whose score lies in [0.0, 1.0].
    """
    wanted_label = _normalized_text(str(ground_truth.get("label", "")))
    wanted_route = _normalized_text(str(ground_truth.get("route_to", "")))

    label_correct = _normalized_text(action.label) == wanted_label
    route_correct = _route_matches(action.route_to, wanted_route)
    summary_score = _summary_keyword_score(action.summary, ground_truth)
    noise_penalty = _route_noise_penalty(action.route_to)

    label_points = 0.6 if label_correct else 0.0
    route_points = 0.25 if route_correct else 0.0
    total = label_points + route_points + 0.15 * summary_score - noise_penalty

    return RewardResult(
        score=_clip_score(total),
        breakdown={
            "label_match": 1.0 if label_correct else 0.0,
            "route_match": 1.0 if route_correct else 0.0,
            "summary_match": round(summary_score, 4),
            "route_noise_penalty": round(noise_penalty, 4),
        },
        feedback="Easy-task grading completed with context summary scoring.",
    )
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def grade_medium_step(action: TriageAction, truth: dict) -> RewardResult:
    """Score a single medium-task action, independent of earlier steps.

    Weights: 0.55 for the correct label, 0.3 for a matching route and up to
    0.15 for summary quality, minus the route-noise penalty. The clipped
    per-email score is multiplied by the priority weight (floored at 0.1,
    capped at 2.0) and clipped again to [0.0, 1.0].

    Args:
        action: Agent action for a single email.
        truth: Expected label, route, priority weight and summary keywords.

    Returns:
        RewardResult with the weighted step score and its components.
    """
    wanted_label = _normalized_text(str(truth.get("label", "")))
    wanted_route = _normalized_text(str(truth.get("route_to", "")))
    priority_weight = max(float(truth.get("priority_weight", 1.0)), 0.1)

    label_correct = _normalized_text(action.label) == wanted_label
    route_correct = _route_matches(action.route_to, wanted_route)
    summary_score = _summary_keyword_score(action.summary, truth)
    noise_penalty = _route_noise_penalty(action.route_to)

    label_points = 0.55 if label_correct else 0.0
    route_points = 0.3 if route_correct else 0.0
    per_email_score = _clip_score(
        label_points + route_points + 0.15 * summary_score - noise_penalty
    )
    effective_weight = min(priority_weight, 2.0)

    breakdown = {
        "label_match": 1.0 if label_correct else 0.0,
        "route_match": 1.0 if route_correct else 0.0,
        "summary_match": round(summary_score, 4),
        "priority_weight": round(priority_weight, 4),
        "route_noise_penalty": round(noise_penalty, 4),
    }
    return RewardResult(
        score=_clip_score(per_email_score * effective_weight),
        breakdown=breakdown,
        feedback="Medium-task step grading completed.",
    )
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
def grade_medium(actions: list[TriageAction], ground_truths: list[dict]) -> RewardResult:
    """Grade medium task using weighted per-email partial scoring.

    The result is a priority-weighted mean of the per-email scores:
    ``sum(score_i * weight_i) / sum(weight_i)``, with every weight floored
    at 0.1 and capped at 2.0. Only the first ``min(len(actions),
    len(ground_truths))`` pairs are graded.

    Args:
        actions: Agent actions for the medium task email queue.
        ground_truths: Expected action details for each email.

    Returns:
        Deterministic reward result in [0.0, 1.0].
    """
    comparable_count = min(len(actions), len(ground_truths))
    if comparable_count == 0:
        return RewardResult(
            score=0.0,
            breakdown={"emails_scored": 0.0, "weighted_average": 0.0},
            feedback="No actions available for grading.",
        )

    weighted_score_sum = 0.0
    weight_sum = 0.0
    label_hits = 0
    route_hits = 0
    summary_total = 0.0
    noise_penalty_total = 0.0

    for action, truth in zip(actions, ground_truths):
        # Grade with a neutral weight to obtain the unweighted per-email
        # score. The step grader clips its weighted score to 1.0, so summing
        # those and dividing by the raw weights would cap a perfect answer
        # on a weight-2.0 email at 0.5.
        step_result = grade_medium_step(action, {**truth, "priority_weight": 1.0})
        weight = min(max(float(truth.get("priority_weight", 1.0)), 0.1), 2.0)

        weighted_score_sum += step_result.score * weight
        weight_sum += weight

        label_hits += 1 if step_result.breakdown.get("label_match", 0.0) > 0 else 0
        route_hits += 1 if step_result.breakdown.get("route_match", 0.0) > 0 else 0
        summary_total += float(step_result.breakdown.get("summary_match", 0.0))
        noise_penalty_total += float(step_result.breakdown.get("route_noise_penalty", 0.0))

    weighted_average = weighted_score_sum / weight_sum if weight_sum > 0.0 else 0.0
    score_value = _clip_score(weighted_average)

    breakdown = {
        "emails_scored": float(comparable_count),
        "label_accuracy": label_hits / comparable_count,
        "route_accuracy": route_hits / comparable_count,
        "summary_accuracy": summary_total / comparable_count,
        "avg_route_noise_penalty": noise_penalty_total / comparable_count,
        "weighted_average": score_value,
    }
    feedback = "Weighted medium-task grading completed."
    return RewardResult(score=score_value, breakdown=breakdown, feedback=feedback)
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
def grade_hard(action: TriageAction, ground_truth: dict) -> RewardResult:
    """Score a hard-task action from weighted policy components.

    Components: 0.35 for including the primary route, 0.25 for including
    the secondary (cc) route, 0.25 for the expected label and up to 0.15
    for summary quality. The route-noise penalty is subtracted, and a
    "spam" label additionally costs the configured spam penalty. Missing
    ground-truth fields default to label "urgent", route "safety",
    cc route "billing" and spam penalty 0.2.

    Args:
        action: Agent action for the hard-task email.
        ground_truth: Expected label, routes, spam penalty and keywords.

    Returns:
        RewardResult whose score lies in [0.0, 1.0].
    """
    wanted_label = _normalized_text(str(ground_truth.get("label", "urgent")))
    primary_route = _normalized_text(str(ground_truth.get("route_to", "safety")))
    secondary_route = _normalized_text(str(ground_truth.get("cc_route", "billing")))
    spam_penalty = float(ground_truth.get("penalize_spam", 0.2))

    route_text = _normalized_text(action.route_to)
    label_text = _normalized_text(action.label)
    labelled_spam = label_text == "spam"

    escalation_component = 0.35 if _route_matches(route_text, primary_route) else 0.0
    routing_component = 0.25 if _route_matches(route_text, secondary_route) else 0.0
    urgency_component = 0.25 if label_text == wanted_label else 0.0
    summary_component = 0.15 * _summary_keyword_score(action.summary, ground_truth)
    noise_penalty = _route_noise_penalty(action.route_to)

    raw_score = escalation_component + routing_component + urgency_component + summary_component
    raw_score -= noise_penalty
    if labelled_spam:
        raw_score -= spam_penalty

    return RewardResult(
        score=_clip_score(raw_score),
        breakdown={
            "escalation_component": escalation_component,
            "routing_component": routing_component,
            "urgency_component": urgency_component,
            "summary_component": round(summary_component, 4),
            "route_noise_penalty": round(noise_penalty, 4),
            "spam_penalty": spam_penalty if labelled_spam else 0.0,
        },
        feedback="Hard-task weighted policy grading completed.",
    )
|
inference.py
ADDED
|
@@ -0,0 +1,377 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Inference script for OpenEnv email triage with strict stdout event format."""
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
import os
|
| 6 |
+
import re
|
| 7 |
+
import time
|
| 8 |
+
from typing import Any
|
| 9 |
+
|
| 10 |
+
from openai import OpenAI
|
| 11 |
+
|
| 12 |
+
from environment import EmailTriageEnv
|
| 13 |
+
from models import EmailObservation, TriageAction
|
| 14 |
+
|
| 15 |
+
# --- Endpoint and credentials (all overridable through the environment) ---
API_BASE_URL = os.environ.get("API_BASE_URL", "https://router.huggingface.co/v1")
MODEL_NAME = os.environ.get("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
HF_TOKEN = os.environ.get("HF_TOKEN")
# HF_TOKEN wins when both HF_TOKEN and API_KEY are set.
API_KEY = HF_TOKEN or os.environ.get("API_KEY")
LOCAL_IMAGE_NAME = os.environ.get("LOCAL_IMAGE_NAME")

# --- Episode and sampling settings ---
BENCHMARK = "openenv-email-triage"
MAX_STEPS = 30
TEMPERATURE = 0.2
MAX_TOKENS = 200
# An episode counts as successful when its mean reward reaches this value.
SUCCESS_SCORE_THRESHOLD = 0.5
DEFAULT_RUNTIME_BUDGET_SECONDS = int(
    os.environ.get("INFERENCE_RUNTIME_BUDGET_SECONDS", "1140")
)
DEFAULT_REQUEST_TIMEOUT_SECONDS = float(
    os.environ.get("INFERENCE_REQUEST_TIMEOUT_SECONDS", "12")
)

# System message sent with every chat completion request.
SYSTEM_PROMPT = (
    "You are an email triage assistant. For each email, prioritize risk/time impact, "
    "categorize with one label (urgent|normal|spam|archive), route to the best team, "
    "and summarize the key evidence. Return one JSON object with keys label, summary, route_to."
)

# Defaults used for any action field the model response does not supply.
FALLBACK_ACTION = {
    "label": "normal",
    "summary": "Unable to parse response",
    "route_to": "general",
}

# Maps the --task command-line choice to the environment task identifier.
TASK_MAP = {
    "1": "task_easy",
    "2": "task_medium",
    "3": "task_hard",
    "4": "task_production",
}
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def parse_args() -> argparse.Namespace:
    """Build the command-line interface and parse ``sys.argv``.

    Returns:
        The parsed namespace with ``task``, ``model``, ``split``,
        ``episodes_per_task``, ``runtime_budget_seconds``,
        ``request_timeout_seconds``, ``production_profile``,
        ``business_hours_mode`` and ``escalation_mode`` attributes.
    """
    # Options are registered in this order, so --help lists them the same way.
    option_table = [
        (
            "--task",
            {
                "default": "all",
                "choices": ["1", "2", "3", "4", "all"],
                "help": "Task selection: 1, 2, 3, 4, or all.",
            },
        ),
        (
            "--model",
            {
                "default": None,
                "help": "Optional model override. Falls back to MODEL_NAME environment variable.",
            },
        ),
        (
            "--split",
            {
                "default": os.getenv("OPENENV_EVAL_SPLIT", "public"),
                "choices": ["public", "private_eval"],
                "help": "Scenario split to evaluate.",
            },
        ),
        (
            "--episodes-per-task",
            {
                "default": 1,
                "type": int,
                "help": "Number of deterministic scenarios to evaluate per task.",
            },
        ),
        (
            "--runtime-budget-seconds",
            {
                "default": DEFAULT_RUNTIME_BUDGET_SECONDS,
                "type": int,
                "help": "Global wall-clock budget for the full script run.",
            },
        ),
        (
            "--request-timeout-seconds",
            {
                "default": DEFAULT_REQUEST_TIMEOUT_SECONDS,
                "type": float,
                "help": "Timeout per LLM request.",
            },
        ),
        (
            "--production-profile",
            {
                "default": "standard",
                "choices": ["light", "standard", "heavy"],
                "help": "Runtime workload profile used for task 4 episodes.",
            },
        ),
        (
            "--business-hours-mode",
            {
                "action": "store_true",
                "help": "If set, task 4 timestamps focus on business-hours windows.",
            },
        ),
        (
            "--escalation-mode",
            {
                "default": "normal",
                "choices": ["low", "normal", "high"],
                "help": "Escalation strictness for task 4 follow-up generation.",
            },
        ),
    ]

    cli = argparse.ArgumentParser(description="Run OpenEnv email triage inference.")
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def validate_runtime_config(model_name: str | None) -> str:
    """Check that credentials exist and resolve which model to call.

    Args:
        model_name: Optional override, typically the ``--model`` CLI value.

    Returns:
        ``model_name`` when it is truthy, otherwise the module-level
        ``MODEL_NAME``.

    Raises:
        ValueError: If neither HF_TOKEN nor API_KEY produced an API key.
    """
    if API_KEY:
        return model_name if model_name else MODEL_NAME
    raise ValueError("Missing HF_TOKEN or API_KEY environment variable.")
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def log_start(task_name: str, benchmark_name: str, model_name: str) -> None:
    """Print the single-line ``[START]`` event for an episode to stdout.

    Args:
        task_name: Task identifier written as ``task=``.
        benchmark_name: Benchmark name written as ``env=``.
        model_name: Model identifier written as ``model=``.
    """
    fields = (
        f"task={task_name}",
        f"env={benchmark_name}",
        f"model={model_name}",
    )
    # Flush immediately so the event is visible even if the process dies later.
    print("[START] " + " ".join(fields), flush=True)
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def log_step(step: int, action_str: str, reward: float, done: bool, error: str | None) -> None:
|
| 125 |
+
"""Emit mandatory STEP line."""
|
| 126 |
+
error_value = error if error else "null"
|
| 127 |
+
done_value = str(done).lower()
|
| 128 |
+
print(
|
| 129 |
+
f"[STEP] step={step} action={action_str} reward={reward:.2f} "
|
| 130 |
+
f"done={done_value} error={error_value}",
|
| 131 |
+
flush=True,
|
| 132 |
+
)
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def log_end(success: bool, steps: int, rewards: list[float]) -> None:
    """Print the single-line ``[END]`` event that closes an episode.

    Args:
        success: Episode outcome, printed lowercase.
        steps: Number of steps that were executed.
        rewards: Per-step rewards, printed comma-separated with two decimals
            (an empty list prints nothing after ``rewards=``).
    """
    formatted_rewards = [f"{value:.2f}" for value in rewards]
    outcome = str(success).lower()
    line = f"[END] success={outcome} steps={steps} rewards=" + ",".join(formatted_rewards)
    print(line, flush=True)
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def build_user_prompt(observation: EmailObservation, history: list[str]) -> str:
    """Render the user message for one triage step.

    Args:
        observation: Current email observation; every field is written out
            as a ``name: value`` line.
        history: Step summaries from earlier in the episode. Only the last
            five entries are included; an empty history is rendered as
            ``None``.

    Returns:
        The prompt text, ending with the JSON-format instruction.
    """
    if history:
        recent_history = "\n".join(history[-5:])
    else:
        recent_history = "None"

    field_lines = [
        f"email_id: {observation.email_id}",
        f"subject: {observation.subject}",
        f"sender: {observation.sender}",
        f"timestamp: {observation.timestamp}",
        f"body: {observation.body}",
        f"thread_history: {observation.thread_history}",
        f"task_id: {observation.task_id}",
        f"step_number: {observation.step_number}",
        f"total_emails: {observation.total_emails}",
    ]
    # Sections are separated by one blank line.
    sections = [
        "\n".join(field_lines),
        f"recent_history:\n{recent_history}",
        "Return exactly one JSON object with label, summary, route_to.",
    ]
    return "\n\n".join(sections)
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
def strip_action_prefixes(response_text: str) -> str:
    """Strip Markdown code fences and an ``action:`` lead-in from model output.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        The text with surrounding whitespace, a leading ```` ``` ```` /
        ```` ```json ```` fence, a trailing ```` ``` ```` fence and a leading
        ``action:`` / ``next action:`` label removed.
    """
    # (pattern, flags) pairs applied in order; the text is re-trimmed after each.
    removal_rules = (
        (r"^```(?:json)?", re.IGNORECASE),
        (r"```$", 0),
        (r"^(next\s+action|action)\s*:\s*", re.IGNORECASE),
    )
    text = response_text.strip()
    for pattern, flags in removal_rules:
        text = re.sub(pattern, "", text, flags=flags).strip()
    return text
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
def parse_text_action(cleaned_text: str) -> dict[str, str]:
    """Extract action fields from free-form text using regular expressions.

    This is the fallback used when the model output is not valid JSON. Each
    field is searched for independently as ``name: value`` or ``name=value``
    (the name and the value may be double-quoted).

    Args:
        cleaned_text: Model output after wrapper removal.

    Returns:
        A dict containing only the fields that were found with a non-empty
        value, among ``label`` (lowercased), ``route_to`` (trimmed and
        lowercased) and ``summary`` (trimmed).
    """
    # (field name, pattern, normalizer) applied in this order.
    field_rules = (
        (
            "label",
            r"(?:\"label\"|label)\s*[:=]\s*\"?(urgent|normal|spam|archive)\"?",
            lambda value: value.lower(),
        ),
        (
            "route_to",
            r"(?:\"route_to\"|route_to|route)\s*[:=]\s*\"?([a-zA-Z0-9_\-/ ]+)\"?",
            lambda value: value.strip().lower(),
        ),
        (
            "summary",
            r"(?:\"summary\"|summary)\s*[:=]\s*\"?([^\"\n]+)\"?",
            lambda value: value.strip(),
        ),
    )

    result: dict[str, str] = {}
    for field_name, pattern, normalize in field_rules:
        match = re.search(pattern, cleaned_text, flags=re.IGNORECASE)
        if match is None:
            continue
        value = normalize(match.group(1))
        # The capture groups accept spaces, so the regex can backtrack into a
        # whitespace-only match (e.g. "route_to:   ,"). Dropping the empty
        # result keeps it from overriding the caller's fallback value.
        if value:
            result[field_name] = value
    return result
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
def parse_action_response(response_text: str) -> TriageAction:
    """Turn raw model output into a valid ``TriageAction``.

    Parsing strategy:
      1. Strip formatting wrappers.
      2. Try to decode the outermost ``{...}`` span as a JSON object.
      3. If that yields nothing, fall back to regex extraction.
      4. Fill missing fields from ``FALLBACK_ACTION`` and validate.

    Args:
        response_text: Raw text returned by the model (may be empty).

    Returns:
        The validated action. If validation still fails, the action built
        from ``FALLBACK_ACTION`` alone is returned.
    """
    cleaned_text = strip_action_prefixes(response_text)
    parsed_payload: dict[str, Any] = {}

    json_start = cleaned_text.find("{")
    json_end = cleaned_text.rfind("}")
    if -1 < json_start < json_end:
        try:
            loaded = json.loads(cleaned_text[json_start : json_end + 1])
        except json.JSONDecodeError:
            loaded = None
        if isinstance(loaded, dict):
            parsed_payload = loaded

    if not parsed_payload:
        parsed_payload = parse_text_action(cleaned_text)

    merged = dict(FALLBACK_ACTION)
    for key, value in parsed_payload.items():
        # A JSON null must not replace a usable fallback default.
        if value is not None:
            merged[key] = value

    # Models often answer "Urgent" or " spam "; normalize so a casing or
    # padding difference does not discard an otherwise valid action.
    label = merged.get("label")
    if isinstance(label, str):
        merged["label"] = label.strip().lower()

    try:
        return TriageAction.model_validate(merged)
    except Exception:
        # Deliberately broad: any validation problem falls back to the default
        # action so the episode can continue.
        return TriageAction.model_validate(FALLBACK_ACTION)
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
def action_to_log_string(action: TriageAction) -> str:
    """Serialize an action as compact, ASCII-only JSON on a single line.

    Args:
        action: The action to serialize.

    Returns:
        JSON text with no whitespace after separators, suitable for the
        ``action=`` field of a STEP event.
    """
    payload = action.model_dump()
    compact_separators = (",", ":")
    return json.dumps(payload, ensure_ascii=True, separators=compact_separators)
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
def _request_completion_text(
    client: OpenAI,
    model_name: str,
    prompt: str,
    deadline: float,
    request_timeout_seconds: float,
) -> str:
    """Request one chat completion and return its text, or "" on any failure."""
    import sys  # local import: only needed for stderr diagnostics

    try:
        # Never wait longer than the remaining global budget, but always allow
        # at least one second so the request has a chance to complete.
        remaining = max(1.0, deadline - time.monotonic())
        timeout_seconds = max(
            1.0,
            min(float(request_timeout_seconds), float(remaining)),
        )
        completion = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
            stream=False,
            timeout=timeout_seconds,
        )
        return completion.choices[0].message.content or ""
    except Exception as error:
        # Best effort: an empty response makes the caller use the fallback
        # action. Report on stderr so stdout keeps its strict event format.
        print(f"[WARN] model request failed: {error!r}", file=sys.stderr, flush=True)
        return ""


def run_episode(
    client: OpenAI,
    model_name: str,
    task_id: str,
    scenario_index: int,
    eval_split: str,
    deadline: float,
    request_timeout_seconds: float,
    runtime_options: dict[str, Any] | None = None,
) -> None:
    """Run one episode and emit strict START/STEP/END lines on stdout.

    Args:
        client: OpenAI-compatible client used for chat completions.
        model_name: Model identifier passed to the completion endpoint.
        task_id: Environment task identifier.
        scenario_index: Scenario index passed to the environment.
        eval_split: Scenario split passed to the environment as ``split``.
        deadline: ``time.monotonic()`` value after which no new step starts.
        request_timeout_seconds: Upper bound for each model request.
        runtime_options: Optional extra options forwarded to the environment.

    The episode is successful when the mean step reward reaches
    ``SUCCESS_SCORE_THRESHOLD``. Failures never propagate: they are reported
    on stderr and the END event is still emitted.
    """
    import sys  # local import: only needed for stderr diagnostics

    rewards: list[float] = []
    steps_taken = 0
    success = False
    env: EmailTriageEnv | None = None

    log_start(task_name=task_id, benchmark_name=BENCHMARK, model_name=model_name)

    try:
        env = EmailTriageEnv(
            task_id=task_id,
            scenario_index=scenario_index,
            split=eval_split,
            runtime_options=runtime_options,
        )
        observation = env.reset().observation
        history: list[str] = []

        for step in range(1, MAX_STEPS + 1):
            # Stop before starting a step once the global budget is used up.
            if time.monotonic() >= deadline:
                break

            prompt = build_user_prompt(observation, history)
            response_text = _request_completion_text(
                client=client,
                model_name=model_name,
                prompt=prompt,
                deadline=deadline,
                request_timeout_seconds=request_timeout_seconds,
            )

            action = parse_action_response(response_text)
            step_result = env.step(action)

            reward = float(step_result.reward)
            done = bool(step_result.done)
            error_raw = step_result.info.get("validation_error")
            error = str(error_raw) if isinstance(error_raw, str) else None

            rewards.append(reward)
            steps_taken = step

            log_step(
                step=step,
                action_str=action_to_log_string(action),
                reward=reward,
                done=done,
                error=error,
            )

            history.append(
                f"step={step} action={action.label}/{action.route_to} reward={reward:.2f}"
            )
            observation = step_result.observation

            if done:
                break

        avg_reward = sum(rewards) / max(len(rewards), 1)
        success = avg_reward >= SUCCESS_SCORE_THRESHOLD
    except Exception as error:
        success = False
        print(
            f"[WARN] episode aborted for task={task_id}: {error!r}",
            file=sys.stderr,
            flush=True,
        )
    finally:
        if env is not None:
            # Not every environment object is guaranteed to expose close().
            close_method = getattr(env, "close", None)
            if callable(close_method):
                try:
                    close_method()
                except Exception as error:
                    # Cleanup is best effort and must not prevent the END event.
                    print(
                        f"[WARN] env.close() failed: {error!r}",
                        file=sys.stderr,
                        flush=True,
                    )

        log_end(success=success, steps=steps_taken, rewards=rewards)
|
| 333 |
+
|
| 334 |
+
|
| 335 |
+
def main() -> None:
    """Parse the CLI, build the client and run every selected episode.

    Exits with status 1 (after printing the reason) when no API key is
    configured.
    """
    args = parse_args()
    # One wall-clock deadline is shared by all episodes of this run.
    deadline = time.monotonic() + max(args.runtime_budget_seconds, 1)
    request_timeout_seconds = max(float(args.request_timeout_seconds), 1.0)

    try:
        effective_model = validate_runtime_config(args.model)
    except ValueError as error:
        print(str(error), flush=True)
        raise SystemExit(1) from error

    # Referenced without being used; kept as in the original script.
    _ = LOCAL_IMAGE_NAME

    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

    # "all" is not a TASK_MAP key, so it selects every task.
    selected_task = TASK_MAP.get(args.task)
    task_ids = list(TASK_MAP.values()) if selected_task is None else [selected_task]
    episode_count = max(args.episodes_per_task, 1)

    for task_id in task_ids:
        # Only the production task receives the workload options.
        if task_id == "task_production":
            runtime_options = {
                "production_profile": args.production_profile,
                "business_hours_mode": args.business_hours_mode,
                "escalation_mode": args.escalation_mode,
            }
        else:
            runtime_options = None

        for scenario_index in range(episode_count):
            run_episode(
                client=client,
                model_name=effective_model,
                task_id=task_id,
                scenario_index=scenario_index,
                eval_split=args.split,
                deadline=deadline,
                request_timeout_seconds=request_timeout_seconds,
                runtime_options=runtime_options,
            )


if __name__ == "__main__":
    main()
|
models.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Data models for the OpenEnv email triage environment."""
|
| 2 |
+
|
| 3 |
+
from typing import Literal
|
| 4 |
+
|
| 5 |
+
from pydantic import BaseModel
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class EmailObservation(BaseModel):
    """Represents the email context visible to the agent at each step."""

    # Email content. inference.py renders every field of this model into the
    # model prompt as "name: value".
    email_id: str
    subject: str
    body: str
    sender: str
    timestamp: str  # kept as text; format not visible here (TODO confirm)
    thread_history: list[str]

    # Episode bookkeeping.
    task_id: str
    step_number: int  # presumably the position within the episode; verify in environment.py
    total_emails: int
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class TriageAction(BaseModel):
    """Represents the action chosen by the agent for an email."""

    # Closed set of labels; any other value fails validation.
    label: Literal["urgent", "normal", "spam", "archive"]
    # Free-text summary of the email.
    summary: str
    # Free-text routing target (not restricted to a fixed set by this model).
    route_to: str
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class RewardResult(BaseModel):
    """Represents deterministic grading output before reward shaping."""

    # Final score for the graded action.
    score: float
    # Named score components and penalties that make up the score.
    breakdown: dict[str, float]
    # Human-readable note about the grading.
    feedback: str
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class EnvironmentState(BaseModel):
    """Represents full internal environment state for debugging and evaluation."""

    task_id: str
    # Progress counters. NOTE(review): whether current_step is 0- or 1-based
    # is not visible here; confirm in environment.py.
    current_step: int
    total_steps: int
    done: bool
    # Actions taken and rewards received so far.
    action_history: list[TriageAction]
    reward_history: list[float]
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
class StepResult(BaseModel):
    """Represents the standardized output of environment step calls."""

    # Observation the agent acts on next.
    observation: EmailObservation
    # Reward for the action that was just taken.
    reward: float
    # True once the episode has finished.
    done: bool
    # Scalar metadata; inference.py reads the optional "validation_error" key.
    info: dict[str, str | int | float | bool]
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
class ResetResult(BaseModel):
    """Represents the standardized output of environment reset calls."""

    # First observation of the new episode.
    observation: EmailObservation
    # Scalar metadata about the reset.
    info: dict[str, str | int | float | bool]
|
openenv.yaml
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: email-triage-env
|
| 2 |
+
version: 0.1.0
|
| 3 |
+
tasks:
|
| 4 |
+
- task_easy
|
| 5 |
+
- task_medium
|
| 6 |
+
- task_hard
|
| 7 |
+
- task_production
|
| 8 |
+
observation_space:
|
| 9 |
+
email_id: str
|
| 10 |
+
subject: str
|
| 11 |
+
body: str
|
| 12 |
+
sender: str
|
| 13 |
+
timestamp: str
|
| 14 |
+
thread_history: list[str]
|
| 15 |
+
task_id: str
|
| 16 |
+
step_number: int
|
| 17 |
+
total_emails: int
|
| 18 |
+
action_space:
|
| 19 |
+
label: literal[urgent, normal, spam, archive]
|
| 20 |
+
summary: str
|
| 21 |
+
route_to: str
|
| 22 |
+
entrypoint: environment:EmailTriageEnv
|
| 23 |
+
tags:
|
| 24 |
+
- openenv
|
| 25 |
+
- real-world
|
| 26 |
+
- nlp
|
| 27 |
+
- email-triage
|
pyproject.toml
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["setuptools>=68"]
|
| 3 |
+
build-backend = "setuptools.build_meta"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "email-triage-env"
|
| 7 |
+
version = "0.1.0"
|
| 8 |
+
description = "OpenEnv email triage environment"
|
| 9 |
+
readme = "README.md"
|
| 10 |
+
requires-python = ">=3.11"
|
| 11 |
+
dependencies = [
|
| 12 |
+
"pydantic==2.11.3",
|
| 13 |
+
"flask==3.1.0",
|
| 14 |
+
"openai==1.69.0",
|
| 15 |
+
"gunicorn==22.0.0",
|
| 16 |
+
"openenv-core>=0.2.0",
|
| 17 |
+
]
|
| 18 |
+
|
| 19 |
+
[project.scripts]
|
| 20 |
+
server = "server:main"
|
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pydantic==2.11.3
|
| 2 |
+
flask==3.1.0
|
| 3 |
+
openai==1.69.0
|
| 4 |
+
gunicorn==22.0.0
|
server.py
ADDED
|
@@ -0,0 +1,775 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Flask server wrapper for the OpenEnv email triage environment."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
|
| 5 |
+
from flask import Flask, Response, jsonify, request
|
| 6 |
+
|
| 7 |
+
from environment import EmailTriageEnv
|
| 8 |
+
from tasks import get_task_scenario_count, list_task_ids
|
| 9 |
+
|
| 10 |
+
FRONTEND_HTML = """<!doctype html>
|
| 11 |
+
<html lang="en">
|
| 12 |
+
<head>
|
| 13 |
+
<meta charset="utf-8" />
|
| 14 |
+
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
| 15 |
+
<title>Inbox Helper Practice</title>
|
| 16 |
+
<style>
|
| 17 |
+
@import url('https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@400;600;700&family=IBM+Plex+Mono:wght@400;500&display=swap');
|
| 18 |
+
|
| 19 |
+
:root {
|
| 20 |
+
--bg: #f5f1e9;
|
| 21 |
+
--paper: #fffaf2;
|
| 22 |
+
--ink: #102433;
|
| 23 |
+
--accent: #ea6a2a;
|
| 24 |
+
--accent-soft: #ffd6bf;
|
| 25 |
+
--line: #d7cabb;
|
| 26 |
+
--ok: #0f7b6c;
|
| 27 |
+
--warn: #9a3a12;
|
| 28 |
+
--radius: 14px;
|
| 29 |
+
}
|
| 30 |
+
|
| 31 |
+
* { box-sizing: border-box; }
|
| 32 |
+
|
| 33 |
+
body {
|
| 34 |
+
margin: 0;
|
| 35 |
+
font-family: 'Space Grotesk', sans-serif;
|
| 36 |
+
color: var(--ink);
|
| 37 |
+
background:
|
| 38 |
+
radial-gradient(1100px 460px at -10% -20%, #f2bc9f 0%, transparent 60%),
|
| 39 |
+
radial-gradient(1100px 520px at 120% 115%, #b8d7cf 0%, transparent 62%),
|
| 40 |
+
var(--bg);
|
| 41 |
+
min-height: 100vh;
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
.wrap {
|
| 45 |
+
max-width: 1100px;
|
| 46 |
+
margin: 28px auto;
|
| 47 |
+
padding: 0 16px;
|
| 48 |
+
animation: reveal .45s ease-out;
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
@keyframes reveal {
|
| 52 |
+
from { opacity: 0; transform: translateY(10px); }
|
| 53 |
+
to { opacity: 1; transform: translateY(0); }
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
.title {
|
| 57 |
+
display: flex;
|
| 58 |
+
justify-content: space-between;
|
| 59 |
+
align-items: baseline;
|
| 60 |
+
gap: 14px;
|
| 61 |
+
margin-bottom: 14px;
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
h1 {
|
| 65 |
+
margin: 0;
|
| 66 |
+
font-size: clamp(1.5rem, 2vw, 2.2rem);
|
| 67 |
+
letter-spacing: .4px;
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
.subtitle {
|
| 71 |
+
margin: 6px 0 0;
|
| 72 |
+
font-size: .95rem;
|
| 73 |
+
opacity: .8;
|
| 74 |
+
}
|
| 75 |
+
|
| 76 |
+
.badge {
|
| 77 |
+
background: var(--accent-soft);
|
| 78 |
+
border: 1px solid #f2b693;
|
| 79 |
+
color: #7f2e0b;
|
| 80 |
+
padding: 6px 10px;
|
| 81 |
+
border-radius: 999px;
|
| 82 |
+
font-size: .85rem;
|
| 83 |
+
font-weight: 600;
|
| 84 |
+
}
|
| 85 |
+
|
| 86 |
+
.grid {
|
| 87 |
+
display: grid;
|
| 88 |
+
grid-template-columns: 1fr;
|
| 89 |
+
gap: 14px;
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
+
@media (min-width: 900px) {
|
| 93 |
+
.grid { grid-template-columns: 1fr 1fr; }
|
| 94 |
+
.wide { grid-column: span 2; }
|
| 95 |
+
}
|
| 96 |
+
|
| 97 |
+
.card {
|
| 98 |
+
background: var(--paper);
|
| 99 |
+
border: 1px solid var(--line);
|
| 100 |
+
border-radius: var(--radius);
|
| 101 |
+
padding: 14px;
|
| 102 |
+
box-shadow: 0 8px 28px rgba(16, 36, 51, 0.08);
|
| 103 |
+
}
|
| 104 |
+
|
| 105 |
+
.card h2 {
|
| 106 |
+
margin: 0 0 10px;
|
| 107 |
+
font-size: 1rem;
|
| 108 |
+
text-transform: uppercase;
|
| 109 |
+
letter-spacing: .08em;
|
| 110 |
+
opacity: .86;
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
.row {
|
| 114 |
+
display: flex;
|
| 115 |
+
flex-wrap: wrap;
|
| 116 |
+
gap: 8px;
|
| 117 |
+
align-items: center;
|
| 118 |
+
margin-bottom: 10px;
|
| 119 |
+
}
|
| 120 |
+
|
| 121 |
+
select, input, textarea, button {
|
| 122 |
+
font-family: inherit;
|
| 123 |
+
font-size: .95rem;
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
select, input, textarea {
|
| 127 |
+
width: 100%;
|
| 128 |
+
border: 1px solid #cdbba6;
|
| 129 |
+
border-radius: 10px;
|
| 130 |
+
padding: 9px 10px;
|
| 131 |
+
background: #fff;
|
| 132 |
+
color: var(--ink);
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
textarea {
|
| 136 |
+
min-height: 92px;
|
| 137 |
+
resize: vertical;
|
| 138 |
+
}
|
| 139 |
+
|
| 140 |
+
button {
|
| 141 |
+
border: 0;
|
| 142 |
+
border-radius: 10px;
|
| 143 |
+
padding: 9px 12px;
|
| 144 |
+
font-weight: 700;
|
| 145 |
+
background: var(--ink);
|
| 146 |
+
color: #fff;
|
| 147 |
+
cursor: pointer;
|
| 148 |
+
transition: transform .12s ease, opacity .12s ease;
|
| 149 |
+
}
|
| 150 |
+
|
| 151 |
+
button.secondary {
|
| 152 |
+
background: #285066;
|
| 153 |
+
}
|
| 154 |
+
|
| 155 |
+
button.accent {
|
| 156 |
+
background: var(--accent);
|
| 157 |
+
}
|
| 158 |
+
|
| 159 |
+
button:hover { transform: translateY(-1px); }
|
| 160 |
+
button:active { transform: translateY(0); opacity: .92; }
|
| 161 |
+
|
| 162 |
+
.status {
|
| 163 |
+
padding: 8px 10px;
|
| 164 |
+
border-radius: 10px;
|
| 165 |
+
background: #eef7f5;
|
| 166 |
+
border: 1px solid #c7e4de;
|
| 167 |
+
color: var(--ok);
|
| 168 |
+
font-weight: 600;
|
| 169 |
+
min-height: 40px;
|
| 170 |
+
display: flex;
|
| 171 |
+
align-items: center;
|
| 172 |
+
}
|
| 173 |
+
|
| 174 |
+
.status.error {
|
| 175 |
+
background: #fff1ea;
|
| 176 |
+
border-color: #ffc8ae;
|
| 177 |
+
color: var(--warn);
|
| 178 |
+
}
|
| 179 |
+
|
| 180 |
+
pre {
|
| 181 |
+
margin: 0;
|
| 182 |
+
white-space: pre-wrap;
|
| 183 |
+
background: #0f1b24;
|
| 184 |
+
color: #d9efe9;
|
| 185 |
+
border-radius: 10px;
|
| 186 |
+
padding: 12px;
|
| 187 |
+
max-height: 340px;
|
| 188 |
+
overflow: auto;
|
| 189 |
+
font-family: 'IBM Plex Mono', monospace;
|
| 190 |
+
font-size: .85rem;
|
| 191 |
+
border: 1px solid #21313f;
|
| 192 |
+
}
|
| 193 |
+
|
| 194 |
+
.email-block {
|
| 195 |
+
background: #fff;
|
| 196 |
+
border: 1px solid #d9ccbc;
|
| 197 |
+
border-radius: 10px;
|
| 198 |
+
padding: 12px;
|
| 199 |
+
}
|
| 200 |
+
|
| 201 |
+
.email-row {
|
| 202 |
+
margin-bottom: 8px;
|
| 203 |
+
font-size: .95rem;
|
| 204 |
+
line-height: 1.35;
|
| 205 |
+
}
|
| 206 |
+
|
| 207 |
+
.email-row strong {
|
| 208 |
+
display: inline-block;
|
| 209 |
+
min-width: 66px;
|
| 210 |
+
}
|
| 211 |
+
|
| 212 |
+
.help {
|
| 213 |
+
margin: 0 0 10px;
|
| 214 |
+
font-size: .9rem;
|
| 215 |
+
opacity: .8;
|
| 216 |
+
}
|
| 217 |
+
|
| 218 |
+
.metric {
|
| 219 |
+
display: flex;
|
| 220 |
+
justify-content: space-between;
|
| 221 |
+
align-items: center;
|
| 222 |
+
margin-bottom: 8px;
|
| 223 |
+
padding-bottom: 6px;
|
| 224 |
+
border-bottom: 1px dashed #dbcfbe;
|
| 225 |
+
font-size: .95rem;
|
| 226 |
+
}
|
| 227 |
+
|
| 228 |
+
.metric strong {
|
| 229 |
+
font-weight: 700;
|
| 230 |
+
}
|
| 231 |
+
|
| 232 |
+
.coach {
|
| 233 |
+
background: #fff7ed;
|
| 234 |
+
border: 1px solid #f2caa9;
|
| 235 |
+
border-radius: 10px;
|
| 236 |
+
padding: 10px;
|
| 237 |
+
min-height: 74px;
|
| 238 |
+
line-height: 1.4;
|
| 239 |
+
font-size: .92rem;
|
| 240 |
+
}
|
| 241 |
+
|
| 242 |
+
.chip-row {
|
| 243 |
+
display: flex;
|
| 244 |
+
flex-wrap: wrap;
|
| 245 |
+
gap: 8px;
|
| 246 |
+
margin-top: 10px;
|
| 247 |
+
}
|
| 248 |
+
|
| 249 |
+
.chip {
|
| 250 |
+
background: #eaf3ff;
|
| 251 |
+
border: 1px solid #b9d1ef;
|
| 252 |
+
color: #184469;
|
| 253 |
+
border-radius: 999px;
|
| 254 |
+
padding: 6px 10px;
|
| 255 |
+
font-size: .84rem;
|
| 256 |
+
cursor: pointer;
|
| 257 |
+
font-weight: 600;
|
| 258 |
+
}
|
| 259 |
+
</style>
|
| 260 |
+
</head>
|
| 261 |
+
<body>
|
| 262 |
+
<div class="wrap">
|
| 263 |
+
<div class="title">
|
| 264 |
+
<div>
|
| 265 |
+
<h1>Inbox Helper Practice</h1>
|
| 266 |
+
<p class="subtitle">Practice deciding priority, category, and who should handle each email.</p>
|
| 267 |
+
</div>
|
| 268 |
+
<span class="badge" id="badge">connecting...</span>
|
| 269 |
+
</div>
|
| 270 |
+
|
| 271 |
+
<div class="grid">
|
| 272 |
+
<section class="card">
|
| 273 |
+
<h2>Start a Scenario</h2>
|
| 274 |
+
<p class="help">Pick a difficulty, then click Start.</p>
|
| 275 |
+
<div class="row">
|
| 276 |
+
<select id="taskId">
|
| 277 |
+
<option value="task_easy">Easy: one clear email</option>
|
| 278 |
+
<option value="task_medium">Medium: mixed inbox</option>
|
| 279 |
+
<option value="task_hard">Hard: high-risk complaint</option>
|
| 280 |
+
<option value="task_production">Production: full inbox simulator</option>
|
| 281 |
+
</select>
|
| 282 |
+
</div>
|
| 283 |
+
<div id="productionControls" style="display:none;">
|
| 284 |
+
<div class="row">
|
| 285 |
+
<select id="productionProfile">
|
| 286 |
+
<option value="light">Workload: Light</option>
|
| 287 |
+
<option value="standard" selected>Workload: Standard</option>
|
| 288 |
+
<option value="heavy">Workload: Heavy</option>
|
| 289 |
+
</select>
|
| 290 |
+
</div>
|
| 291 |
+
<div class="row">
|
| 292 |
+
<select id="businessHoursMode">
|
| 293 |
+
<option value="false" selected>Time Profile: 24x7 inbox</option>
|
| 294 |
+
<option value="true">Time Profile: business hours focus</option>
|
| 295 |
+
</select>
|
| 296 |
+
</div>
|
| 297 |
+
<div class="row">
|
| 298 |
+
<select id="escalationMode">
|
| 299 |
+
<option value="low">Escalation: Low</option>
|
| 300 |
+
<option value="normal" selected>Escalation: Normal</option>
|
| 301 |
+
<option value="high">Escalation: High</option>
|
| 302 |
+
</select>
|
| 303 |
+
</div>
|
| 304 |
+
</div>
|
| 305 |
+
<div class="row">
|
| 306 |
+
<button class="accent" id="btnReset">Start</button>
|
| 307 |
+
<button class="secondary" id="btnState">Check Progress</button>
|
| 308 |
+
</div>
|
| 309 |
+
<div class="status" id="status">Ready. Start a scenario.</div>
|
| 310 |
+
</section>
|
| 311 |
+
|
| 312 |
+
<section class="card">
|
| 313 |
+
<h2>Your Decision</h2>
|
| 314 |
+
<p class="help">Choose priority, who should handle it, and a short reason.</p>
|
| 315 |
+
<div class="row">
|
| 316 |
+
<select id="label">
|
| 317 |
+
<option value="urgent">Urgent</option>
|
| 318 |
+
<option value="normal" selected>Normal</option>
|
| 319 |
+
<option value="spam">Spam</option>
|
| 320 |
+
<option value="archive">Archive</option>
|
| 321 |
+
</select>
|
| 322 |
+
</div>
|
| 323 |
+
<div class="row">
|
| 324 |
+
<input id="routeTo" placeholder="Who should handle this? (billing, safety, engineering, support)" value="general" />
|
| 325 |
+
</div>
|
| 326 |
+
<div class="row">
|
| 327 |
+
<textarea id="summary" placeholder="Write one clear sentence with key clues from the email.">Needs review.</textarea>
|
| 328 |
+
</div>
|
| 329 |
+
<div class="row">
|
| 330 |
+
<button id="btnStep">Send Decision</button>
|
| 331 |
+
</div>
|
| 332 |
+
</section>
|
| 333 |
+
|
| 334 |
+
<section class="card wide">
|
| 335 |
+
<h2>Current Email</h2>
|
| 336 |
+
<div class="email-block">
|
| 337 |
+
<div class="email-row"><strong>Subject:</strong> <span id="mailSubject">No email loaded yet.</span></div>
|
| 338 |
+
<div class="email-row"><strong>From:</strong> <span id="mailSender">-</span></div>
|
| 339 |
+
<div class="email-row"><strong>Message:</strong> <span id="mailBody">Start a scenario to load an email.</span></div>
|
| 340 |
+
</div>
|
| 341 |
+
</section>
|
| 342 |
+
|
| 343 |
+
<section class="card">
|
| 344 |
+
<h2>Live Progress</h2>
|
| 345 |
+
<div class="metric"><span>Task</span><strong id="insightTask">-</strong></div>
|
| 346 |
+
<div class="metric"><span>Scenario</span><strong id="insightScenario">-</strong></div>
|
| 347 |
+
<div class="metric"><span>Progress</span><strong id="insightProgress">0/0</strong></div>
|
| 348 |
+
<div class="metric"><span>Last Reward</span><strong id="insightReward">-</strong></div>
|
| 349 |
+
<div class="metric"><span>Base Score</span><strong id="insightBase">-</strong></div>
|
| 350 |
+
</section>
|
| 351 |
+
|
| 352 |
+
<section class="card">
|
| 353 |
+
<h2>Coach Notes</h2>
|
| 354 |
+
<p class="help">Use this to improve your next triage action.</p>
|
| 355 |
+
<div class="coach" id="coachNotes">Start a scenario and submit one decision to get feedback.</div>
|
| 356 |
+
<div class="chip-row">
|
| 357 |
+
<button class="chip" id="chipSafety">Quick Fill: Urgent + Safety</button>
|
| 358 |
+
<button class="chip" id="chipBilling">Quick Fill: Normal + Billing</button>
|
| 359 |
+
<button class="chip" id="chipSpam">Quick Fill: Spam + General</button>
|
| 360 |
+
</div>
|
| 361 |
+
</section>
|
| 362 |
+
|
| 363 |
+
<section class="card wide">
|
| 364 |
+
<h2>Details (Advanced)</h2>
|
| 365 |
+
<pre id="output">Waiting for your first action...</pre>
|
| 366 |
+
</section>
|
| 367 |
+
</div>
|
| 368 |
+
</div>
|
| 369 |
+
|
| 370 |
+
<script>
|
| 371 |
+
const statusEl = document.getElementById('status');
|
| 372 |
+
const badgeEl = document.getElementById('badge');
|
| 373 |
+
const outEl = document.getElementById('output');
|
| 374 |
+
const mailSubjectEl = document.getElementById('mailSubject');
|
| 375 |
+
const mailSenderEl = document.getElementById('mailSender');
|
| 376 |
+
const mailBodyEl = document.getElementById('mailBody');
|
| 377 |
+
const taskIdEl = document.getElementById('taskId');
|
| 378 |
+
const productionControlsEl = document.getElementById('productionControls');
|
| 379 |
+
const insightTaskEl = document.getElementById('insightTask');
|
| 380 |
+
const insightScenarioEl = document.getElementById('insightScenario');
|
| 381 |
+
const insightProgressEl = document.getElementById('insightProgress');
|
| 382 |
+
const insightRewardEl = document.getElementById('insightReward');
|
| 383 |
+
const insightBaseEl = document.getElementById('insightBase');
|
| 384 |
+
const coachNotesEl = document.getElementById('coachNotes');
|
| 385 |
+
|
| 386 |
+
function setStatus(msg, isError = false) {
|
| 387 |
+
statusEl.textContent = msg;
|
| 388 |
+
statusEl.classList.toggle('error', isError);
|
| 389 |
+
}
|
| 390 |
+
|
| 391 |
+
function writeOutput(value) {
|
| 392 |
+
outEl.textContent = typeof value === 'string' ? value : JSON.stringify(value, null, 2);
|
| 393 |
+
}
|
| 394 |
+
|
| 395 |
+
function updateEmailPanel(data) {
|
| 396 |
+
if (!data || !data.observation) {
|
| 397 |
+
return;
|
| 398 |
+
}
|
| 399 |
+
const obs = data.observation;
|
| 400 |
+
mailSubjectEl.textContent = obs.subject || 'No subject';
|
| 401 |
+
mailSenderEl.textContent = obs.sender || '-';
|
| 402 |
+
mailBodyEl.textContent = obs.body || '';
|
| 403 |
+
}
|
| 404 |
+
|
| 405 |
+
function updateProductionControlsVisibility() {
|
| 406 |
+
const isProduction = taskIdEl.value === 'task_production';
|
| 407 |
+
productionControlsEl.style.display = isProduction ? 'block' : 'none';
|
| 408 |
+
}
|
| 409 |
+
|
| 410 |
+
function safeNumber(value) {
|
| 411 |
+
return typeof value === 'number' && !Number.isNaN(value) ? value : null;
|
| 412 |
+
}
|
| 413 |
+
|
| 414 |
+
function updateInsights(data) {
|
| 415 |
+
const info = (data && data.info) ? data.info : {};
|
| 416 |
+
const taskValue = info.task_id || data.task_id || (data.observation && data.observation.task_id) || '-';
|
| 417 |
+
const scenarioValue = info.scenario_id || '-';
|
| 418 |
+
|
| 419 |
+
insightTaskEl.textContent = taskValue;
|
| 420 |
+
insightScenarioEl.textContent = scenarioValue;
|
| 421 |
+
|
| 422 |
+
const emailsProcessed = safeNumber(info.emails_processed);
|
| 423 |
+
const emailsTotal = safeNumber(info.emails_total);
|
| 424 |
+
if (emailsProcessed !== null && emailsTotal !== null) {
|
| 425 |
+
insightProgressEl.textContent = `${emailsProcessed}/${emailsTotal}`;
|
| 426 |
+
} else if (safeNumber(data.current_step) !== null && safeNumber(data.total_steps) !== null) {
|
| 427 |
+
insightProgressEl.textContent = `${data.current_step}/${data.total_steps}`;
|
| 428 |
+
}
|
| 429 |
+
|
| 430 |
+
const rewardValue = safeNumber(data.reward);
|
| 431 |
+
insightRewardEl.textContent = rewardValue !== null ? rewardValue.toFixed(2) : '-';
|
| 432 |
+
|
| 433 |
+
const baseScoreValue = safeNumber(info.base_score);
|
| 434 |
+
insightBaseEl.textContent = baseScoreValue !== null ? baseScoreValue.toFixed(2) : '-';
|
| 435 |
+
|
| 436 |
+
const tips = [];
|
| 437 |
+
if (info.validation_error) {
|
| 438 |
+
tips.push('Action format is invalid. Keep label/summary/route_to filled correctly.');
|
| 439 |
+
}
|
| 440 |
+
|
| 441 |
+
const routeNoise = safeNumber(info.grade_route_noise_penalty);
|
| 442 |
+
if (routeNoise !== null && routeNoise > 0.01) {
|
| 443 |
+
tips.push('Route to one best owner team. Avoid sending to many teams at once.');
|
| 444 |
+
}
|
| 445 |
+
|
| 446 |
+
const summaryMatch = safeNumber(info.grade_summary_match);
|
| 447 |
+
if (summaryMatch !== null && summaryMatch < 0.6) {
|
| 448 |
+
tips.push('Summary is weak. Include concrete clues from subject/body/thread.');
|
| 449 |
+
}
|
| 450 |
+
|
| 451 |
+
const labelMatch = safeNumber(info.grade_label_match);
|
| 452 |
+
if (labelMatch !== null && labelMatch < 1.0) {
|
| 453 |
+
tips.push('Priority label may be off. Re-check urgency and risk signals.');
|
| 454 |
+
}
|
| 455 |
+
|
| 456 |
+
const routeMatch = safeNumber(info.grade_route_match);
|
| 457 |
+
if (routeMatch !== null && routeMatch < 1.0) {
|
| 458 |
+
tips.push('Routing looks off. Pick the team that directly owns this issue.');
|
| 459 |
+
}
|
| 460 |
+
|
| 461 |
+
const urgencyComponent = safeNumber(info.grade_urgency_component);
|
| 462 |
+
if (urgencyComponent !== null && urgencyComponent < 0.2) {
|
| 463 |
+
tips.push('For high-risk complaints, mark urgent and route to safety first.');
|
| 464 |
+
}
|
| 465 |
+
|
| 466 |
+
if (!tips.length && typeof info.grading_feedback === 'string' && info.grading_feedback) {
|
| 467 |
+
tips.push(info.grading_feedback);
|
| 468 |
+
}
|
| 469 |
+
|
| 470 |
+
coachNotesEl.textContent = tips.length
|
| 471 |
+
? tips.join(' ')
|
| 472 |
+
: 'Looks good. Keep your next route precise and your summary evidence-based.';
|
| 473 |
+
}
|
| 474 |
+
|
| 475 |
+
function prefillAction(label, routeTo, summary) {
|
| 476 |
+
document.getElementById('label').value = label;
|
| 477 |
+
document.getElementById('routeTo').value = routeTo;
|
| 478 |
+
document.getElementById('summary').value = summary;
|
| 479 |
+
}
|
| 480 |
+
|
| 481 |
+
async function postJson(path, payload) {
|
| 482 |
+
const response = await fetch(path, {
|
| 483 |
+
method: 'POST',
|
| 484 |
+
headers: { 'Content-Type': 'application/json' },
|
| 485 |
+
body: JSON.stringify(payload || {}),
|
| 486 |
+
});
|
| 487 |
+
const text = await response.text();
|
| 488 |
+
let data = text;
|
| 489 |
+
try { data = JSON.parse(text); } catch (e) {}
|
| 490 |
+
if (!response.ok) {
|
| 491 |
+
throw new Error('HTTP ' + response.status + ' - ' + text);
|
| 492 |
+
}
|
| 493 |
+
return data;
|
| 494 |
+
}
|
| 495 |
+
|
| 496 |
+
async function warmup() {
|
| 497 |
+
try {
|
| 498 |
+
const res = await fetch('/meta');
|
| 499 |
+
const data = await res.json();
|
| 500 |
+
badgeEl.textContent = data.status === 'ok' ? 'ready' : 'check service';
|
| 501 |
+
} catch (e) {
|
| 502 |
+
badgeEl.textContent = 'offline';
|
| 503 |
+
}
|
| 504 |
+
}
|
| 505 |
+
|
| 506 |
+
document.getElementById('btnReset').addEventListener('click', async () => {
|
| 507 |
+
const taskId = taskIdEl.value;
|
| 508 |
+
setStatus('Starting a new scenario...');
|
| 509 |
+
try {
|
| 510 |
+
const payload = { task_id: taskId };
|
| 511 |
+
if (taskId === 'task_production') {
|
| 512 |
+
payload.production_profile = document.getElementById('productionProfile').value;
|
| 513 |
+
payload.business_hours_mode = document.getElementById('businessHoursMode').value === 'true';
|
| 514 |
+
payload.escalation_mode = document.getElementById('escalationMode').value;
|
| 515 |
+
}
|
| 516 |
+
const data = await postJson('/reset', payload);
|
| 517 |
+
setStatus('Scenario started. Read the email below.');
|
| 518 |
+
updateEmailPanel(data);
|
| 519 |
+
updateInsights(data);
|
| 520 |
+
writeOutput(data);
|
| 521 |
+
} catch (e) {
|
| 522 |
+
setStatus('Could not start scenario. See details below.', true);
|
| 523 |
+
writeOutput(String(e));
|
| 524 |
+
}
|
| 525 |
+
});
|
| 526 |
+
|
| 527 |
+
document.getElementById('btnState').addEventListener('click', async () => {
|
| 528 |
+
setStatus('Checking progress...');
|
| 529 |
+
try {
|
| 530 |
+
const data = await postJson('/state', {});
|
| 531 |
+
setStatus('Progress updated.');
|
| 532 |
+
updateInsights(data);
|
| 533 |
+
writeOutput(data);
|
| 534 |
+
} catch (e) {
|
| 535 |
+
setStatus('Could not fetch progress. See details below.', true);
|
| 536 |
+
writeOutput(String(e));
|
| 537 |
+
}
|
| 538 |
+
});
|
| 539 |
+
|
| 540 |
+
document.getElementById('btnStep').addEventListener('click', async () => {
|
| 541 |
+
const payload = {
|
| 542 |
+
label: document.getElementById('label').value,
|
| 543 |
+
summary: document.getElementById('summary').value,
|
| 544 |
+
route_to: document.getElementById('routeTo').value,
|
| 545 |
+
};
|
| 546 |
+
setStatus('Sending your decision...');
|
| 547 |
+
try {
|
| 548 |
+
const data = await postJson('/step', payload);
|
| 549 |
+
setStatus('Decision saved.');
|
| 550 |
+
updateEmailPanel(data);
|
| 551 |
+
updateInsights(data);
|
| 552 |
+
writeOutput(data);
|
| 553 |
+
} catch (e) {
|
| 554 |
+
setStatus('Could not submit decision. See details below.', true);
|
| 555 |
+
writeOutput(String(e));
|
| 556 |
+
}
|
| 557 |
+
});
|
| 558 |
+
|
| 559 |
+
document.getElementById('chipSafety').addEventListener('click', () => {
|
| 560 |
+
prefillAction('urgent', 'safety', 'Potential safety risk with immediate escalation needed.');
|
| 561 |
+
});
|
| 562 |
+
|
| 563 |
+
document.getElementById('chipBilling').addEventListener('click', () => {
|
| 564 |
+
prefillAction('normal', 'billing', 'Customer billing issue needs finance team review and response.');
|
| 565 |
+
});
|
| 566 |
+
|
| 567 |
+
document.getElementById('chipSpam').addEventListener('click', () => {
|
| 568 |
+
prefillAction('spam', 'general', 'Likely phishing or irrelevant message with suspicious external request.');
|
| 569 |
+
});
|
| 570 |
+
|
| 571 |
+
taskIdEl.addEventListener('change', updateProductionControlsVisibility);
|
| 572 |
+
|
| 573 |
+
updateProductionControlsVisibility();
|
| 574 |
+
warmup();
|
| 575 |
+
</script>
|
| 576 |
+
</body>
|
| 577 |
+
</html>
|
| 578 |
+
"""
|
| 579 |
+
|
| 580 |
+
# Flask application; the route decorators further down register against it.
app = Flask(__name__)

# Environment instance served by /step and /state; /reset swaps in a new one.
current_env = EmailTriageEnv(task_id="task_easy")

# Per-task index of the next scenario to serve when the client does not pick one.
SCENARIO_COUNTERS = dict.fromkeys(list_task_ids(), 0)

# Scenario split used by /reset unless a client override is allowed and sent.
DEFAULT_EVAL_SPLIT = os.environ.get("OPENENV_EVAL_SPLIT", "public")

# Overrides are enabled only when the variable is "true" (any case, trimmed).
ALLOW_CLIENT_EVAL_OVERRIDE = "true" == (
    os.environ.get("OPENENV_ALLOW_CLIENT_EVAL_OVERRIDE", "false").strip().lower()
)
|
| 587 |
+
|
| 588 |
+
|
| 589 |
+
@app.get("/")
def root_page():
    """Serve the embedded FRONTEND_HTML document as a text/html response."""
    page = Response(FRONTEND_HTML, mimetype="text/html")
    return page
|
| 593 |
+
|
| 594 |
+
|
| 595 |
+
@app.get("/meta")
def root_endpoint():
    """Return service metadata for health checks and machine clients.

    The JSON body lists the service name and status, the three POST
    endpoints, the number of public scenarios per task, the server's
    default eval split, and the values accepted by the production-only
    runtime controls of /reset.
    """
    endpoints = {
        "reset": {"method": "POST", "path": "/reset"},
        "step": {"method": "POST", "path": "/step"},
        "state": {"method": "POST", "path": "/state"},
    }

    # Only the public split's pool sizes are advertised here.
    public_pool = {}
    for task_id in list_task_ids():
        public_pool[task_id] = get_task_scenario_count(task_id, "public")

    runtime_controls = {
        "production_profile": ["light", "standard", "heavy"],
        "business_hours_mode": [True, False],
        "escalation_mode": ["low", "normal", "high"],
    }

    metadata = {
        "name": "email-triage-env",
        "status": "ok",
        "endpoints": endpoints,
        "scenario_pools": {"public": public_pool},
        "eval_split": DEFAULT_EVAL_SPLIT,
        "production_runtime_controls": runtime_controls,
    }
    return jsonify(metadata)
|
| 621 |
+
|
| 622 |
+
|
| 623 |
+
@app.post("/reset")
def reset_endpoint():
    """Reset the environment with a selected task and return ResetResult JSON.

    The optional JSON object body may carry:

    * ``task_id`` (str, default ``"task_easy"``).
    * For ``task_production`` only: ``production_profile``
      (light/standard/heavy, default ``"standard"``), ``escalation_mode``
      (low/normal/high, default ``"normal"``) and ``business_hours_mode``
      (bool, or a string where 1/true/yes/on mean true; default ``False``).
    * ``eval_split`` (str) and ``scenario_index`` (non-negative int). Both
      are rejected unless ``ALLOW_CLIENT_EVAL_OVERRIDE`` is enabled.

    A missing or unparseable body is treated as ``{}``. When no
    ``scenario_index`` is supplied, the per-task entry in
    ``SCENARIO_COUNTERS`` selects the scenario and is then advanced.

    Returns:
        Flask response containing reset payload, or a 400 JSON error when a
        field fails validation or a ``KeyError`` is raised while preparing
        the environment.
    """
    global current_env

    payload = request.get_json(silent=True)
    if payload is None:
        payload = {}
    elif not isinstance(payload, dict):
        return jsonify({"error": "Malformed JSON payload."}), 400

    task_id = payload.get("task_id", "task_easy")
    if not isinstance(task_id, str):
        return jsonify({"error": "Field 'task_id' must be a string."}), 400

    # Runtime options are only read and validated for the production task.
    runtime_options: dict[str, object] = {}
    if task_id == "task_production":
        production_profile = payload.get("production_profile", "standard")
        if not isinstance(production_profile, str) or production_profile not in {
            "light",
            "standard",
            "heavy",
        }:
            return (
                jsonify(
                    {
                        "error": (
                            "Field 'production_profile' must be one of "
                            "light/standard/heavy."
                        )
                    }
                ),
                400,
            )

        escalation_mode = payload.get("escalation_mode", "normal")
        if not isinstance(escalation_mode, str) or escalation_mode not in {
            "low",
            "normal",
            "high",
        }:
            return (
                jsonify(
                    {
                        "error": (
                            "Field 'escalation_mode' must be one of "
                            "low/normal/high."
                        )
                    }
                ),
                400,
            )

        business_hours_mode = payload.get("business_hours_mode", False)
        if isinstance(business_hours_mode, str):
            # Any string outside the truthy set is interpreted as False.
            business_hours_mode = business_hours_mode.strip().lower() in {
                "1",
                "true",
                "yes",
                "on",
            }
        elif not isinstance(business_hours_mode, bool):
            return jsonify({"error": "Field 'business_hours_mode' must be boolean."}), 400

        runtime_options = {
            "production_profile": production_profile,
            "business_hours_mode": business_hours_mode,
            "escalation_mode": escalation_mode,
        }

    if not ALLOW_CLIENT_EVAL_OVERRIDE and (
        "eval_split" in payload or "scenario_index" in payload
    ):
        return jsonify(
            {
                "error": (
                    "Client overrides for eval_split/scenario_index are disabled "
                    "by server policy."
                )
            }
        ), 400

    eval_split = DEFAULT_EVAL_SPLIT
    if ALLOW_CLIENT_EVAL_OVERRIDE:
        requested_split = payload.get("eval_split", DEFAULT_EVAL_SPLIT)
        if not isinstance(requested_split, str):
            return jsonify({"error": "Field 'eval_split' must be a string."}), 400
        eval_split = requested_split

    requested_index = payload.get("scenario_index") if ALLOW_CLIENT_EVAL_OVERRIDE else None
    # bool is a subclass of int, so JSON true/false must be rejected explicitly
    # or it would be accepted as index 1/0.
    if requested_index is not None and (
        isinstance(requested_index, bool)
        or not isinstance(requested_index, int)
        or requested_index < 0
    ):
        return jsonify({"error": "Field 'scenario_index' must be a non-negative integer."}), 400

    try:
        scenario_count = get_task_scenario_count(task_id, eval_split)
        if requested_index is None:
            scenario_index = SCENARIO_COUNTERS.get(task_id, 0)
            if scenario_count > 0:
                # The counter is keyed by task only, while the pool size is
                # per split; wrap it so a value advanced under a larger
                # split stays inside this split's pool.
                scenario_index %= scenario_count
                SCENARIO_COUNTERS[task_id] = (scenario_index + 1) % scenario_count
        else:
            scenario_index = requested_index

        current_env = EmailTriageEnv(
            task_id=task_id,
            scenario_index=scenario_index,
            split=eval_split,
            runtime_options=runtime_options,
        )
        reset_result = current_env.reset()
    except KeyError as error:
        return jsonify({"error": str(error)}), 400

    return jsonify(reset_result.model_dump())
|
| 741 |
+
|
| 742 |
+
|
| 743 |
+
@app.post("/step")
def step_endpoint():
    """Advance environment by one action and return StepResult JSON.

    The parsed JSON body is handed to ``current_env.step`` unchanged; a
    body that is missing or cannot be parsed yields a 400 error instead.

    Returns:
        Flask response containing step payload.
    """
    action = request.get_json(silent=True)
    if action is not None:
        outcome = current_env.step(action)
        return jsonify(outcome.model_dump())

    return jsonify({"error": "Malformed JSON payload."}), 400
|
| 756 |
+
|
| 757 |
+
|
| 758 |
+
@app.post("/state")
def state_endpoint():
    """Return the EnvironmentState snapshot reported by the current environment.

    Returns:
        Flask response containing state payload.
    """
    snapshot = current_env.state().model_dump()
    return jsonify(snapshot)
|
| 767 |
+
|
| 768 |
+
|
| 769 |
+
def main() -> None:
    """Start Flask's built-in server on all interfaces at port 7860."""
    bind = {"host": "0.0.0.0", "port": 7860}
    app.run(**bind)


# Launch the server only when this module is executed as a script.
if __name__ == "__main__":
    main()
|
server/__pycache__/app.cpython-314.pyc
ADDED
|
Binary file (28.4 kB). View file
|
|
|
server/app.py
ADDED
|
@@ -0,0 +1,775 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Auxiliary server entrypoint required by OpenEnv local validation checks."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
|
| 5 |
+
from flask import Flask, Response, jsonify, request
|
| 6 |
+
|
| 7 |
+
from environment import EmailTriageEnv
|
| 8 |
+
from tasks import get_task_scenario_count, list_task_ids
|
| 9 |
+
|
| 10 |
+
FRONTEND_HTML = """<!doctype html>
|
| 11 |
+
<html lang="en">
|
| 12 |
+
<head>
|
| 13 |
+
<meta charset="utf-8" />
|
| 14 |
+
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
| 15 |
+
<title>Inbox Helper Practice</title>
|
| 16 |
+
<style>
|
| 17 |
+
@import url('https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@400;600;700&family=IBM+Plex+Mono:wght@400;500&display=swap');
|
| 18 |
+
|
| 19 |
+
:root {
|
| 20 |
+
--bg: #f5f1e9;
|
| 21 |
+
--paper: #fffaf2;
|
| 22 |
+
--ink: #102433;
|
| 23 |
+
--accent: #ea6a2a;
|
| 24 |
+
--accent-soft: #ffd6bf;
|
| 25 |
+
--line: #d7cabb;
|
| 26 |
+
--ok: #0f7b6c;
|
| 27 |
+
--warn: #9a3a12;
|
| 28 |
+
--radius: 14px;
|
| 29 |
+
}
|
| 30 |
+
|
| 31 |
+
* { box-sizing: border-box; }
|
| 32 |
+
|
| 33 |
+
body {
|
| 34 |
+
margin: 0;
|
| 35 |
+
font-family: 'Space Grotesk', sans-serif;
|
| 36 |
+
color: var(--ink);
|
| 37 |
+
background:
|
| 38 |
+
radial-gradient(1100px 460px at -10% -20%, #f2bc9f 0%, transparent 60%),
|
| 39 |
+
radial-gradient(1100px 520px at 120% 115%, #b8d7cf 0%, transparent 62%),
|
| 40 |
+
var(--bg);
|
| 41 |
+
min-height: 100vh;
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
.wrap {
|
| 45 |
+
max-width: 1100px;
|
| 46 |
+
margin: 28px auto;
|
| 47 |
+
padding: 0 16px;
|
| 48 |
+
animation: reveal .45s ease-out;
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
@keyframes reveal {
|
| 52 |
+
from { opacity: 0; transform: translateY(10px); }
|
| 53 |
+
to { opacity: 1; transform: translateY(0); }
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
.title {
|
| 57 |
+
display: flex;
|
| 58 |
+
justify-content: space-between;
|
| 59 |
+
align-items: baseline;
|
| 60 |
+
gap: 14px;
|
| 61 |
+
margin-bottom: 14px;
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
h1 {
|
| 65 |
+
margin: 0;
|
| 66 |
+
font-size: clamp(1.5rem, 2vw, 2.2rem);
|
| 67 |
+
letter-spacing: .4px;
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
.subtitle {
|
| 71 |
+
margin: 6px 0 0;
|
| 72 |
+
font-size: .95rem;
|
| 73 |
+
opacity: .8;
|
| 74 |
+
}
|
| 75 |
+
|
| 76 |
+
.badge {
|
| 77 |
+
background: var(--accent-soft);
|
| 78 |
+
border: 1px solid #f2b693;
|
| 79 |
+
color: #7f2e0b;
|
| 80 |
+
padding: 6px 10px;
|
| 81 |
+
border-radius: 999px;
|
| 82 |
+
font-size: .85rem;
|
| 83 |
+
font-weight: 600;
|
| 84 |
+
}
|
| 85 |
+
|
| 86 |
+
.grid {
|
| 87 |
+
display: grid;
|
| 88 |
+
grid-template-columns: 1fr;
|
| 89 |
+
gap: 14px;
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
+
@media (min-width: 900px) {
|
| 93 |
+
.grid { grid-template-columns: 1fr 1fr; }
|
| 94 |
+
.wide { grid-column: span 2; }
|
| 95 |
+
}
|
| 96 |
+
|
| 97 |
+
.card {
|
| 98 |
+
background: var(--paper);
|
| 99 |
+
border: 1px solid var(--line);
|
| 100 |
+
border-radius: var(--radius);
|
| 101 |
+
padding: 14px;
|
| 102 |
+
box-shadow: 0 8px 28px rgba(16, 36, 51, 0.08);
|
| 103 |
+
}
|
| 104 |
+
|
| 105 |
+
.card h2 {
|
| 106 |
+
margin: 0 0 10px;
|
| 107 |
+
font-size: 1rem;
|
| 108 |
+
text-transform: uppercase;
|
| 109 |
+
letter-spacing: .08em;
|
| 110 |
+
opacity: .86;
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
.row {
|
| 114 |
+
display: flex;
|
| 115 |
+
flex-wrap: wrap;
|
| 116 |
+
gap: 8px;
|
| 117 |
+
align-items: center;
|
| 118 |
+
margin-bottom: 10px;
|
| 119 |
+
}
|
| 120 |
+
|
| 121 |
+
select, input, textarea, button {
|
| 122 |
+
font-family: inherit;
|
| 123 |
+
font-size: .95rem;
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
select, input, textarea {
|
| 127 |
+
width: 100%;
|
| 128 |
+
border: 1px solid #cdbba6;
|
| 129 |
+
border-radius: 10px;
|
| 130 |
+
padding: 9px 10px;
|
| 131 |
+
background: #fff;
|
| 132 |
+
color: var(--ink);
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
textarea {
|
| 136 |
+
min-height: 92px;
|
| 137 |
+
resize: vertical;
|
| 138 |
+
}
|
| 139 |
+
|
| 140 |
+
button {
|
| 141 |
+
border: 0;
|
| 142 |
+
border-radius: 10px;
|
| 143 |
+
padding: 9px 12px;
|
| 144 |
+
font-weight: 700;
|
| 145 |
+
background: var(--ink);
|
| 146 |
+
color: #fff;
|
| 147 |
+
cursor: pointer;
|
| 148 |
+
transition: transform .12s ease, opacity .12s ease;
|
| 149 |
+
}
|
| 150 |
+
|
| 151 |
+
button.secondary {
|
| 152 |
+
background: #285066;
|
| 153 |
+
}
|
| 154 |
+
|
| 155 |
+
button.accent {
|
| 156 |
+
background: var(--accent);
|
| 157 |
+
}
|
| 158 |
+
|
| 159 |
+
button:hover { transform: translateY(-1px); }
|
| 160 |
+
button:active { transform: translateY(0); opacity: .92; }
|
| 161 |
+
|
| 162 |
+
.status {
|
| 163 |
+
padding: 8px 10px;
|
| 164 |
+
border-radius: 10px;
|
| 165 |
+
background: #eef7f5;
|
| 166 |
+
border: 1px solid #c7e4de;
|
| 167 |
+
color: var(--ok);
|
| 168 |
+
font-weight: 600;
|
| 169 |
+
min-height: 40px;
|
| 170 |
+
display: flex;
|
| 171 |
+
align-items: center;
|
| 172 |
+
}
|
| 173 |
+
|
| 174 |
+
.status.error {
|
| 175 |
+
background: #fff1ea;
|
| 176 |
+
border-color: #ffc8ae;
|
| 177 |
+
color: var(--warn);
|
| 178 |
+
}
|
| 179 |
+
|
| 180 |
+
pre {
|
| 181 |
+
margin: 0;
|
| 182 |
+
white-space: pre-wrap;
|
| 183 |
+
background: #0f1b24;
|
| 184 |
+
color: #d9efe9;
|
| 185 |
+
border-radius: 10px;
|
| 186 |
+
padding: 12px;
|
| 187 |
+
max-height: 340px;
|
| 188 |
+
overflow: auto;
|
| 189 |
+
font-family: 'IBM Plex Mono', monospace;
|
| 190 |
+
font-size: .85rem;
|
| 191 |
+
border: 1px solid #21313f;
|
| 192 |
+
}
|
| 193 |
+
|
| 194 |
+
.email-block {
|
| 195 |
+
background: #fff;
|
| 196 |
+
border: 1px solid #d9ccbc;
|
| 197 |
+
border-radius: 10px;
|
| 198 |
+
padding: 12px;
|
| 199 |
+
}
|
| 200 |
+
|
| 201 |
+
.email-row {
|
| 202 |
+
margin-bottom: 8px;
|
| 203 |
+
font-size: .95rem;
|
| 204 |
+
line-height: 1.35;
|
| 205 |
+
}
|
| 206 |
+
|
| 207 |
+
.email-row strong {
|
| 208 |
+
display: inline-block;
|
| 209 |
+
min-width: 66px;
|
| 210 |
+
}
|
| 211 |
+
|
| 212 |
+
.help {
|
| 213 |
+
margin: 0 0 10px;
|
| 214 |
+
font-size: .9rem;
|
| 215 |
+
opacity: .8;
|
| 216 |
+
}
|
| 217 |
+
|
| 218 |
+
.metric {
|
| 219 |
+
display: flex;
|
| 220 |
+
justify-content: space-between;
|
| 221 |
+
align-items: center;
|
| 222 |
+
margin-bottom: 8px;
|
| 223 |
+
padding-bottom: 6px;
|
| 224 |
+
border-bottom: 1px dashed #dbcfbe;
|
| 225 |
+
font-size: .95rem;
|
| 226 |
+
}
|
| 227 |
+
|
| 228 |
+
.metric strong {
|
| 229 |
+
font-weight: 700;
|
| 230 |
+
}
|
| 231 |
+
|
| 232 |
+
.coach {
|
| 233 |
+
background: #fff7ed;
|
| 234 |
+
border: 1px solid #f2caa9;
|
| 235 |
+
border-radius: 10px;
|
| 236 |
+
padding: 10px;
|
| 237 |
+
min-height: 74px;
|
| 238 |
+
line-height: 1.4;
|
| 239 |
+
font-size: .92rem;
|
| 240 |
+
}
|
| 241 |
+
|
| 242 |
+
.chip-row {
|
| 243 |
+
display: flex;
|
| 244 |
+
flex-wrap: wrap;
|
| 245 |
+
gap: 8px;
|
| 246 |
+
margin-top: 10px;
|
| 247 |
+
}
|
| 248 |
+
|
| 249 |
+
.chip {
|
| 250 |
+
background: #eaf3ff;
|
| 251 |
+
border: 1px solid #b9d1ef;
|
| 252 |
+
color: #184469;
|
| 253 |
+
border-radius: 999px;
|
| 254 |
+
padding: 6px 10px;
|
| 255 |
+
font-size: .84rem;
|
| 256 |
+
cursor: pointer;
|
| 257 |
+
font-weight: 600;
|
| 258 |
+
}
|
| 259 |
+
</style>
|
| 260 |
+
</head>
|
| 261 |
+
<body>
|
| 262 |
+
<div class="wrap">
|
| 263 |
+
<div class="title">
|
| 264 |
+
<div>
|
| 265 |
+
<h1>Inbox Helper Practice</h1>
|
| 266 |
+
<p class="subtitle">Practice deciding priority, category, and who should handle each email.</p>
|
| 267 |
+
</div>
|
| 268 |
+
<span class="badge" id="badge">connecting...</span>
|
| 269 |
+
</div>
|
| 270 |
+
|
| 271 |
+
<div class="grid">
|
| 272 |
+
<section class="card">
|
| 273 |
+
<h2>Start a Scenario</h2>
|
| 274 |
+
<p class="help">Pick a difficulty, then click Start.</p>
|
| 275 |
+
<div class="row">
|
| 276 |
+
<select id="taskId">
|
| 277 |
+
<option value="task_easy">Easy: one clear email</option>
|
| 278 |
+
<option value="task_medium">Medium: mixed inbox</option>
|
| 279 |
+
<option value="task_hard">Hard: high-risk complaint</option>
|
| 280 |
+
<option value="task_production">Production: full inbox simulator</option>
|
| 281 |
+
</select>
|
| 282 |
+
</div>
|
| 283 |
+
<div id="productionControls" style="display:none;">
|
| 284 |
+
<div class="row">
|
| 285 |
+
<select id="productionProfile">
|
| 286 |
+
<option value="light">Workload: Light</option>
|
| 287 |
+
<option value="standard" selected>Workload: Standard</option>
|
| 288 |
+
<option value="heavy">Workload: Heavy</option>
|
| 289 |
+
</select>
|
| 290 |
+
</div>
|
| 291 |
+
<div class="row">
|
| 292 |
+
<select id="businessHoursMode">
|
| 293 |
+
<option value="false" selected>Time Profile: 24x7 inbox</option>
|
| 294 |
+
<option value="true">Time Profile: business hours focus</option>
|
| 295 |
+
</select>
|
| 296 |
+
</div>
|
| 297 |
+
<div class="row">
|
| 298 |
+
<select id="escalationMode">
|
| 299 |
+
<option value="low">Escalation: Low</option>
|
| 300 |
+
<option value="normal" selected>Escalation: Normal</option>
|
| 301 |
+
<option value="high">Escalation: High</option>
|
| 302 |
+
</select>
|
| 303 |
+
</div>
|
| 304 |
+
</div>
|
| 305 |
+
<div class="row">
|
| 306 |
+
<button class="accent" id="btnReset">Start</button>
|
| 307 |
+
<button class="secondary" id="btnState">Check Progress</button>
|
| 308 |
+
</div>
|
| 309 |
+
<div class="status" id="status">Ready. Start a scenario.</div>
|
| 310 |
+
</section>
|
| 311 |
+
|
| 312 |
+
<section class="card">
|
| 313 |
+
<h2>Your Decision</h2>
|
| 314 |
+
<p class="help">Choose priority, who should handle it, and a short reason.</p>
|
| 315 |
+
<div class="row">
|
| 316 |
+
<select id="label">
|
| 317 |
+
<option value="urgent">Urgent</option>
|
| 318 |
+
<option value="normal" selected>Normal</option>
|
| 319 |
+
<option value="spam">Spam</option>
|
| 320 |
+
<option value="archive">Archive</option>
|
| 321 |
+
</select>
|
| 322 |
+
</div>
|
| 323 |
+
<div class="row">
|
| 324 |
+
<input id="routeTo" placeholder="Who should handle this? (billing, safety, engineering, support)" value="general" />
|
| 325 |
+
</div>
|
| 326 |
+
<div class="row">
|
| 327 |
+
<textarea id="summary" placeholder="Write one clear sentence with key clues from the email.">Needs review.</textarea>
|
| 328 |
+
</div>
|
| 329 |
+
<div class="row">
|
| 330 |
+
<button id="btnStep">Send Decision</button>
|
| 331 |
+
</div>
|
| 332 |
+
</section>
|
| 333 |
+
|
| 334 |
+
<section class="card wide">
|
| 335 |
+
<h2>Current Email</h2>
|
| 336 |
+
<div class="email-block">
|
| 337 |
+
<div class="email-row"><strong>Subject:</strong> <span id="mailSubject">No email loaded yet.</span></div>
|
| 338 |
+
<div class="email-row"><strong>From:</strong> <span id="mailSender">-</span></div>
|
| 339 |
+
<div class="email-row"><strong>Message:</strong> <span id="mailBody">Start a scenario to load an email.</span></div>
|
| 340 |
+
</div>
|
| 341 |
+
</section>
|
| 342 |
+
|
| 343 |
+
<section class="card">
|
| 344 |
+
<h2>Live Progress</h2>
|
| 345 |
+
<div class="metric"><span>Task</span><strong id="insightTask">-</strong></div>
|
| 346 |
+
<div class="metric"><span>Scenario</span><strong id="insightScenario">-</strong></div>
|
| 347 |
+
<div class="metric"><span>Progress</span><strong id="insightProgress">0/0</strong></div>
|
| 348 |
+
<div class="metric"><span>Last Reward</span><strong id="insightReward">-</strong></div>
|
| 349 |
+
<div class="metric"><span>Base Score</span><strong id="insightBase">-</strong></div>
|
| 350 |
+
</section>
|
| 351 |
+
|
| 352 |
+
<section class="card">
|
| 353 |
+
<h2>Coach Notes</h2>
|
| 354 |
+
<p class="help">Use this to improve your next triage action.</p>
|
| 355 |
+
<div class="coach" id="coachNotes">Start a scenario and submit one decision to get feedback.</div>
|
| 356 |
+
<div class="chip-row">
|
| 357 |
+
<button class="chip" id="chipSafety">Quick Fill: Urgent + Safety</button>
|
| 358 |
+
<button class="chip" id="chipBilling">Quick Fill: Normal + Billing</button>
|
| 359 |
+
<button class="chip" id="chipSpam">Quick Fill: Spam + General</button>
|
| 360 |
+
</div>
|
| 361 |
+
</section>
|
| 362 |
+
|
| 363 |
+
<section class="card wide">
|
| 364 |
+
<h2>Details (Advanced)</h2>
|
| 365 |
+
<pre id="output">Waiting for your first action...</pre>
|
| 366 |
+
</section>
|
| 367 |
+
</div>
|
| 368 |
+
</div>
|
| 369 |
+
|
| 370 |
+
<script>
|
| 371 |
+
const statusEl = document.getElementById('status');
|
| 372 |
+
const badgeEl = document.getElementById('badge');
|
| 373 |
+
const outEl = document.getElementById('output');
|
| 374 |
+
const mailSubjectEl = document.getElementById('mailSubject');
|
| 375 |
+
const mailSenderEl = document.getElementById('mailSender');
|
| 376 |
+
const mailBodyEl = document.getElementById('mailBody');
|
| 377 |
+
const taskIdEl = document.getElementById('taskId');
|
| 378 |
+
const productionControlsEl = document.getElementById('productionControls');
|
| 379 |
+
const insightTaskEl = document.getElementById('insightTask');
|
| 380 |
+
const insightScenarioEl = document.getElementById('insightScenario');
|
| 381 |
+
const insightProgressEl = document.getElementById('insightProgress');
|
| 382 |
+
const insightRewardEl = document.getElementById('insightReward');
|
| 383 |
+
const insightBaseEl = document.getElementById('insightBase');
|
| 384 |
+
const coachNotesEl = document.getElementById('coachNotes');
|
| 385 |
+
|
| 386 |
+
function setStatus(msg, isError = false) {
|
| 387 |
+
statusEl.textContent = msg;
|
| 388 |
+
statusEl.classList.toggle('error', isError);
|
| 389 |
+
}
|
| 390 |
+
|
| 391 |
+
function writeOutput(value) {
|
| 392 |
+
outEl.textContent = typeof value === 'string' ? value : JSON.stringify(value, null, 2);
|
| 393 |
+
}
|
| 394 |
+
|
| 395 |
+
function updateEmailPanel(data) {
|
| 396 |
+
if (!data || !data.observation) {
|
| 397 |
+
return;
|
| 398 |
+
}
|
| 399 |
+
const obs = data.observation;
|
| 400 |
+
mailSubjectEl.textContent = obs.subject || 'No subject';
|
| 401 |
+
mailSenderEl.textContent = obs.sender || '-';
|
| 402 |
+
mailBodyEl.textContent = obs.body || '';
|
| 403 |
+
}
|
| 404 |
+
|
| 405 |
+
function updateProductionControlsVisibility() {
|
| 406 |
+
const isProduction = taskIdEl.value === 'task_production';
|
| 407 |
+
productionControlsEl.style.display = isProduction ? 'block' : 'none';
|
| 408 |
+
}
|
| 409 |
+
|
| 410 |
+
function safeNumber(value) {
|
| 411 |
+
return typeof value === 'number' && !Number.isNaN(value) ? value : null;
|
| 412 |
+
}
|
| 413 |
+
|
| 414 |
+
function updateInsights(data) {
|
| 415 |
+
const info = (data && data.info) ? data.info : {};
|
| 416 |
+
const taskValue = info.task_id || data.task_id || (data.observation && data.observation.task_id) || '-';
|
| 417 |
+
const scenarioValue = info.scenario_id || '-';
|
| 418 |
+
|
| 419 |
+
insightTaskEl.textContent = taskValue;
|
| 420 |
+
insightScenarioEl.textContent = scenarioValue;
|
| 421 |
+
|
| 422 |
+
const emailsProcessed = safeNumber(info.emails_processed);
|
| 423 |
+
const emailsTotal = safeNumber(info.emails_total);
|
| 424 |
+
if (emailsProcessed !== null && emailsTotal !== null) {
|
| 425 |
+
insightProgressEl.textContent = `${emailsProcessed}/${emailsTotal}`;
|
| 426 |
+
} else if (safeNumber(data.current_step) !== null && safeNumber(data.total_steps) !== null) {
|
| 427 |
+
insightProgressEl.textContent = `${data.current_step}/${data.total_steps}`;
|
| 428 |
+
}
|
| 429 |
+
|
| 430 |
+
const rewardValue = safeNumber(data.reward);
|
| 431 |
+
insightRewardEl.textContent = rewardValue !== null ? rewardValue.toFixed(2) : '-';
|
| 432 |
+
|
| 433 |
+
const baseScoreValue = safeNumber(info.base_score);
|
| 434 |
+
insightBaseEl.textContent = baseScoreValue !== null ? baseScoreValue.toFixed(2) : '-';
|
| 435 |
+
|
| 436 |
+
const tips = [];
|
| 437 |
+
if (info.validation_error) {
|
| 438 |
+
tips.push('Action format is invalid. Keep label/summary/route_to filled correctly.');
|
| 439 |
+
}
|
| 440 |
+
|
| 441 |
+
const routeNoise = safeNumber(info.grade_route_noise_penalty);
|
| 442 |
+
if (routeNoise !== null && routeNoise > 0.01) {
|
| 443 |
+
tips.push('Route to one best owner team. Avoid sending to many teams at once.');
|
| 444 |
+
}
|
| 445 |
+
|
| 446 |
+
const summaryMatch = safeNumber(info.grade_summary_match);
|
| 447 |
+
if (summaryMatch !== null && summaryMatch < 0.6) {
|
| 448 |
+
tips.push('Summary is weak. Include concrete clues from subject/body/thread.');
|
| 449 |
+
}
|
| 450 |
+
|
| 451 |
+
const labelMatch = safeNumber(info.grade_label_match);
|
| 452 |
+
if (labelMatch !== null && labelMatch < 1.0) {
|
| 453 |
+
tips.push('Priority label may be off. Re-check urgency and risk signals.');
|
| 454 |
+
}
|
| 455 |
+
|
| 456 |
+
const routeMatch = safeNumber(info.grade_route_match);
|
| 457 |
+
if (routeMatch !== null && routeMatch < 1.0) {
|
| 458 |
+
tips.push('Routing looks off. Pick the team that directly owns this issue.');
|
| 459 |
+
}
|
| 460 |
+
|
| 461 |
+
const urgencyComponent = safeNumber(info.grade_urgency_component);
|
| 462 |
+
if (urgencyComponent !== null && urgencyComponent < 0.2) {
|
| 463 |
+
tips.push('For high-risk complaints, mark urgent and route to safety first.');
|
| 464 |
+
}
|
| 465 |
+
|
| 466 |
+
if (!tips.length && typeof info.grading_feedback === 'string' && info.grading_feedback) {
|
| 467 |
+
tips.push(info.grading_feedback);
|
| 468 |
+
}
|
| 469 |
+
|
| 470 |
+
coachNotesEl.textContent = tips.length
|
| 471 |
+
? tips.join(' ')
|
| 472 |
+
: 'Looks good. Keep your next route precise and your summary evidence-based.';
|
| 473 |
+
}
|
| 474 |
+
|
| 475 |
+
function prefillAction(label, routeTo, summary) {
|
| 476 |
+
document.getElementById('label').value = label;
|
| 477 |
+
document.getElementById('routeTo').value = routeTo;
|
| 478 |
+
document.getElementById('summary').value = summary;
|
| 479 |
+
}
|
| 480 |
+
|
| 481 |
+
async function postJson(path, payload) {
|
| 482 |
+
const response = await fetch(path, {
|
| 483 |
+
method: 'POST',
|
| 484 |
+
headers: { 'Content-Type': 'application/json' },
|
| 485 |
+
body: JSON.stringify(payload || {}),
|
| 486 |
+
});
|
| 487 |
+
const text = await response.text();
|
| 488 |
+
let data = text;
|
| 489 |
+
try { data = JSON.parse(text); } catch (e) {}
|
| 490 |
+
if (!response.ok) {
|
| 491 |
+
throw new Error('HTTP ' + response.status + ' - ' + text);
|
| 492 |
+
}
|
| 493 |
+
return data;
|
| 494 |
+
}
|
| 495 |
+
|
| 496 |
+
async function warmup() {
|
| 497 |
+
try {
|
| 498 |
+
const res = await fetch('/meta');
|
| 499 |
+
const data = await res.json();
|
| 500 |
+
badgeEl.textContent = data.status === 'ok' ? 'ready' : 'check service';
|
| 501 |
+
} catch (e) {
|
| 502 |
+
badgeEl.textContent = 'offline';
|
| 503 |
+
}
|
| 504 |
+
}
|
| 505 |
+
|
| 506 |
+
document.getElementById('btnReset').addEventListener('click', async () => {
|
| 507 |
+
const taskId = taskIdEl.value;
|
| 508 |
+
setStatus('Starting a new scenario...');
|
| 509 |
+
try {
|
| 510 |
+
const payload = { task_id: taskId };
|
| 511 |
+
if (taskId === 'task_production') {
|
| 512 |
+
payload.production_profile = document.getElementById('productionProfile').value;
|
| 513 |
+
payload.business_hours_mode = document.getElementById('businessHoursMode').value === 'true';
|
| 514 |
+
payload.escalation_mode = document.getElementById('escalationMode').value;
|
| 515 |
+
}
|
| 516 |
+
const data = await postJson('/reset', payload);
|
| 517 |
+
setStatus('Scenario started. Read the email below.');
|
| 518 |
+
updateEmailPanel(data);
|
| 519 |
+
updateInsights(data);
|
| 520 |
+
writeOutput(data);
|
| 521 |
+
} catch (e) {
|
| 522 |
+
setStatus('Could not start scenario. See details below.', true);
|
| 523 |
+
writeOutput(String(e));
|
| 524 |
+
}
|
| 525 |
+
});
|
| 526 |
+
|
| 527 |
+
document.getElementById('btnState').addEventListener('click', async () => {
|
| 528 |
+
setStatus('Checking progress...');
|
| 529 |
+
try {
|
| 530 |
+
const data = await postJson('/state', {});
|
| 531 |
+
setStatus('Progress updated.');
|
| 532 |
+
updateInsights(data);
|
| 533 |
+
writeOutput(data);
|
| 534 |
+
} catch (e) {
|
| 535 |
+
setStatus('Could not fetch progress. See details below.', true);
|
| 536 |
+
writeOutput(String(e));
|
| 537 |
+
}
|
| 538 |
+
});
|
| 539 |
+
|
| 540 |
+
document.getElementById('btnStep').addEventListener('click', async () => {
|
| 541 |
+
const payload = {
|
| 542 |
+
label: document.getElementById('label').value,
|
| 543 |
+
summary: document.getElementById('summary').value,
|
| 544 |
+
route_to: document.getElementById('routeTo').value,
|
| 545 |
+
};
|
| 546 |
+
setStatus('Sending your decision...');
|
| 547 |
+
try {
|
| 548 |
+
const data = await postJson('/step', payload);
|
| 549 |
+
setStatus('Decision saved.');
|
| 550 |
+
updateEmailPanel(data);
|
| 551 |
+
updateInsights(data);
|
| 552 |
+
writeOutput(data);
|
| 553 |
+
} catch (e) {
|
| 554 |
+
setStatus('Could not submit decision. See details below.', true);
|
| 555 |
+
writeOutput(String(e));
|
| 556 |
+
}
|
| 557 |
+
});
|
| 558 |
+
|
| 559 |
+
document.getElementById('chipSafety').addEventListener('click', () => {
|
| 560 |
+
prefillAction('urgent', 'safety', 'Potential safety risk with immediate escalation needed.');
|
| 561 |
+
});
|
| 562 |
+
|
| 563 |
+
document.getElementById('chipBilling').addEventListener('click', () => {
|
| 564 |
+
prefillAction('normal', 'billing', 'Customer billing issue needs finance team review and response.');
|
| 565 |
+
});
|
| 566 |
+
|
| 567 |
+
document.getElementById('chipSpam').addEventListener('click', () => {
|
| 568 |
+
prefillAction('spam', 'general', 'Likely phishing or irrelevant message with suspicious external request.');
|
| 569 |
+
});
|
| 570 |
+
|
| 571 |
+
taskIdEl.addEventListener('change', updateProductionControlsVisibility);
|
| 572 |
+
|
| 573 |
+
updateProductionControlsVisibility();
|
| 574 |
+
warmup();
|
| 575 |
+
</script>
|
| 576 |
+
</body>
|
| 577 |
+
</html>
|
| 578 |
+
"""
|
| 579 |
+
|
| 580 |
+
# Flask application object; the route decorators below register on it.
app = Flask(__name__)

# Environment instance shared by all requests; /reset rebinds this name.
current_env = EmailTriageEnv(task_id="task_easy")

# Per-task index of the next scenario to serve when the client picks none.
SCENARIO_COUNTERS = dict.fromkeys(list_task_ids(), 0)

# Evaluation split used for resets unless client overrides are enabled.
DEFAULT_EVAL_SPLIT = os.environ.get("OPENENV_EVAL_SPLIT", "public")

# Only the exact (case-insensitive, whitespace-trimmed) value "true" lets
# clients choose eval_split / scenario_index themselves.
ALLOW_CLIENT_EVAL_OVERRIDE = (
    os.environ.get("OPENENV_ALLOW_CLIENT_EVAL_OVERRIDE", "false").strip().lower()
    == "true"
)
|
| 587 |
+
|
| 588 |
+
|
| 589 |
+
@app.get("/")
def root_page():
    """Serve the embedded single-page frontend.

    Returns:
        Flask response whose body is ``FRONTEND_HTML`` with the
        ``text/html`` MIME type.
    """
    html_response = Response(FRONTEND_HTML, mimetype="text/html")
    return html_response
|
| 593 |
+
|
| 594 |
+
|
| 595 |
+
@app.get("/meta")
def root_endpoint():
    """Describe the service for health probes and programmatic clients.

    Returns:
        Flask JSON response with the service name, a fixed ``"ok"`` status,
        the three POST endpoints, per-task scenario counts for the public
        pool, the server's default evaluation split, and the accepted
        values of the production runtime controls.
    """
    endpoint_map = {
        action: {"method": "POST", "path": f"/{action}"}
        for action in ("reset", "step", "state")
    }

    # Only the public pool sizes are advertised here.
    public_counts = {}
    for task_id in list_task_ids():
        public_counts[task_id] = get_task_scenario_count(task_id, "public")

    runtime_controls = {
        "production_profile": ["light", "standard", "heavy"],
        "business_hours_mode": [True, False],
        "escalation_mode": ["low", "normal", "high"],
    }

    metadata = {
        "name": "email-triage-env",
        "status": "ok",
        "endpoints": endpoint_map,
        "scenario_pools": {"public": public_counts},
        "eval_split": DEFAULT_EVAL_SPLIT,
        "production_runtime_controls": runtime_controls,
    }
    return jsonify(metadata)
|
| 621 |
+
|
| 622 |
+
|
| 623 |
+
def _bad_request(message):
    """Return a ``(response, 400)`` pair whose JSON body is ``{"error": message}``."""
    return jsonify({"error": message}), 400


def _read_choice(payload, field, default, allowed):
    """Read an enumerated string field from the request payload.

    Args:
        payload: Parsed JSON object from the request.
        field: Key to look up in ``payload``.
        default: Value used when the key is absent.
        allowed: Ordered tuple of accepted strings.

    Returns:
        ``(value, None)`` when the value is one of ``allowed``, otherwise
        ``(None, error_message)``.
    """
    value = payload.get(field, default)
    # The isinstance check comes first so unhashable JSON values (lists,
    # objects) never reach the membership test.
    if isinstance(value, str) and value in allowed:
        return value, None
    return None, f"Field '{field}' must be one of {'/'.join(allowed)}."


def _read_production_options(payload):
    """Validate the runtime controls that apply only to ``task_production``.

    Args:
        payload: Parsed JSON object from the request.

    Returns:
        ``(options, None)`` with the keys ``production_profile``,
        ``business_hours_mode`` and ``escalation_mode`` on success, otherwise
        ``(None, error_message)``.
    """
    production_profile, error = _read_choice(
        payload, "production_profile", "standard", ("light", "standard", "heavy")
    )
    if error is not None:
        return None, error

    escalation_mode, error = _read_choice(
        payload, "escalation_mode", "normal", ("low", "normal", "high")
    )
    if error is not None:
        return None, error

    business_hours_mode = payload.get("business_hours_mode", False)
    if isinstance(business_hours_mode, str):
        # String flags are accepted leniently; anything outside this set
        # is treated as False rather than rejected.
        business_hours_mode = business_hours_mode.strip().lower() in {
            "1",
            "true",
            "yes",
            "on",
        }
    elif not isinstance(business_hours_mode, bool):
        return None, "Field 'business_hours_mode' must be boolean."

    options = {
        "production_profile": production_profile,
        "business_hours_mode": business_hours_mode,
        "escalation_mode": escalation_mode,
    }
    return options, None


@app.post("/reset")
def reset_endpoint():
    """Reset the environment with a selected task and return ResetResult JSON.

    Request body (JSON object, all fields optional; a missing or unparsable
    body is treated as an empty object):
        task_id: Task to load; defaults to ``"task_easy"``.
        production_profile, business_hours_mode, escalation_mode: Runtime
            controls, read only when ``task_id`` is ``"task_production"``.
        eval_split, scenario_index: Accepted only when
            ``ALLOW_CLIENT_EVAL_OVERRIDE`` is true; otherwise their presence
            is rejected.

    When no ``scenario_index`` is supplied, the per-task counter in
    ``SCENARIO_COUNTERS`` provides the index and is then advanced, wrapping
    at the scenario count for the active split.

    Returns:
        Flask response containing the reset payload, or a
        ``(response, 400)`` pair with an ``error`` message when validation
        fails or the task lookup raises ``KeyError``.
    """
    global current_env

    payload = request.get_json(silent=True)
    if payload is None:
        payload = {}
    elif not isinstance(payload, dict):
        return _bad_request("Malformed JSON payload.")

    task_id = payload.get("task_id", "task_easy")
    if not isinstance(task_id, str):
        return _bad_request("Field 'task_id' must be a string.")

    runtime_options: dict[str, object] = {}
    if task_id == "task_production":
        options, error = _read_production_options(payload)
        if error is not None:
            return _bad_request(error)
        runtime_options = options

    eval_split = DEFAULT_EVAL_SPLIT
    requested_index = None
    if ALLOW_CLIENT_EVAL_OVERRIDE:
        eval_split = payload.get("eval_split", DEFAULT_EVAL_SPLIT)
        if not isinstance(eval_split, str):
            return _bad_request("Field 'eval_split' must be a string.")

        requested_index = payload.get("scenario_index")
        # bool is a subclass of int, so JSON true/false must be rejected
        # explicitly or it would be used as index 1/0.
        if requested_index is not None and (
            isinstance(requested_index, bool)
            or not isinstance(requested_index, int)
            or requested_index < 0
        ):
            return _bad_request(
                "Field 'scenario_index' must be a non-negative integer."
            )
    elif "eval_split" in payload or "scenario_index" in payload:
        return _bad_request(
            "Client overrides for eval_split/scenario_index are disabled "
            "by server policy."
        )

    try:
        scenario_count = get_task_scenario_count(task_id, eval_split)
        if requested_index is None:
            # Rotate through the pool: serve the stored index, then advance.
            scenario_index = SCENARIO_COUNTERS.get(task_id, 0)
            if scenario_count > 0:
                SCENARIO_COUNTERS[task_id] = (scenario_index + 1) % scenario_count
        else:
            scenario_index = requested_index

        current_env = EmailTriageEnv(
            task_id=task_id,
            scenario_index=scenario_index,
            split=eval_split,
            runtime_options=runtime_options,
        )
        reset_result = current_env.reset()
    except KeyError as error:
        return _bad_request(str(error))

    return jsonify(reset_result.model_dump())
|
| 741 |
+
|
| 742 |
+
|
| 743 |
+
@app.post("/step")
def step_endpoint():
    """Apply one action to the shared environment and return StepResult JSON.

    The request body is parsed as JSON and handed to ``current_env.step``
    unchanged.

    Returns:
        Flask response containing the step payload, or a ``(response, 400)``
        pair when the body is missing or not parsable as JSON.
    """
    action = request.get_json(silent=True)
    if action is not None:
        outcome = current_env.step(action)
        return jsonify(outcome.model_dump())
    return jsonify({"error": "Malformed JSON payload."}), 400
|
| 756 |
+
|
| 757 |
+
|
| 758 |
+
@app.post("/state")
def state_endpoint():
    """Report the current state of the shared environment as JSON.

    Returns:
        Flask response containing the dumped result of
        ``current_env.state()``.
    """
    return jsonify(current_env.state().model_dump())
|
| 767 |
+
|
| 768 |
+
|
| 769 |
+
def main() -> None:
    """Start the Flask development server on all interfaces, port 7860."""
    bind_host, bind_port = "0.0.0.0", 7860
    app.run(host=bind_host, port=bind_port)


# Launch the server only when this file is executed directly.
if __name__ == "__main__":
    main()
|
tasks.py
ADDED
|
@@ -0,0 +1,748 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Task definitions and scenario pools for the OpenEnv email triage environment."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
import random
|
| 6 |
+
from datetime import datetime, timedelta, timezone
|
| 7 |
+
|
| 8 |
+
from typing import cast
|
| 9 |
+
|
| 10 |
+
TASK_LIBRARY: dict[str, dict[str, object]] = {
|
| 11 |
+
"task_easy": {
|
| 12 |
+
"description": "Classify and route one unambiguous operational email.",
|
| 13 |
+
"scenario_pool": [
|
| 14 |
+
{
|
| 15 |
+
"scenario_id": "easy_invoice_confirmation",
|
| 16 |
+
"emails": [
|
| 17 |
+
{
|
| 18 |
+
"email_id": "easy-001",
|
| 19 |
+
"subject": "Quarterly invoice available",
|
| 20 |
+
"body": (
|
| 21 |
+
"Hello Team, your Q1 invoice is now ready in the billing portal. "
|
| 22 |
+
"Please confirm the purchase order number by Friday."
|
| 23 |
+
),
|
| 24 |
+
"sender": "accounts@vendor-example.com",
|
| 25 |
+
"timestamp": "2026-03-25T09:15:00Z",
|
| 26 |
+
"thread_history": [
|
| 27 |
+
"Last month: requested invoice schedule for Q1 and Q2."
|
| 28 |
+
],
|
| 29 |
+
}
|
| 30 |
+
],
|
| 31 |
+
"ground_truth": [
|
| 32 |
+
{
|
| 33 |
+
"label": "normal",
|
| 34 |
+
"route_to": "billing",
|
| 35 |
+
"priority_weight": 1.0,
|
| 36 |
+
"summary_keywords": ["invoice", "purchase order", "billing portal"],
|
| 37 |
+
}
|
| 38 |
+
],
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"scenario_id": "easy_password_lockout",
|
| 42 |
+
"emails": [
|
| 43 |
+
{
|
| 44 |
+
"email_id": "easy-002",
|
| 45 |
+
"subject": "Locked out of admin dashboard",
|
| 46 |
+
"body": (
|
| 47 |
+
"I cannot access the admin dashboard after MFA reset, and our "
|
| 48 |
+
"client demo starts in 20 minutes. Please help immediately."
|
| 49 |
+
),
|
| 50 |
+
"sender": "sales-lead@acme-enterprise.com",
|
| 51 |
+
"timestamp": "2026-03-28T11:40:00Z",
|
| 52 |
+
"thread_history": ["MFA reset was completed this morning."],
|
| 53 |
+
}
|
| 54 |
+
],
|
| 55 |
+
"ground_truth": [
|
| 56 |
+
{
|
| 57 |
+
"label": "urgent",
|
| 58 |
+
"route_to": "support",
|
| 59 |
+
"priority_weight": 1.2,
|
| 60 |
+
"summary_keywords": ["locked out", "mfa", "demo"],
|
| 61 |
+
}
|
| 62 |
+
],
|
| 63 |
+
},
|
| 64 |
+
{
|
| 65 |
+
"scenario_id": "easy_newsletter_archive",
|
| 66 |
+
"emails": [
|
| 67 |
+
{
|
| 68 |
+
"email_id": "easy-003",
|
| 69 |
+
"subject": "Monthly partner newsletter",
|
| 70 |
+
"body": (
|
| 71 |
+
"Sharing this month's partner newsletter with product updates. "
|
| 72 |
+
"No action needed unless you want to read the highlights."
|
| 73 |
+
),
|
| 74 |
+
"sender": "updates@partner-network.io",
|
| 75 |
+
"timestamp": "2026-03-30T08:10:00Z",
|
| 76 |
+
"thread_history": [],
|
| 77 |
+
}
|
| 78 |
+
],
|
| 79 |
+
"ground_truth": [
|
| 80 |
+
{
|
| 81 |
+
"label": "archive",
|
| 82 |
+
"route_to": "general",
|
| 83 |
+
"priority_weight": 0.8,
|
| 84 |
+
"summary_keywords": ["newsletter", "no action", "updates"],
|
| 85 |
+
}
|
| 86 |
+
],
|
| 87 |
+
},
|
| 88 |
+
],
|
| 89 |
+
"private_eval_pool": [],
|
| 90 |
+
},
|
| 91 |
+
"task_medium": {
|
| 92 |
+
"description": "Triage five mixed-priority emails with ambiguous contextual signals.",
|
| 93 |
+
"scenario_pool": [
|
| 94 |
+
{
|
| 95 |
+
"scenario_id": "medium_ops_mix_a",
|
| 96 |
+
"emails": [
|
| 97 |
+
{
|
| 98 |
+
"email_id": "med-001",
|
| 99 |
+
"subject": "URGENT: Your account will be disabled in 30 minutes",
|
| 100 |
+
"body": (
|
| 101 |
+
"Click this external short link to keep your mailbox active. "
|
| 102 |
+
"If you do not click now, your account will be deleted."
|
| 103 |
+
),
|
| 104 |
+
"sender": "it-admin@secure-mail-help.net",
|
| 105 |
+
"timestamp": "2026-03-26T07:08:00Z",
|
| 106 |
+
"thread_history": [],
|
| 107 |
+
},
|
| 108 |
+
{
|
| 109 |
+
"email_id": "med-002",
|
| 110 |
+
"subject": "Can someone review production error spikes?",
|
| 111 |
+
"body": (
|
| 112 |
+
"We are seeing a 28% spike in checkout failures after the 06:10 UTC "
|
| 113 |
+
"deploy. Please triage and assign on-call ownership immediately."
|
| 114 |
+
),
|
| 115 |
+
"sender": "ops-manager@acme-enterprise.com",
|
| 116 |
+
"timestamp": "2026-03-26T06:21:00Z",
|
| 117 |
+
"thread_history": ["Pager alert opened at 06:18 UTC."],
|
| 118 |
+
},
|
| 119 |
+
{
|
| 120 |
+
"email_id": "med-003",
|
| 121 |
+
"subject": "RE: promo campaign winner list",
|
| 122 |
+
"body": (
|
| 123 |
+
"Subject line looks like a campaign thread, but this message confirms "
|
| 124 |
+
"a customer reported duplicate card charges. Please review and respond."
|
| 125 |
+
),
|
| 126 |
+
"sender": "care-escalations@acme-enterprise.com",
|
| 127 |
+
"timestamp": "2026-03-26T11:42:00Z",
|
| 128 |
+
"thread_history": [
|
| 129 |
+
"Marketing team forwarded customer complaint for billing review."
|
| 130 |
+
],
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"email_id": "med-004",
|
| 134 |
+
"subject": "Safety escalation: charger overheating case #4812",
|
| 135 |
+
"body": (
|
| 136 |
+
"Customer reports visible smoke from charging dock during normal use. "
|
| 137 |
+
"No injuries reported, but immediate safety review requested."
|
| 138 |
+
),
|
| 139 |
+
"sender": "support-lead@acme-enterprise.com",
|
| 140 |
+
"timestamp": "2026-03-26T10:03:00Z",
|
| 141 |
+
"thread_history": ["Ticket severity raised from P2 to P1."],
|
| 142 |
+
},
|
| 143 |
+
{
|
| 144 |
+
"email_id": "med-005",
|
| 145 |
+
"subject": "FYI: April all-hands agenda",
|
| 146 |
+
"body": (
|
| 147 |
+
"Sharing the all-hands agenda draft. No action required unless you "
|
| 148 |
+
"want to propose additional topics by Monday."
|
| 149 |
+
),
|
| 150 |
+
"sender": "people-ops@acme-enterprise.com",
|
| 151 |
+
"timestamp": "2026-03-26T14:25:00Z",
|
| 152 |
+
"thread_history": [],
|
| 153 |
+
},
|
| 154 |
+
],
|
| 155 |
+
"ground_truth": [
|
| 156 |
+
{
|
| 157 |
+
"label": "spam",
|
| 158 |
+
"route_to": "general",
|
| 159 |
+
"priority_weight": 1.0,
|
| 160 |
+
"summary_keywords": ["external link", "disable", "phishing"],
|
| 161 |
+
},
|
| 162 |
+
{
|
| 163 |
+
"label": "urgent",
|
| 164 |
+
"route_to": "engineering",
|
| 165 |
+
"priority_weight": 1.5,
|
| 166 |
+
"summary_keywords": ["checkout", "failures", "on-call"],
|
| 167 |
+
},
|
| 168 |
+
{
|
| 169 |
+
"label": "normal",
|
| 170 |
+
"route_to": "billing",
|
| 171 |
+
"priority_weight": 1.2,
|
| 172 |
+
"summary_keywords": ["duplicate", "charges", "billing"],
|
| 173 |
+
},
|
| 174 |
+
{
|
| 175 |
+
"label": "urgent",
|
| 176 |
+
"route_to": "safety",
|
| 177 |
+
"priority_weight": 1.6,
|
| 178 |
+
"summary_keywords": ["smoke", "overheating", "safety"],
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"label": "archive",
|
| 182 |
+
"route_to": "general",
|
| 183 |
+
"priority_weight": 0.8,
|
| 184 |
+
"summary_keywords": ["all-hands", "agenda", "no action required"],
|
| 185 |
+
},
|
| 186 |
+
],
|
| 187 |
+
},
|
| 188 |
+
{
|
| 189 |
+
"scenario_id": "medium_ops_mix_b",
|
| 190 |
+
"emails": [
|
| 191 |
+
{
|
| 192 |
+
"email_id": "med-b-001",
|
| 193 |
+
"subject": "Action required: verify payroll account immediately",
|
| 194 |
+
"body": (
|
| 195 |
+
"Your payroll account appears locked. Verify your credentials on this "
|
| 196 |
+
"new portal link to avoid delayed salary processing."
|
| 197 |
+
),
|
| 198 |
+
"sender": "payroll-security@alerts-payroll.net",
|
| 199 |
+
"timestamp": "2026-04-01T07:00:00Z",
|
| 200 |
+
"thread_history": [],
|
| 201 |
+
},
|
| 202 |
+
{
|
| 203 |
+
"email_id": "med-b-002",
|
| 204 |
+
"subject": "Incident: checkout API timeout in eu-west",
|
| 205 |
+
"body": (
|
| 206 |
+
"Payments API timeout crossed SLO in eu-west for 11 minutes. "
|
| 207 |
+
"Revenue impact probable. On-call escalation required."
|
| 208 |
+
),
|
| 209 |
+
"sender": "sre@acme-enterprise.com",
|
| 210 |
+
"timestamp": "2026-04-01T06:40:00Z",
|
| 211 |
+
"thread_history": ["Auto-remediation attempt failed."],
|
| 212 |
+
},
|
| 213 |
+
{
|
| 214 |
+
"email_id": "med-b-003",
|
| 215 |
+
"subject": "Question about duplicate invoice #4421",
|
| 216 |
+
"body": (
|
| 217 |
+
"Customer says invoice #4421 appears twice in the portal and asks "
|
| 218 |
+
"which one should be paid."
|
| 219 |
+
),
|
| 220 |
+
"sender": "support@acme-enterprise.com",
|
| 221 |
+
"timestamp": "2026-04-01T09:22:00Z",
|
| 222 |
+
"thread_history": ["Ticket tagged as finance inquiry."],
|
| 223 |
+
},
|
| 224 |
+
{
|
| 225 |
+
"email_id": "med-b-004",
|
| 226 |
+
"subject": "Potential safety issue in warehouse charging bay",
|
| 227 |
+
"body": (
|
| 228 |
+
"Night shift observed sparks from charging rack slot C after routine use. "
|
| 229 |
+
"Operations paused affected rack."
|
| 230 |
+
),
|
| 231 |
+
"sender": "warehouse-lead@acme-enterprise.com",
|
| 232 |
+
"timestamp": "2026-04-01T04:50:00Z",
|
| 233 |
+
"thread_history": ["Photos attached in internal ticket."],
|
| 234 |
+
},
|
| 235 |
+
{
|
| 236 |
+
"email_id": "med-b-005",
|
| 237 |
+
"subject": "Reminder: volunteer signup closes Friday",
|
| 238 |
+
"body": "Friendly reminder to sign up for community day volunteer slots.",
|
| 239 |
+
"sender": "people-ops@acme-enterprise.com",
|
| 240 |
+
"timestamp": "2026-04-01T10:12:00Z",
|
| 241 |
+
"thread_history": [],
|
| 242 |
+
},
|
| 243 |
+
],
|
| 244 |
+
"ground_truth": [
|
| 245 |
+
{
|
| 246 |
+
"label": "spam",
|
| 247 |
+
"route_to": "general",
|
| 248 |
+
"priority_weight": 1.0,
|
| 249 |
+
"summary_keywords": ["verify", "portal link", "payroll"],
|
| 250 |
+
},
|
| 251 |
+
{
|
| 252 |
+
"label": "urgent",
|
| 253 |
+
"route_to": "engineering",
|
| 254 |
+
"priority_weight": 1.5,
|
| 255 |
+
"summary_keywords": ["timeout", "slo", "on-call"],
|
| 256 |
+
},
|
| 257 |
+
{
|
| 258 |
+
"label": "normal",
|
| 259 |
+
"route_to": "billing",
|
| 260 |
+
"priority_weight": 1.2,
|
| 261 |
+
"summary_keywords": ["duplicate invoice", "paid", "finance"],
|
| 262 |
+
},
|
| 263 |
+
{
|
| 264 |
+
"label": "urgent",
|
| 265 |
+
"route_to": "safety",
|
| 266 |
+
"priority_weight": 1.6,
|
| 267 |
+
"summary_keywords": ["sparks", "charging", "paused"],
|
| 268 |
+
},
|
| 269 |
+
{
|
| 270 |
+
"label": "archive",
|
| 271 |
+
"route_to": "general",
|
| 272 |
+
"priority_weight": 0.8,
|
| 273 |
+
"summary_keywords": ["reminder", "volunteer", "signup"],
|
| 274 |
+
},
|
| 275 |
+
],
|
| 276 |
+
},
|
| 277 |
+
],
|
| 278 |
+
"private_eval_pool": [],
|
| 279 |
+
},
|
| 280 |
+
"task_hard": {
|
| 281 |
+
"description": "Handle ambiguous complaints that mix safety, legal, and billing risk.",
|
| 282 |
+
"scenario_pool": [
|
| 283 |
+
{
|
| 284 |
+
"scenario_id": "hard_cross_function_a",
|
| 285 |
+
"emails": [
|
| 286 |
+
{
|
| 287 |
+
"email_id": "hard-001",
|
| 288 |
+
"subject": "Formal complaint: unsafe device behavior and disputed charges",
|
| 289 |
+
"body": (
|
| 290 |
+
"I was charged twice for the replacement kit, and during testing the "
|
| 291 |
+
"unit became hot enough to scorch the desk surface. I need billing "
|
| 292 |
+
"correction and urgent safety follow-up today."
|
| 293 |
+
),
|
| 294 |
+
"sender": "legal-customer@enterprise-client.com",
|
| 295 |
+
"timestamp": "2026-03-26T08:33:00Z",
|
| 296 |
+
"thread_history": [
|
| 297 |
+
"Support asked customer to share photos; customer replied with incident details."
|
| 298 |
+
],
|
| 299 |
+
},
|
| 300 |
+
{
|
| 301 |
+
"email_id": "hard-002",
|
| 302 |
+
"subject": "Escalation follow-up: compliance and refund timeline",
|
| 303 |
+
"body": (
|
| 304 |
+
"Following up on the same incident, compliance team requests confirmation "
|
| 305 |
+
"of safety escalation and billing refund timeline before we close the case."
|
| 306 |
+
),
|
| 307 |
+
"sender": "procurement@enterprise-client.com",
|
| 308 |
+
"timestamp": "2026-03-26T09:07:00Z",
|
| 309 |
+
"thread_history": ["Legal requested cross-team response within 4 business hours."],
|
| 310 |
+
},
|
| 311 |
+
],
|
| 312 |
+
"ground_truth": [
|
| 313 |
+
{
|
| 314 |
+
"label": "urgent",
|
| 315 |
+
"route_to": "safety",
|
| 316 |
+
"cc_route": "billing",
|
| 317 |
+
"penalize_spam": 0.2,
|
| 318 |
+
"summary_keywords": ["unsafe", "overheating", "double charge", "refund"],
|
| 319 |
+
},
|
| 320 |
+
{
|
| 321 |
+
"label": "urgent",
|
| 322 |
+
"route_to": "safety",
|
| 323 |
+
"cc_route": "billing",
|
| 324 |
+
"penalize_spam": 0.2,
|
| 325 |
+
"summary_keywords": ["compliance", "safety escalation", "refund timeline"],
|
| 326 |
+
},
|
| 327 |
+
],
|
| 328 |
+
},
|
| 329 |
+
{
|
| 330 |
+
"scenario_id": "hard_cross_function_b",
|
| 331 |
+
"emails": [
|
| 332 |
+
{
|
| 333 |
+
"email_id": "hard-b-001",
|
| 334 |
+
"subject": "Executive escalation: battery smoke + invoice mismatch",
|
| 335 |
+
"body": (
|
| 336 |
+
"A strategic customer reported smoke from a battery dock and also found an "
|
| 337 |
+
"invoice mismatch on the emergency replacement shipment."
|
| 338 |
+
),
|
| 339 |
+
"sender": "exec-office@acme-enterprise.com",
|
| 340 |
+
"timestamp": "2026-04-01T07:40:00Z",
|
| 341 |
+
"thread_history": ["Account owner requested immediate cross-functional response."],
|
| 342 |
+
},
|
| 343 |
+
{
|
| 344 |
+
"email_id": "hard-b-002",
|
| 345 |
+
"subject": "Legal asks for mitigation proof and refund confirmation",
|
| 346 |
+
"body": (
|
| 347 |
+
"Legal team needs written mitigation steps for the safety issue and proof "
|
| 348 |
+
"that refund correction has been processed."
|
| 349 |
+
),
|
| 350 |
+
"sender": "legal@enterprise-client.com",
|
| 351 |
+
"timestamp": "2026-04-01T08:05:00Z",
|
| 352 |
+
"thread_history": ["Deadline: before board review this afternoon."],
|
| 353 |
+
},
|
| 354 |
+
],
|
| 355 |
+
"ground_truth": [
|
| 356 |
+
{
|
| 357 |
+
"label": "urgent",
|
| 358 |
+
"route_to": "safety",
|
| 359 |
+
"cc_route": "billing",
|
| 360 |
+
"penalize_spam": 0.2,
|
| 361 |
+
"summary_keywords": ["smoke", "invoice mismatch", "strategic customer"],
|
| 362 |
+
},
|
| 363 |
+
{
|
| 364 |
+
"label": "urgent",
|
| 365 |
+
"route_to": "safety",
|
| 366 |
+
"cc_route": "billing",
|
| 367 |
+
"penalize_spam": 0.2,
|
| 368 |
+
"summary_keywords": ["mitigation", "refund confirmation", "legal"],
|
| 369 |
+
},
|
| 370 |
+
],
|
| 371 |
+
},
|
| 372 |
+
],
|
| 373 |
+
"private_eval_pool": [],
|
| 374 |
+
},
|
| 375 |
+
"task_production": {
|
| 376 |
+
"description": (
|
| 377 |
+
"Simulate a production inbox with mixed business-critical, safety, billing, "
|
| 378 |
+
"support, and spam threads."
|
| 379 |
+
),
|
| 380 |
+
"scenario_pool": [],
|
| 381 |
+
"private_eval_pool": [],
|
| 382 |
+
},
|
| 383 |
+
}
|
| 384 |
+
|
| 385 |
+
PRODUCTION_SCENARIO_COUNT = 1000
|
| 386 |
+
|
| 387 |
+
PRODUCTION_TEMPLATE_LIBRARY: dict[str, dict[str, object]] = {
|
| 388 |
+
"phishing_link": {
|
| 389 |
+
"label": "spam",
|
| 390 |
+
"route_to": "general",
|
| 391 |
+
"priority_weight": 1.0,
|
| 392 |
+
"summary_keywords": ["phishing", "external link", "credential"],
|
| 393 |
+
"subject_options": [
|
| 394 |
+
"Immediate verification required for shared inbox",
|
| 395 |
+
"Mailbox suspension warning - action needed",
|
| 396 |
+
],
|
| 397 |
+
"body_options": [
|
| 398 |
+
"Security team asks you to verify credentials using this external short link.",
|
| 399 |
+
"Your mailbox will be disabled unless you confirm account details on new portal.",
|
| 400 |
+
],
|
| 401 |
+
"sender_options": ["security-alert@mail-checkup.net", "admin-support@auth-updates.co"],
|
| 402 |
+
},
|
| 403 |
+
"incident_checkout": {
|
| 404 |
+
"label": "urgent",
|
| 405 |
+
"route_to": "engineering",
|
| 406 |
+
"priority_weight": 1.8,
|
| 407 |
+
"summary_keywords": ["checkout", "incident", "on-call"],
|
| 408 |
+
"subject_options": [
|
| 409 |
+
"SEV-1: checkout failures rising",
|
| 410 |
+
"Revenue incident: payment flow degraded",
|
| 411 |
+
],
|
| 412 |
+
"body_options": [
|
| 413 |
+
"Checkout success rate dropped after deployment and on-call escalation is pending.",
|
| 414 |
+
"Payment API latency breach is impacting order completion in production.",
|
| 415 |
+
],
|
| 416 |
+
"sender_options": ["incident-bot@acme-enterprise.com", "sre-lead@acme-enterprise.com"],
|
| 417 |
+
},
|
| 418 |
+
"billing_refund": {
|
| 419 |
+
"label": "normal",
|
| 420 |
+
"route_to": "billing",
|
| 421 |
+
"priority_weight": 1.3,
|
| 422 |
+
"summary_keywords": ["refund", "invoice", "duplicate charge"],
|
| 423 |
+
"subject_options": [
|
| 424 |
+
"Customer reports duplicate charge",
|
| 425 |
+
"Refund timeline confirmation request",
|
| 426 |
+
],
|
| 427 |
+
"body_options": [
|
| 428 |
+
"Customer sees duplicate charge on invoice and asks for correction timeline.",
|
| 429 |
+
"Account manager requests status update for pending reimbursement case.",
|
| 430 |
+
],
|
| 431 |
+
"sender_options": ["care-escalations@acme-enterprise.com", "finance-ops@acme-enterprise.com"],
|
| 432 |
+
},
|
| 433 |
+
"safety_smoke": {
|
| 434 |
+
"label": "urgent",
|
| 435 |
+
"route_to": "safety",
|
| 436 |
+
"priority_weight": 1.9,
|
| 437 |
+
"summary_keywords": ["safety", "smoke", "overheating"],
|
| 438 |
+
"subject_options": [
|
| 439 |
+
"Safety escalation: smoke seen during charging",
|
| 440 |
+
"Urgent product safety complaint",
|
| 441 |
+
],
|
| 442 |
+
"body_options": [
|
| 443 |
+
"Customer reports visible smoke and overheating during normal charging operation.",
|
| 444 |
+
"Field ops flagged possible thermal event requiring immediate safety review.",
|
| 445 |
+
],
|
| 446 |
+
"sender_options": ["support-lead@acme-enterprise.com", "field-ops@acme-enterprise.com"],
|
| 447 |
+
},
|
| 448 |
+
"support_access": {
|
| 449 |
+
"label": "urgent",
|
| 450 |
+
"route_to": "support",
|
| 451 |
+
"priority_weight": 1.4,
|
| 452 |
+
"summary_keywords": ["locked out", "access", "mfa"],
|
| 453 |
+
"subject_options": [
|
| 454 |
+
"Executive locked out after MFA reset",
|
| 455 |
+
"Cannot access admin dashboard before meeting",
|
| 456 |
+
],
|
| 457 |
+
"body_options": [
|
| 458 |
+
"User cannot access dashboard after security reset and requests immediate help.",
|
| 459 |
+
"Critical user lockout blocks live customer session starting shortly.",
|
| 460 |
+
],
|
| 461 |
+
"sender_options": ["sales-lead@acme-enterprise.com", "exec-assistant@acme-enterprise.com"],
|
| 462 |
+
},
|
| 463 |
+
"archive_digest": {
|
| 464 |
+
"label": "archive",
|
| 465 |
+
"route_to": "general",
|
| 466 |
+
"priority_weight": 0.7,
|
| 467 |
+
"summary_keywords": ["newsletter", "digest", "no action"],
|
| 468 |
+
"subject_options": [
|
| 469 |
+
"Monthly partner digest",
|
| 470 |
+
"Internal culture newsletter",
|
| 471 |
+
],
|
| 472 |
+
"body_options": [
|
| 473 |
+
"Sharing monthly digest for awareness only; no action required.",
|
| 474 |
+
"Newsletter update with optional reads and no operational request.",
|
| 475 |
+
],
|
| 476 |
+
"sender_options": ["people-ops@acme-enterprise.com", "updates@partner-network.io"],
|
| 477 |
+
},
|
| 478 |
+
}
|
| 479 |
+
|
| 480 |
+
# Ordered template keys for the "standard" production inbox. The order matters:
# the production scenario builder walks the selected plan sequentially, so it
# fixes both the email order and the per-family thread numbering.
PRODUCTION_EVENT_PLAN: list[str] = [
    "phishing_link",
    "incident_checkout",
    "billing_refund",
    "safety_smoke",
    "support_access",
    "archive_digest",
    "incident_checkout",
    "billing_refund",
    "safety_smoke",
    "support_access",
    "archive_digest",
    "incident_checkout",
    "billing_refund",
    "phishing_link",
    "safety_smoke",
    "support_access",
    "incident_checkout",
    "archive_digest",
]

# Event plan per production profile:
#   light    - the first 12 events of the standard plan
#   standard - the full standard plan
#   heavy    - the standard plan followed by 10 extra events
# Every profile owns its own list object, so mutating one plan can never leak
# into another profile or into PRODUCTION_EVENT_PLAN itself.
PRODUCTION_PROFILE_EVENT_PLANS: dict[str, list[str]] = {
    "light": PRODUCTION_EVENT_PLAN[:12],
    "standard": list(PRODUCTION_EVENT_PLAN),
    "heavy": [
        *PRODUCTION_EVENT_PLAN,
        "incident_checkout",
        "safety_smoke",
        "incident_checkout",
        "billing_refund",
        "support_access",
        "safety_smoke",
        "phishing_link",
        "incident_checkout",
        "billing_refund",
        "archive_digest",
    ],
}
|
| 518 |
+
|
| 519 |
+
|
| 520 |
+
def _normalize_production_profile(profile_value: object) -> str:
    """Map an arbitrary runtime option onto a known production profile name.

    The value is stringified, trimmed and lower-cased. Falsy input and any
    name that is not a key of ``PRODUCTION_PROFILE_EVENT_PLANS`` resolve to
    ``"standard"``.
    """
    fallback = "standard"
    raw_text = str(profile_value) if profile_value else fallback
    candidate = raw_text.strip().lower()
    if candidate in PRODUCTION_PROFILE_EVENT_PLANS:
        return candidate
    return fallback
|
| 524 |
+
|
| 525 |
+
|
| 526 |
+
def _coerce_bool(value: object, default: bool = False) -> bool:
    """Interpret a bool-like runtime option.

    Real booleans pass through unchanged. Strings are trimmed, lower-cased
    and matched against the usual on/off spellings. Anything else, including
    unrecognized strings and non-string values such as ints, yields
    ``default``.
    """
    if isinstance(value, bool):
        return value
    if not isinstance(value, str):
        return default

    recognized = {
        "1": True,
        "true": True,
        "yes": True,
        "on": True,
        "0": False,
        "false": False,
        "no": False,
        "off": False,
    }
    return recognized.get(value.strip().lower(), default)
|
| 537 |
+
|
| 538 |
+
|
| 539 |
+
def _load_private_eval_overrides() -> dict[str, list[dict[str, object]]]:
    """Load private-eval scenarios from OPENENV_PRIVATE_SCENARIOS_JSON.

    Expected shape:
    {
      "task_easy": [ {"scenario_id": "...", "emails": [...], "ground_truth": [...]}, ... ],
      "task_medium": [...],
      "task_hard": [...]
    }

    Returns:
        Mapping of known task ids to their list of scenario dictionaries.
        The result is empty when the variable is unset or blank, is not valid
        JSON, or does not decode to a JSON object. Entries whose key is not a
        task id in ``TASK_LIBRARY`` or whose value is not a list are skipped.
        Inside a pool, items that are not JSON objects are dropped so that
        consumers can rely on every scenario supporting ``.get(...)``.
    """
    raw_payload = os.getenv("OPENENV_PRIVATE_SCENARIOS_JSON", "").strip()
    if not raw_payload:
        return {}

    try:
        parsed_payload = json.loads(raw_payload)
    except json.JSONDecodeError:
        # Best-effort by design: a malformed override behaves like no override.
        return {}

    if not isinstance(parsed_payload, dict):
        return {}

    validated: dict[str, list[dict[str, object]]] = {}
    for task_id, pool in parsed_payload.items():
        if not isinstance(task_id, str) or task_id not in TASK_LIBRARY:
            continue
        if not isinstance(pool, list):
            continue
        # Keep only well-formed scenarios; a stray string or number in the
        # pool would otherwise crash scenario lookup with AttributeError.
        validated[task_id] = [scenario for scenario in pool if isinstance(scenario, dict)]

    return validated
|
| 569 |
+
|
| 570 |
+
|
| 571 |
+
def _build_production_task_definition(
    scenario_index: int,
    split: str,
    runtime_options: dict[str, object] | None = None,
) -> dict[str, object]:
    """Build a deterministic production-style inbox scenario.

    The inbox is produced by walking the event plan of the selected profile
    and, for every event, drawing a subject, body and sender from the matching
    template with a seeded ``random.Random``. Identical arguments therefore
    always yield an identical scenario.

    Args:
        scenario_index: Scenario number. It appears verbatim in the generated
            ids; negative values are clamped to 0 for seeding and for the
            start-time offset.
        split: Split name. ``"private_eval"`` selects a separate seed base;
            every other value uses the public one.
        runtime_options: Optional settings. ``production_profile`` chooses the
            event plan (unknown values fall back to ``"standard"``) and
            ``business_hours_mode`` places timestamps in a 09:00-16:59 UTC
            window instead of spacing them 9 minutes apart.

    Returns:
        Task definition with ``task_id``, ``split``, ``scenario_id``,
        ``description``, ``emails``, ``ground_truth`` and the effective
        ``runtime_options``.
    """
    options = runtime_options or {}
    profile = _normalize_production_profile(options.get("production_profile", "standard"))
    business_hours_mode = _coerce_bool(options.get("business_hours_mode", False), False)

    # NOTE(review): the seed is a plain sum, so distinct (index, profile) pairs
    # can share a seed (e.g. index 101/light and index 0/standard). Left as is
    # so previously generated scenarios stay reproducible.
    seed_base = 910000 if split == "private_eval" else 510000
    profile_seed = {"light": 101, "standard": 202, "heavy": 303}[profile]
    rng = random.Random(seed_base + max(0, scenario_index) + profile_seed)

    base_datetime = datetime(2026, 4, 1, 7, 30, tzinfo=timezone.utc)
    base_datetime += timedelta(minutes=max(0, scenario_index) % 720)
    if business_hours_mode:
        base_datetime = base_datetime.replace(hour=9, minute=0)

    thread_counts: dict[str, int] = {}
    emails: list[dict[str, object]] = []
    ground_truth: list[dict[str, object]] = []
    event_plan = PRODUCTION_PROFILE_EVENT_PLANS[profile]

    for idx, template_key in enumerate(event_plan):
        template = PRODUCTION_TEMPLATE_LIBRARY[template_key]
        # Events of the same family (the key prefix before "_") form a thread.
        thread_family = template_key.split("_")[0]
        thread_counts[thread_family] = thread_counts.get(thread_family, 0) + 1
        thread_number = thread_counts[thread_family]

        # The draw order (subject, body, sender) is part of the deterministic
        # contract: reordering these calls would change every scenario.
        subject = str(rng.choice(cast(list[str], template["subject_options"])))
        body = str(rng.choice(cast(list[str], template["body_options"])))
        sender = str(rng.choice(cast(list[str], template["sender_options"])))

        if thread_number > 1:
            subject = f"RE[{thread_number}]: {subject}"

        if business_hours_mode:
            # Step 17 minutes per event inside an 8-hour window and roll over
            # to the next day once the window is exhausted.
            business_window_minutes = 8 * 60
            minute_offset = (idx * 17) % business_window_minutes
            day_offset = (idx * 17) // business_window_minutes
            event_dt = (base_datetime + timedelta(days=day_offset, minutes=minute_offset)).replace(
                hour=9 + ((minute_offset // 60) % 8),
                minute=minute_offset % 60,
            )
        else:
            event_dt = base_datetime + timedelta(minutes=idx * 9)
        timestamp = event_dt.isoformat().replace("+00:00", "Z")

        thread_history: list[str] = []
        if thread_number > 1:
            thread_history.append(
                f"Previous {thread_family} thread update #{thread_number - 1} in operations inbox."
            )

        emails.append(
            {
                "email_id": f"prod-{scenario_index:04d}-{idx + 1:03d}",
                "subject": subject,
                "body": body,
                "sender": sender,
                "timestamp": timestamp,
                "thread_history": thread_history,
            }
        )

        ground_truth.append(
            {
                "label": str(template["label"]),
                "route_to": str(template["route_to"]),
                "priority_weight": float(cast(float, template["priority_weight"])),
                # Copy the keywords: handing out the template's own list would
                # let a consumer that mutates one ground-truth entry corrupt
                # the shared template for every later scenario.
                "summary_keywords": list(cast(list[str], template["summary_keywords"])),
            }
        )

    return {
        "task_id": "task_production",
        "split": split,
        "scenario_id": f"production-{split}-{profile}-{scenario_index:04d}",
        "description": str(TASK_LIBRARY["task_production"]["description"]),
        "emails": emails,
        "ground_truth": ground_truth,
        "runtime_options": {
            "production_profile": profile,
            "business_hours_mode": business_hours_mode,
        },
    }
|
| 657 |
+
|
| 658 |
+
|
| 659 |
+
def _normalize_split(split: str | None) -> str:
|
| 660 |
+
"""Return normalized split name constrained to known values."""
|
| 661 |
+
return "private_eval" if split == "private_eval" else "public"
|
| 662 |
+
|
| 663 |
+
|
| 664 |
+
def get_task_definition(
    task_id: str,
    scenario_index: int = 0,
    split: str | None = None,
    runtime_options: dict[str, object] | None = None,
) -> dict[str, object]:
    """Resolve one concrete scenario for a task.

    Args:
        task_id: Task identifier; must be a key of ``TASK_LIBRARY``.
        scenario_index: Deterministic index into the selected pool; it wraps
            around the pool length.
        split: ``"private_eval"`` selects the private pool, anything else the
            public one.
        runtime_options: Forwarded to the production scenario builder when
            ``task_id`` is ``"task_production"``; unused for the other tasks.

    Returns:
        Dictionary with ``task_id``, ``split``, ``scenario_id``,
        ``description``, ``emails`` and ``ground_truth``. Production scenarios
        are whatever the production builder returns.

    Raises:
        KeyError: If ``task_id`` is unknown or the selected pool is empty.
    """
    if task_id not in TASK_LIBRARY:
        raise KeyError(f"Unknown task_id: {task_id}")

    resolved_split = _normalize_split(split)
    if task_id == "task_production":
        # Production inboxes are generated on demand, not stored in a pool.
        return _build_production_task_definition(scenario_index, resolved_split, runtime_options)

    record = TASK_LIBRARY[task_id]
    if resolved_split == "private_eval":
        # Environment-provided scenarios win; the static pool is the fallback.
        candidates = _load_private_eval_overrides().get(task_id, []) or cast(
            list[dict[str, object]], record.get("private_eval_pool", [])
        )
        if not candidates:
            raise KeyError(
                f"No private_eval scenarios configured for {task_id}. "
                "Set OPENENV_PRIVATE_SCENARIOS_JSON."
            )
    else:
        candidates = cast(list[dict[str, object]], record.get("scenario_pool", []))
        if not candidates:
            raise KeyError(f"No public scenarios configured for {task_id}")

    position = scenario_index % len(candidates)
    chosen = candidates[position]

    return {
        "task_id": task_id,
        "split": resolved_split,
        "scenario_id": str(chosen.get("scenario_id", f"{task_id}-{position}")),
        "description": str(record.get("description", "")),
        "emails": cast(list[dict[str, object]], chosen.get("emails", [])),
        "ground_truth": cast(list[dict[str, object]], chosen.get("ground_truth", [])),
    }
|
| 722 |
+
|
| 723 |
+
|
| 724 |
+
def get_task_scenario_count(task_id: str, split: str | None = None) -> int:
|
| 725 |
+
"""Return number of scenarios for a task in selected split."""
|
| 726 |
+
if task_id not in TASK_LIBRARY:
|
| 727 |
+
raise KeyError(f"Unknown task_id: {task_id}")
|
| 728 |
+
|
| 729 |
+
if task_id == "task_production":
|
| 730 |
+
return PRODUCTION_SCENARIO_COUNT
|
| 731 |
+
|
| 732 |
+
task_record = TASK_LIBRARY[task_id]
|
| 733 |
+
normalized_split = _normalize_split(split)
|
| 734 |
+
if normalized_split == "private_eval":
|
| 735 |
+
private_overrides = _load_private_eval_overrides()
|
| 736 |
+
pool = private_overrides.get(task_id, [])
|
| 737 |
+
if not pool:
|
| 738 |
+
pool = cast(list[dict[str, object]], task_record.get("private_eval_pool", []))
|
| 739 |
+
return len(pool)
|
| 740 |
+
else:
|
| 741 |
+
pool = cast(list[dict[str, object]], task_record.get("scenario_pool", []))
|
| 742 |
+
|
| 743 |
+
return len(pool)
|
| 744 |
+
|
| 745 |
+
|
| 746 |
+
def list_task_ids() -> list[str]:
    """List every supported task identifier, always in the same order.

    A new list is built on each call, so callers may modify the result.
    """
    ordered_ids = ("task_easy", "task_medium", "task_hard", "task_production")
    return list(ordered_ids)
|
uv.lock
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version = 1
|
| 2 |
+
revision = 1
|
| 3 |
+
requires-python = ">=3.11"
|
| 4 |
+
|
| 5 |
+
[[package]]
|
| 6 |
+
name = "email-triage-env"
|
| 7 |
+
version = "0.1.0"
|
| 8 |
+
source = { editable = "." }
|
validate-submission.sh
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
#
|
| 3 |
+
# validate-submission.sh - OpenEnv Submission Validator
|
| 4 |
+
#
|
| 5 |
+
# Checks that your HF Space is live, Docker image builds, and openenv validate passes.
|
| 6 |
+
#
|
| 7 |
+
# Prerequisites:
|
| 8 |
+
# - Docker: https://docs.docker.com/get-docker/
|
| 9 |
+
# - openenv-core: pip install openenv-core
|
| 10 |
+
# - curl (usually pre-installed)
|
| 11 |
+
#
|
| 12 |
+
# Run:
|
| 13 |
+
# curl -fsSL https://raw.githubusercontent.com/<owner>/<repo>/main/validate-submission.sh | bash -s -- <ping_url> [repo_dir]
|
| 14 |
+
#
|
| 15 |
+
# Or download and run locally:
|
| 16 |
+
# chmod +x validate-submission.sh
|
| 17 |
+
# ./validate-submission.sh <ping_url> [repo_dir]
|
| 18 |
+
#
|
| 19 |
+
# Arguments:
|
| 20 |
+
# ping_url Space URL (e.g. https://your-space.hf.space or https://huggingface.co/spaces/<owner>/<space>)
|
| 21 |
+
# repo_dir Path to your repo (default: current directory)
|
| 22 |
+
#
|
| 23 |
+
# Examples:
|
| 24 |
+
# ./validate-submission.sh https://my-team.hf.space
|
| 25 |
+
# ./validate-submission.sh https://my-team.hf.space ./my-repo
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# Unset variables are errors and a pipeline fails if any stage fails. There is
# deliberately no -e: failing checks are reported by the script itself.
set -uo pipefail

# Upper bound, in seconds, for the Docker image build.
DOCKER_BUILD_TIMEOUT=600

# Colors stay empty unless stdout is a terminal, so piped or redirected output
# contains no escape sequences.
RED='' GREEN='' YELLOW='' BOLD='' NC=''
if [ -t 1 ]; then
  RED='\033[0;31m'
  GREEN='\033[0;32m'
  YELLOW='\033[1;33m'
  BOLD='\033[1m'
  NC='\033[0m'
fi
|
| 40 |
+
|
| 41 |
+
# run_with_timeout SECONDS COMMAND [ARGS...]
# Run COMMAND with a time limit and return its exit status. Uses `timeout` or
# `gtimeout` when installed; otherwise falls back to a background watchdog that
# kills the command once the limit elapses.
run_with_timeout() {
  local limit="$1"
  shift

  local tool
  for tool in timeout gtimeout; do
    if command -v "$tool" &>/dev/null; then
      "$tool" "$limit" "$@"
      return
    fi
  done

  # Fallback: run the command in the background next to a sleeping watchdog.
  "$@" &
  local cmd_pid=$!
  ( sleep "$limit" && kill "$cmd_pid" 2>/dev/null ) &
  local watchdog_pid=$!

  wait "$cmd_pid" 2>/dev/null
  local exit_code=$?

  # The command is done (or was killed); the watchdog is no longer needed.
  kill "$watchdog_pid" 2>/dev/null
  wait "$watchdog_pid" 2>/dev/null
  return $exit_code
}
|
| 59 |
+
|
| 60 |
+
# portable_mktemp [PREFIX]
# Print the path of a new temp file named PREFIX-XXXXXX under $TMPDIR (or
# /tmp). If mktemp rejects the template, fall back to a plain `mktemp`.
portable_mktemp() {
  local stem="${1:-validate}"
  local tmp_root="${TMPDIR:-/tmp}"

  if ! mktemp "${tmp_root}/${stem}-XXXXXX" 2>/dev/null; then
    mktemp
  fi
}
|
| 64 |
+
|
| 65 |
+
# normalize_ping_url URL
# Print URL without a trailing slash. A Hub page URL of the form
# https://huggingface.co/spaces/<owner>/<space> is rewritten to the lower-cased
# https://<owner>-<space>.hf.space host; any other URL is printed unchanged.
normalize_ping_url() {
  local url="${1%/}"
  local hub_pattern='^https?://huggingface\.co/spaces/([^/]+)/([^/]+)$'

  if [[ ! "$url" =~ $hub_pattern ]]; then
    printf "%s" "$url"
    return
  fi

  # tr is used rather than ${var,,} so that older bash versions keep working.
  local owner_lc
  local space_lc
  owner_lc="$(printf '%s' "${BASH_REMATCH[1]}" | tr '[:upper:]' '[:lower:]')"
  space_lc="$(printf '%s' "${BASH_REMATCH[2]}" | tr '[:upper:]' '[:lower:]')"
  printf "https://%s-%s.hf.space" "$owner_lc" "$space_lc"
}
|
| 77 |
+
|
| 78 |
+
# Temp files registered here are removed when the script exits.
CLEANUP_FILES=()

cleanup() {
  # Nothing registered means nothing to delete; checking the length also keeps
  # `set -u` from tripping over an empty array on older bash versions.
  if [ "${#CLEANUP_FILES[@]}" -gt 0 ]; then
    rm -f "${CLEANUP_FILES[@]}"
  fi
}

trap cleanup EXIT
|
| 81 |
+
|
| 82 |
+
# Positional arguments: the Space URL is required; the repo path defaults to
# the current directory.
PING_URL="${1:-}"
REPO_DIR="${2:-.}"

if [[ -z "$PING_URL" ]]; then
  printf "Usage: %s <ping_url> [repo_dir]\n\n" "$0"
  printf " ping_url Space URL (e.g. https://your-space.hf.space or https://huggingface.co/spaces/<owner>/<space>)\n"
  printf " repo_dir Path to your repo (default: current directory)\n"
  exit 1
fi

# Resolve the repo path to an absolute directory and fail early if it is missing.
if ! REPO_DIR="$(cd "$REPO_DIR" 2>/dev/null && pwd)"; then
  printf "Error: directory '%s' not found\n" "${2:-.}"
  exit 1
fi

# Hub page URLs are translated to the Space host before anything is pinged.
PING_URL="$(normalize_ping_url "$PING_URL")"
export PING_URL

# Number of validation steps that have passed so far.
PASS=0
|
| 100 |
+
|
| 101 |
+
log() { printf "[%s] %b\n" "$(date -u +%H:%M:%S)" "$*"; }
|
| 102 |
+
pass() { log "${GREEN}PASSED${NC} -- $1"; PASS=$((PASS + 1)); }
|
| 103 |
+
fail() { log "${RED}FAILED${NC} -- $1"; }
|
| 104 |
+
hint() { printf " ${YELLOW}Hint:${NC} %b\n" "$1"; }
|
| 105 |
+
stop_at() {
|
| 106 |
+
printf "\n"
|
| 107 |
+
printf "${RED}${BOLD}Validation stopped at %s.${NC} Fix the above before continuing.\n" "$1"
|
| 108 |
+
exit 1
|
| 109 |
+
}
|
| 110 |
+
|
| 111 |
+
printf "\n"
|
| 112 |
+
printf "${BOLD}========================================${NC}\n"
|
| 113 |
+
printf "${BOLD} OpenEnv Submission Validator${NC}\n"
|
| 114 |
+
printf "${BOLD}========================================${NC}\n"
|
| 115 |
+
log "Repo: $REPO_DIR"
|
| 116 |
+
log "Ping URL: $PING_URL"
|
| 117 |
+
printf "\n"
|
| 118 |
+
|
| 119 |
+
log "${BOLD}Step 1/3: Pinging HF Space${NC} ($PING_URL/reset) ..."
|
| 120 |
+
|
| 121 |
+
CURL_OUTPUT=$(portable_mktemp "validate-curl")
|
| 122 |
+
CLEANUP_FILES+=("$CURL_OUTPUT")
|
| 123 |
+
HTTP_CODE=$(curl -s -o "$CURL_OUTPUT" -w "%{http_code}" -X POST \
|
| 124 |
+
-H "Content-Type: application/json" -d '{}' \
|
| 125 |
+
"$PING_URL/reset" --max-time 30 2>"$CURL_OUTPUT" || true)
# curl's -w "%{http_code}" already prints "000" when the transfer fails, so the
# previous `|| printf "000"` yielded "000000" and the "not reachable" branch
# below could never match. Only fall back to "000" if curl printed nothing.
HTTP_CODE="${HTTP_CODE:-000}"
|
| 126 |
+
|
| 127 |
+
if [ "$HTTP_CODE" = "200" ]; then
|
| 128 |
+
pass "HF Space is live and responds to /reset"
|
| 129 |
+
elif [ "$HTTP_CODE" = "000" ]; then
|
| 130 |
+
fail "HF Space not reachable (connection failed or timed out)"
|
| 131 |
+
hint "Check your network connection and that the Space is running."
|
| 132 |
+
# hint() prints its argument via printf's %b, which does not treat "%" specially,
# so a single "%" is required for the suggested command to be copy-pasteable.
hint "Try: curl -s -o /dev/null -w '%{http_code}' -X POST $PING_URL/reset"
|
| 133 |
+
stop_at "Step 1"
|
| 134 |
+
else
|
| 135 |
+
fail "HF Space /reset returned HTTP $HTTP_CODE (expected 200)"
|
| 136 |
+
hint "Make sure your Space is running and the URL is correct."
|
| 137 |
+
hint "Try opening $PING_URL in your browser first."
|
| 138 |
+
stop_at "Step 1"
|
| 139 |
+
fi
|
| 140 |
+
|
| 141 |
+
log "${BOLD}Step 2/3: Running docker build${NC} ..."
|
| 142 |
+
|
| 143 |
+
if ! command -v docker &>/dev/null; then
|
| 144 |
+
fail "docker command not found"
|
| 145 |
+
hint "Install Docker: https://docs.docker.com/get-docker/"
|
| 146 |
+
stop_at "Step 2"
|
| 147 |
+
fi
|
| 148 |
+
|
| 149 |
+
if [ -f "$REPO_DIR/Dockerfile" ]; then
|
| 150 |
+
DOCKER_CONTEXT="$REPO_DIR"
|
| 151 |
+
elif [ -f "$REPO_DIR/server/Dockerfile" ]; then
|
| 152 |
+
DOCKER_CONTEXT="$REPO_DIR/server"
|
| 153 |
+
else
|
| 154 |
+
fail "No Dockerfile found in repo root or server/ directory"
|
| 155 |
+
stop_at "Step 2"
|
| 156 |
+
fi
|
| 157 |
+
|
| 158 |
+
log " Found Dockerfile in $DOCKER_CONTEXT"
|
| 159 |
+
|
| 160 |
+
BUILD_OK=false
|
| 161 |
+
BUILD_OUTPUT=$(run_with_timeout "$DOCKER_BUILD_TIMEOUT" docker build "$DOCKER_CONTEXT" 2>&1) && BUILD_OK=true
|
| 162 |
+
|
| 163 |
+
if [ "$BUILD_OK" = true ]; then
|
| 164 |
+
pass "Docker build succeeded"
|
| 165 |
+
else
|
| 166 |
+
fail "Docker build failed (timeout=${DOCKER_BUILD_TIMEOUT}s)"
|
| 167 |
+
printf "%s\n" "$BUILD_OUTPUT" | tail -20
|
| 168 |
+
stop_at "Step 2"
|
| 169 |
+
fi
|
| 170 |
+
|
| 171 |
+
log "${BOLD}Step 3/3: Running openenv validate${NC} ..."
|
| 172 |
+
|
| 173 |
+
if ! command -v openenv &>/dev/null; then
|
| 174 |
+
fail "openenv command not found"
|
| 175 |
+
hint "Install it: pip install openenv-core"
|
| 176 |
+
stop_at "Step 3"
|
| 177 |
+
fi
|
| 178 |
+
|
| 179 |
+
VALIDATE_OK=false
|
| 180 |
+
VALIDATE_OUTPUT=$(cd "$REPO_DIR" && openenv validate 2>&1) && VALIDATE_OK=true
|
| 181 |
+
|
| 182 |
+
if [ "$VALIDATE_OK" = true ]; then
|
| 183 |
+
pass "openenv validate passed"
|
| 184 |
+
[ -n "$VALIDATE_OUTPUT" ] && log " $VALIDATE_OUTPUT"
|
| 185 |
+
else
|
| 186 |
+
fail "openenv validate failed"
|
| 187 |
+
printf "%s\n" "$VALIDATE_OUTPUT"
|
| 188 |
+
stop_at "Step 3"
|
| 189 |
+
fi
|
| 190 |
+
|
| 191 |
+
printf "\n"
|
| 192 |
+
printf "${BOLD}========================================${NC}\n"
|
| 193 |
+
printf "${GREEN}${BOLD} All 3/3 checks passed!${NC}\n"
|
| 194 |
+
printf "${GREEN}${BOLD} Your submission is ready to submit.${NC}\n"
|
| 195 |
+
printf "${BOLD}========================================${NC}\n"
|
| 196 |
+
printf "\n"
|
| 197 |
+
|
| 198 |
+
exit 0
|