Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files- Dockerfile +17 -0
- README.md +201 -10
- __init__.py +2 -0
- client.py +51 -0
- env/__init__.py +1 -0
- env/environment.py +265 -0
- env/models.py +60 -0
- env/tasks.py +277 -0
- environment.py +3 -0
- inference.py +173 -0
- models.py +4 -0
- openenv.yaml +10 -0
- pyproject.toml +23 -0
- requirements.txt +2 -0
- server/__init__.py +2 -0
- server/app.py +24 -0
- tasks.py +4 -0
- uv.lock +15 -0
Dockerfile
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Lean runtime image for the openenv-productivity benchmark.
FROM python:3.11-slim

# Keep the container tidy and logs unbuffered.
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

WORKDIR /app

# Install dependencies first so Docker layer caching survives code changes.
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r /app/requirements.txt

# Copy only what the runtime needs.
COPY env /app/env
COPY inference.py /app/inference.py
COPY openenv.yaml /app/openenv.yaml
COPY README.md /app/README.md

ENV ENABLE_WEB_INTERFACE=true
CMD ["python", "inference.py"]
|
README.md
CHANGED
|
@@ -1,10 +1,201 @@
|
|
| 1 |
-
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
-
sdk: docker
|
| 7 |
-
pinned: false
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: openenv-productivity
|
| 3 |
+
emoji: 🚀
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
+
base_path: /web
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
# OpenEnv Productivity Benchmark
|
| 12 |
+
|
| 13 |
+
`openenv-productivity` is a deterministic reinforcement learning benchmark with three productivity-oriented tasks designed for the OpenEnv RL Challenge. The environment is intentionally small, reproducible, and easy to validate while still rewarding iterative improvement over multiple steps.
|
| 14 |
+
|
| 15 |
+
## Environment Overview
|
| 16 |
+
|
| 17 |
+
The benchmark exposes a single environment class: `ProductivityEnvironment` in `env/environment.py`.
|
| 18 |
+
|
| 19 |
+
Implemented API:
|
| 20 |
+
|
| 21 |
+
- `reset(task_name="easy") -> Observation`
|
| 22 |
+
- `step(action) -> (Observation, Reward, done, info)`
|
| 23 |
+
- `state() -> Observation`
|
| 24 |
+
|
| 25 |
+
The environment is deterministic:
|
| 26 |
+
|
| 27 |
+
- No randomness is used anywhere in task generation or grading.
|
| 28 |
+
- All tasks use fixed payloads and fixed graders.
|
| 29 |
+
- Reward shaping is stable and repeatable for identical action sequences.
|
| 30 |
+
|
| 31 |
+
## Observation Space
|
| 32 |
+
|
| 33 |
+
Observations are validated with Pydantic through the `Observation` model in `env/models.py`.
|
| 34 |
+
|
| 35 |
+
Observation fields:
|
| 36 |
+
|
| 37 |
+
- `benchmark`: benchmark name
|
| 38 |
+
- `task_name`: `easy`, `medium`, or `hard`
|
| 39 |
+
- `instruction`: natural-language task instruction
|
| 40 |
+
- `payload`: task data and target schema
|
| 41 |
+
- `action_format`: supported action patterns
|
| 42 |
+
- `step_count`: current step index
|
| 43 |
+
- `max_steps`: maximum allowed steps
|
| 44 |
+
- `best_score`: best score seen so far in the episode
|
| 45 |
+
- `last_action`: previous action string
|
| 46 |
+
- `last_feedback`: deterministic grader feedback
|
| 47 |
+
- `done`: terminal flag
|
| 48 |
+
|
| 49 |
+
## Action Space
|
| 50 |
+
|
| 51 |
+
Actions are validated with Pydantic through the `Action` model.
|
| 52 |
+
|
| 53 |
+
Supported actions:
|
| 54 |
+
|
| 55 |
+
- `inspect`
|
| 56 |
+
- `propose:{"field":"value"}`
|
| 57 |
+
- `final:{"field":"value"}`
|
| 58 |
+
|
| 59 |
+
Notes:
|
| 60 |
+
|
| 61 |
+
- `inspect` lets an agent spend a step rereading the state, but it still incurs the step penalty.
|
| 62 |
+
- `propose:` is useful for incremental reward collection.
|
| 63 |
+
- `final:` ends the episode immediately, even if the answer is incomplete.
|
| 64 |
+
- Malformed actions and malformed JSON receive deterministic penalties.
|
| 65 |
+
|
| 66 |
+
## Reward Logic
|
| 67 |
+
|
| 68 |
+
Rewards are represented by the Pydantic `Reward` model.
|
| 69 |
+
|
| 70 |
+
Per-step reward includes:
|
| 71 |
+
|
| 72 |
+
- positive delta when a proposal improves over the previous best score
|
| 73 |
+
- partial credit from field-level grading
|
| 74 |
+
- `-0.02` step penalty on every step
|
| 75 |
+
- wrong-answer penalty when a submission regresses or scores zero
|
| 76 |
+
- malformed action penalty for invalid actions or invalid JSON
|
| 77 |
+
- loop penalty when the same action is repeated consecutively
|
| 78 |
+
|
| 79 |
+
This shaping discourages loops, rewards iterative progress, and stays deterministic.
|
| 80 |
+
|
| 81 |
+
## Tasks
|
| 82 |
+
|
| 83 |
+
Exactly three tasks are included.
|
| 84 |
+
|
| 85 |
+
### 1. Easy: Email Classification
|
| 86 |
+
|
| 87 |
+
Goal:
|
| 88 |
+
|
| 89 |
+
- classify an email into `label`
|
| 90 |
+
- assign `priority`
|
| 91 |
+
- determine `needs_reply`
|
| 92 |
+
|
| 93 |
+
Edge cases handled:
|
| 94 |
+
|
| 95 |
+
- sender intent outweighs superficial phrasing
|
| 96 |
+
- reply detection is based on explicit requested action
|
| 97 |
+
- constrained label and priority vocabularies
|
| 98 |
+
|
| 99 |
+
Expected strong baseline score:
|
| 100 |
+
|
| 101 |
+
- `1.00` within 1 to 2 steps
|
| 102 |
+
|
| 103 |
+
### 2. Medium: Calendar Scheduling
|
| 104 |
+
|
| 105 |
+
Goal:
|
| 106 |
+
|
| 107 |
+
- schedule a 60-minute meeting for all required participants
|
| 108 |
+
- avoid lunch and blocked windows
|
| 109 |
+
- select a room with enough capacity
|
| 110 |
+
|
| 111 |
+
Edge cases handled:
|
| 112 |
+
|
| 113 |
+
- partial overlap is not sufficient
|
| 114 |
+
- room capacity must satisfy participant count
|
| 115 |
+
- blocked windows override individual availability
|
| 116 |
+
|
| 117 |
+
Expected strong baseline score:
|
| 118 |
+
|
| 119 |
+
- `1.00` within 1 to 3 steps
|
| 120 |
+
|
| 121 |
+
### 3. Hard: Data Cleaning
|
| 122 |
+
|
| 123 |
+
Goal:
|
| 124 |
+
|
| 125 |
+
- clean a tabular dataset with duplicate IDs and malformed emails
|
| 126 |
+
- keep first duplicate occurrence only
|
| 127 |
+
- compute a normalized total from retained rows
|
| 128 |
+
|
| 129 |
+
Edge cases handled:
|
| 130 |
+
|
| 131 |
+
- whitespace trimming before validation
|
| 132 |
+
- duplicate handling before retention
|
| 133 |
+
- numeric normalization to two decimals
|
| 134 |
+
|
| 135 |
+
Expected strong baseline score:
|
| 136 |
+
|
| 137 |
+
- `1.00` within 2 to 4 steps
|
| 138 |
+
|
| 139 |
+
## Setup
|
| 140 |
+
|
| 141 |
+
### Local
|
| 142 |
+
|
| 143 |
+
```bash
|
| 144 |
+
python -m venv .venv
|
| 145 |
+
. .venv/bin/activate
|
| 146 |
+
pip install -r requirements.txt
|
| 147 |
+
export API_BASE_URL="https://your-openai-compatible-endpoint/v1"
|
| 148 |
+
export MODEL_NAME="your-model"
|
| 149 |
+
export HF_TOKEN="your-token"
|
| 150 |
+
python inference.py --task easy
|
| 151 |
+
```
|
| 152 |
+
|
| 153 |
+
On Windows PowerShell:
|
| 154 |
+
|
| 155 |
+
```powershell
|
| 156 |
+
python -m venv .venv
|
| 157 |
+
.venv\Scripts\Activate.ps1
|
| 158 |
+
pip install -r requirements.txt
|
| 159 |
+
$env:API_BASE_URL="https://your-openai-compatible-endpoint/v1"
|
| 160 |
+
$env:MODEL_NAME="your-model"
|
| 161 |
+
$env:HF_TOKEN="your-token"
|
| 162 |
+
python inference.py --task easy
|
| 163 |
+
```
|
| 164 |
+
|
| 165 |
+
## Inference Output Contract
|
| 166 |
+
|
| 167 |
+
`inference.py` emits only these lines:
|
| 168 |
+
|
| 169 |
+
```text
|
| 170 |
+
[START] task=<task_name> env=<benchmark> model=<model_name>
|
| 171 |
+
[STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
|
| 172 |
+
[END] success=<true|false> steps=<n> rewards=<r1,r2,...,rn>
|
| 173 |
+
```
|
| 174 |
+
|
| 175 |
+
Compliance guarantees:
|
| 176 |
+
|
| 177 |
+
- reward formatted to two decimals
|
| 178 |
+
- lowercase booleans
|
| 179 |
+
- no extra blank lines
|
| 180 |
+
- `[END]` is always printed, including on failure
|
| 181 |
+
- max steps capped at five
|
| 182 |
+
|
| 183 |
+
## Docker
|
| 184 |
+
|
| 185 |
+
Build:
|
| 186 |
+
|
| 187 |
+
```bash
|
| 188 |
+
docker build -t openenv-productivity .
|
| 189 |
+
```
|
| 190 |
+
|
| 191 |
+
Run:
|
| 192 |
+
|
| 193 |
+
```bash
|
| 194 |
+
docker run --rm \
|
| 195 |
+
-e API_BASE_URL="https://your-openai-compatible-endpoint/v1" \
|
| 196 |
+
-e MODEL_NAME="your-model" \
|
| 197 |
+
-e HF_TOKEN="your-token" \
|
| 198 |
+
openenv-productivity
|
| 199 |
+
```
|
| 200 |
+
|
| 201 |
+
The container is intentionally lean and suitable for a 2 CPU / 8 GB RAM runtime.
|
__init__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""OpenEnv productivity environment package root."""
|
| 2 |
+
|
client.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
import re
|
| 6 |
+
from typing import Any
|
| 7 |
+
|
| 8 |
+
from openai import OpenAI
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def _compact(text: str) -> str:
|
| 12 |
+
return re.sub(r"\s+", " ", text).strip()
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class ProductivityClient:
    """Minimal OpenEnv client wrapper for remote action generation."""

    def __init__(self) -> None:
        """Read endpoint configuration from the environment and build the client.

        Raises:
            ValueError: if API_BASE_URL, MODEL_NAME, or HF_TOKEN is unset/empty.
        """
        base_url = os.getenv("API_BASE_URL")
        model = os.getenv("MODEL_NAME")
        hf_token = os.getenv("HF_TOKEN")

        # Fail fast with a distinct message per missing credential.
        if not base_url:
            raise ValueError("missing API_BASE_URL")
        if not model:
            raise ValueError("missing MODEL_NAME")
        if not hf_token:
            raise ValueError("missing HF_TOKEN")

        self.model_name = model
        self.client = OpenAI(base_url=base_url, api_key=hf_token)

    def act(self, observation: Any) -> str:
        """Ask the remote model for exactly one action line for *observation*.

        The observation is serialized canonically (sorted keys, no spaces) so
        identical observations always produce identical prompts.  Returns a
        whitespace-compacted single line, falling back to "inspect" when the
        model returns nothing usable.
        """
        serialized = json.dumps(observation, sort_keys=True, separators=(",", ":"))
        completion = self.client.chat.completions.create(
            model=self.model_name,
            temperature=0,
            messages=[
                {
                    "role": "system",
                    "content": (
                        "Reply with exactly one line and no explanation. "
                        "Allowed formats: inspect, propose:{...}, final:{...}."
                    ),
                },
                {"role": "user", "content": serialized},
            ],
        )
        text = completion.choices[0].message.content or ""
        first_line = text.splitlines()[0].strip() if text else ""
        # A harmless "inspect" keeps the episode moving on an empty reply.
        return _compact(first_line) if first_line else "inspect"
|
env/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Environment package for openenv-productivity."""
|
env/environment.py
ADDED
|
@@ -0,0 +1,265 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from decimal import Decimal, ROUND_HALF_UP
|
| 5 |
+
from typing import Any, Dict, Optional, Tuple
|
| 6 |
+
|
| 7 |
+
from env.models import Action, Observation, Reward, StepInfo
|
| 8 |
+
from env.tasks import get_task, task_names
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class ProductivityEnvironment:
    """Deterministic RL environment over the three productivity tasks.

    Public API:
        - ``reset(task_name)`` starts a new episode and returns the first
          ``Observation``.
        - ``step(action)`` applies one action and returns
          ``(observation, reward, done, info)``.
        - ``state()`` returns the current observation without side effects.

    No randomness is used anywhere, so identical action sequences always
    yield identical rewards and observations.
    """

    benchmark_name = "openenv-productivity"

    def __init__(self, max_steps: int = 5) -> None:
        """Create the environment with the default "easy" task loaded.

        Args:
            max_steps: Global episode-length cap; the effective cap is the
                minimum of this value and the active task's own limit.
        """
        self._default_max_steps = max_steps
        self._task_name = "easy"
        self._task = get_task(self._task_name)
        self._step_count = 0
        self._done = False
        self._best_score = 0.0
        self._last_action: Optional[str] = None
        self._prior_action: Optional[str] = None
        self._last_feedback: Optional[str] = None
        self._last_reward = self._zero_reward("Environment initialized.")

    @staticmethod
    def _zero_reward(explanation: str) -> Reward:
        """Build an all-zero Reward carrying *explanation* (init/reset helper)."""
        return Reward(
            value=0.0,
            score=0.0,
            delta=0.0,
            step_penalty=0.0,
            wrong_answer_penalty=0.0,
            loop_penalty=0.0,
            malformed_penalty=0.0,
            explanation=explanation,
        )

    @property
    def max_steps(self) -> int:
        """Effective step cap: the stricter of the global and task limits."""
        return min(self._default_max_steps, self._task.max_steps)

    def available_tasks(self) -> list[str]:
        """Return the names of all registered tasks."""
        return task_names()

    def reset(self, task_name: str = "easy") -> Observation:
        """Start a fresh episode on *task_name* and return its first observation.

        Propagates whatever ``get_task`` raises for an unknown task name.
        """
        self._task_name = task_name
        self._task = get_task(task_name)
        self._step_count = 0
        self._done = False
        self._best_score = 0.0
        self._last_action = None
        self._prior_action = None
        self._last_feedback = "Environment reset."
        self._last_reward = self._zero_reward("Environment reset.")
        return self.state()

    def state(self) -> Observation:
        """Snapshot the current episode state as a validated Observation."""
        return Observation(
            benchmark=self.benchmark_name,
            task_name=self._task.name,
            instruction=self._task.instruction,
            payload={
                "data": self._task.public_payload(),
                "schema": self._task.public_schema(),
            },
            action_format=[
                "inspect",
                'propose:{"field":"value"}',
                'final:{"field":"value"}',
            ],
            step_count=self._step_count,
            max_steps=self.max_steps,
            best_score=self._best_score,
            last_action=self._last_action,
            last_feedback=self._last_feedback,
            done=self._done,
        )

    def step(self, action: Any) -> Tuple[Observation, Reward, bool, Dict[str, Any]]:
        """Apply one action and return ``(observation, reward, done, info)``.

        Handles, in order: steps after completion, unparseable actions,
        ``inspect``, then graded ``propose``/``final`` submissions.
        """
        if self._done:
            # Stepping a finished episode is flagged and penalized, never regraded.
            info = StepInfo(
                current_score=self._best_score,
                best_score=self._best_score,
                terminated_by="already_done",
                error="step called after completion",
            )
            reward = Reward(
                value=-0.1,
                score=self._best_score,
                delta=0.0,
                step_penalty=-0.1,
                wrong_answer_penalty=0.0,
                loop_penalty=0.0,
                malformed_penalty=0.0,
                explanation="Step rejected because the episode is already done.",
            )
            self._last_reward = reward
            self._last_feedback = reward.explanation
            return self.state(), reward, True, info.model_dump()

        parsed_action, action_error = self._coerce_action(action)
        self._step_count += 1

        if action_error is not None or parsed_action is None:
            reward = self._build_reward(
                current_score=self._best_score,
                previous_best=self._best_score,
                malformed_penalty=-0.25,
                explanation=f"Malformed action: {action_error}",
            )
            self._last_feedback = reward.explanation
            self._maybe_finish()
            info = StepInfo(
                current_score=self._best_score,
                best_score=self._best_score,
                terminated_by="max_steps" if self._done else None,
                error=action_error,
            )
            return self.state(), reward, self._done, info.model_dump()

        if parsed_action.raw == "inspect":
            self._prior_action = self._last_action
            self._last_action = parsed_action.raw
            reward = self._build_reward(
                current_score=self._best_score,
                previous_best=self._best_score,
                explanation="Inspection used. No new answer submitted.",
            )
            self._last_feedback = reward.explanation
            self._maybe_finish()
            info = StepInfo(
                parsed_action={"type": "inspect"},
                current_score=self._best_score,
                best_score=self._best_score,
                terminated_by="max_steps" if self._done else None,
            )
            return self.state(), reward, self._done, info.model_dump()

        action_type, candidate = self._parse_payload_action(parsed_action.raw)
        self._prior_action = self._last_action
        self._last_action = parsed_action.raw

        if candidate is None:
            reward = self._build_reward(
                current_score=self._best_score,
                previous_best=self._best_score,
                malformed_penalty=-0.25,
                explanation="Malformed JSON payload in action.",
            )
            self._last_feedback = reward.explanation
            self._maybe_finish()
            info = StepInfo(
                parsed_action={"type": action_type},
                current_score=self._best_score,
                best_score=self._best_score,
                terminated_by="max_steps" if self._done else None,
                error="invalid_json_payload",
            )
            return self.state(), reward, self._done, info.model_dump()

        current_score, components = self._task.grade_submission(candidate)
        previous_best = self._best_score
        if current_score > self._best_score:
            self._best_score = current_score

        # Penalize regressions harder than flat-zero submissions.
        wrong_answer_penalty = 0.0
        if current_score < previous_best:
            wrong_answer_penalty = -0.15
        elif current_score == 0.0:
            wrong_answer_penalty = -0.05

        explanation = (
            f"Submitted {action_type} with score {current_score:.2f}. "
            f"Components: {json.dumps(components, sort_keys=True)}."
        )
        reward = self._build_reward(
            current_score=current_score,
            previous_best=previous_best,
            wrong_answer_penalty=wrong_answer_penalty,
            explanation=explanation,
        )
        self._last_feedback = explanation

        # Termination: explicit final action, perfect score, or step budget.
        terminated_by = None
        if action_type == "final":
            self._done = True
            terminated_by = "final_action"
        elif self._best_score >= 1.0:
            self._done = True
            terminated_by = "perfect_score"
        else:
            self._maybe_finish()
            if self._done:
                terminated_by = "max_steps"

        info = StepInfo(
            parsed_action={"type": action_type, "candidate": candidate, "components": components},
            current_score=current_score,
            best_score=self._best_score,
            terminated_by=terminated_by,
        )
        return self.state(), reward, self._done, info.model_dump()

    def _coerce_action(self, action: Any) -> Tuple[Optional[Action], Optional[str]]:
        """Best-effort conversion of arbitrary input into a validated Action.

        Returns ``(action, None)`` on success or ``(None, error_message)``
        when pydantic validation rejects the value.
        """
        try:
            if isinstance(action, Action):
                return action, None
            if isinstance(action, dict) and "raw" in action:
                return Action.model_validate(action), None
            return Action(raw=str(action)), None
        except Exception as exc:
            return None, str(exc)

    def _parse_payload_action(self, raw: str) -> Tuple[str, Optional[Dict[str, Any]]]:
        """Split a ``propose:``/``final:`` action into its type and JSON object.

        Returns ``(action_type, None)`` when the payload is not a JSON object.
        BUGFIX: previously any raw string not starting with "propose:" fell
        into the "final:" branch and was sliced at the wrong offset (e.g.
        "inspection", which passes the Action prefix validator). Unrecognized
        prefixes are now reported as ``("invalid", None)`` and graded as
        malformed by the caller.
        """
        if raw.startswith("propose:"):
            action_type = "propose"
            payload_text = raw[len("propose:") :]
        elif raw.startswith("final:"):
            action_type = "final"
            payload_text = raw[len("final:") :]
        else:
            return "invalid", None

        try:
            parsed = json.loads(payload_text)
        except json.JSONDecodeError:
            return action_type, None

        if not isinstance(parsed, dict):
            return action_type, None
        return action_type, parsed

    def _build_reward(
        self,
        current_score: float,
        previous_best: float,
        explanation: str,
        wrong_answer_penalty: float = 0.0,
        malformed_penalty: float = 0.0,
    ) -> Reward:
        """Assemble the shaped Reward for this step and cache it.

        value = improvement delta + step penalty + wrong-answer penalty
        + loop penalty + malformed penalty, rounded to 2 decimals
        (ROUND_HALF_UP) and clamped to [-1, 1].
        """
        delta = max(current_score - previous_best, 0.0)
        step_penalty = -0.02
        # Repeating the immediately preceding action is discouraged.
        loop_penalty = -0.05 if self._last_action is not None and self._last_action == self._prior_action else 0.0
        value = delta + step_penalty + wrong_answer_penalty + loop_penalty + malformed_penalty
        value = float(Decimal(str(value)).quantize(Decimal("0.01"), rounding=ROUND_HALF_UP))
        reward = Reward(
            value=max(-1.0, min(1.0, value)),
            score=current_score,
            delta=float(Decimal(str(delta)).quantize(Decimal("0.01"), rounding=ROUND_HALF_UP)),
            step_penalty=step_penalty,
            wrong_answer_penalty=wrong_answer_penalty,
            loop_penalty=loop_penalty,
            malformed_penalty=malformed_penalty,
            explanation=explanation,
        )
        self._last_reward = reward
        return reward

    def _maybe_finish(self) -> None:
        """Mark the episode done once the step budget is exhausted."""
        if self._step_count >= self.max_steps:
            self._done = True
|
env/models.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import Any, Dict, List, Literal, Optional
|
| 4 |
+
|
| 5 |
+
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class Action(BaseModel):
    """Agent action: one raw string in an allowed format.

    Whitespace is stripped on input (``str_strip_whitespace=True``) and
    unknown fields are rejected (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid", str_strip_whitespace=True)

    # Raw action text, e.g. "inspect" or 'propose:{"label":"work"}'.
    raw: str = Field(..., min_length=1, max_length=4000)

    @field_validator("raw")
    @classmethod
    def validate_action_prefix(cls, value: str) -> str:
        """Reject values that do not start with a recognized action prefix.

        NOTE(review): this is a prefix check only — a string like
        "inspection" passes validation; downstream parsing must handle such
        near-misses.
        """
        allowed_prefixes = ("inspect", "propose:", "final:")
        if not value.startswith(allowed_prefixes):
            raise ValueError(
                "action must start with 'inspect', 'propose:', or 'final:'"
            )
        return value
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class Reward(BaseModel):
    """Per-step reward breakdown; ``value`` is the clamped sum of components.

    All penalty fields are non-positive; ``score`` and the bounds on ``value``
    mirror the environment's shaping (see ProductivityEnvironment._build_reward).
    """

    model_config = ConfigDict(extra="forbid")

    # Total shaped reward for the step, clamped to [-1, 1].
    value: float = Field(..., ge=-1.0, le=1.0)
    # Grader score of the submission evaluated this step.
    score: float = Field(..., ge=0.0, le=1.0)
    # Improvement over the episode's previous best score (never negative in practice).
    delta: float = Field(..., ge=-1.0, le=1.0)
    # Flat cost charged on every step.
    step_penalty: float = Field(..., ge=-1.0, le=0.0)
    # Cost for a regressing or zero-scoring submission.
    wrong_answer_penalty: float = Field(..., ge=-1.0, le=0.0)
    # Cost for repeating the immediately preceding action.
    loop_penalty: float = Field(..., ge=-1.0, le=0.0)
    # Cost for an invalid action string or invalid JSON payload.
    malformed_penalty: float = Field(..., ge=-1.0, le=0.0)
    # Human-readable account of how the reward was composed.
    explanation: str = Field(..., min_length=1, max_length=500)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class Observation(BaseModel):
    """Validated snapshot of the episode state returned by reset/step/state."""

    model_config = ConfigDict(extra="forbid")

    # Benchmark identifier (e.g. "openenv-productivity").
    benchmark: str
    # Active task; the benchmark defines exactly these three difficulties.
    task_name: Literal["easy", "medium", "hard"]
    # Natural-language task instruction.
    instruction: str
    # Task data plus the target answer schema.
    payload: Dict[str, Any]
    # Supported action patterns, e.g. 'propose:{"field":"value"}'.
    action_format: List[str]
    # Steps consumed so far in the episode.
    step_count: int = Field(..., ge=0)
    # Effective episode step cap.
    max_steps: int = Field(..., ge=1)
    # Best grader score achieved so far this episode.
    best_score: float = Field(..., ge=0.0, le=1.0)
    # Previous action string, if any.
    last_action: Optional[str] = None
    # Deterministic grader feedback for the previous action, if any.
    last_feedback: Optional[str] = None
    # Terminal flag.
    done: bool = False
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
class StepInfo(BaseModel):
    """Auxiliary per-step diagnostics returned in step()'s ``info`` dict."""

    model_config = ConfigDict(extra="forbid")

    # Parsed action details ({"type": ..., "candidate": ..., "components": ...}),
    # when the action could be parsed.
    parsed_action: Optional[Dict[str, Any]] = None
    # Grader score of this step's submission.
    current_score: float = Field(..., ge=0.0, le=1.0)
    # Best grader score achieved so far this episode.
    best_score: float = Field(..., ge=0.0, le=1.0)
    # Why the episode ended, when it did (e.g. "final_action", "max_steps").
    terminated_by: Optional[str] = None
    # Error description for rejected or malformed steps.
    error: Optional[str] = None
|
env/tasks.py
ADDED
|
@@ -0,0 +1,277 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import copy
|
| 4 |
+
import json
|
| 5 |
+
from dataclasses import dataclass
|
| 6 |
+
from decimal import Decimal, InvalidOperation, ROUND_HALF_UP
|
| 7 |
+
from typing import Any, Dict, List, Tuple
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def _normalize_text(value: Any) -> str:
|
| 11 |
+
return " ".join(str(value).strip().lower().split())
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def _normalize_bool(value: Any) -> str:
    """Map common truthy/falsy spellings to "yes"/"no"; pass others through.

    Unrecognized values are returned in text-normalized form so the grader's
    exact-match comparison stays deterministic.
    """
    text = " ".join(str(value).strip().lower().split())
    truthy = {"yes", "true", "1", "reply", "needed"}
    falsy = {"no", "false", "0", "none", "not needed"}
    if text in truthy:
        return "yes"
    if text in falsy:
        return "no"
    return text
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def _normalize_date(value: Any) -> str:
    """Normalize a date string: lowercase, collapse whitespace, '/' -> '-'."""
    collapsed = " ".join(str(value).strip().lower().split())
    return collapsed.replace("/", "-")
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def _normalize_time(value: Any) -> str:
    """Normalize a time string; expand a bare "HHMM" digit run to "HH:MM"."""
    text = " ".join(str(value).strip().lower().split())
    # Only exactly four colon-free digits (e.g. "0930") are rewritten;
    # everything else (e.g. "9:30", "930") passes through unchanged.
    if len(text) == 4 and ":" not in text and text.isdigit():
        return f"{text[:2]}:{text[2:]}"
    return text
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def _normalize_list(values: Any) -> List[str]:
    """Return sorted, de-duplicated, text-normalized entries; [] if not a list.

    Blank entries (empty after stripping) are dropped before normalization.
    """
    if not isinstance(values, list):
        return []
    normalized = set()
    for item in values:
        if str(item).strip():
            normalized.add(" ".join(str(item).strip().lower().split()))
    return sorted(normalized)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _normalize_decimal(value: Any) -> str:
|
| 41 |
+
try:
|
| 42 |
+
decimal = Decimal(str(value)).quantize(Decimal("0.01"), rounding=ROUND_HALF_UP)
|
| 43 |
+
except (InvalidOperation, TypeError, ValueError):
|
| 44 |
+
return ""
|
| 45 |
+
return format(decimal, ".2f")
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def _exact_match(candidate: Any, expected: Any, normalizer) -> float:
|
| 49 |
+
return 1.0 if normalizer(candidate) == normalizer(expected) else 0.0
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def _score_list(candidate: Any, expected: List[str]) -> float:
    """Fraction of expected entries present in candidate (order-insensitive).

    An empty expected list scores 1.0 only when the candidate is also empty.
    NOTE(review): extra (unexpected) entries in the candidate are not
    penalized — this is recall-style partial credit, matching the original
    grader's behavior.
    """
    def as_normalized_set(values: Any) -> set:
        # Mirrors _normalize_list: non-lists become empty, blanks dropped,
        # entries lowercased with collapsed whitespace, de-duplicated.
        if not isinstance(values, list):
            return set()
        return {
            " ".join(str(v).strip().lower().split())
            for v in values
            if str(v).strip()
        }

    actual = as_normalized_set(candidate)
    target = as_normalized_set(expected)
    if not target:
        return 1.0 if not actual else 0.0
    return len(actual & target) / len(target)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
@dataclass(frozen=True)
class TaskSpec:
    """Immutable definition of one benchmark task plus its grading rubric."""

    # Registry key; also selects the grading branch in grade_submission.
    name: str
    # Difficulty tier label ("easy" / "medium" / "hard").
    difficulty: str
    # Prompt text describing the task and the required output JSON keys.
    instruction: str
    # Task inputs exposed to the agent (deep-copied by public_payload).
    payload: Dict[str, Any]
    # Key -> type-description map for the expected JSON answer.
    schema: Dict[str, str]
    # Ground-truth answer consulted by grade_submission.
    expected: Dict[str, Any]
    # Step budget advertised for this task.
    max_steps: int

    def grade_submission(self, candidate: Dict[str, Any]) -> Tuple[float, Dict[str, float]]:
        """Score a candidate answer dict against the expected answer.

        Returns (total, components): total is the weight-averaged score
        rounded half-up to two decimals; components maps each graded field
        to its individual score in [0, 1]. A non-dict candidate scores 0.0
        on every expected field.
        """
        if not isinstance(candidate, dict):
            return 0.0, {key: 0.0 for key in self.expected.keys()}

        if self.name == "easy":
            # Email classification: label dominates the grade.
            components = {
                "label": _exact_match(candidate.get("label"), self.expected["label"], _normalize_text),
                "priority": _exact_match(candidate.get("priority"), self.expected["priority"], _normalize_text),
                "needs_reply": _exact_match(
                    candidate.get("needs_reply"), self.expected["needs_reply"], _normalize_bool
                ),
            }
            weights = {"label": 0.6, "priority": 0.2, "needs_reply": 0.2}
        elif self.name == "medium":
            # Meeting scheduling: five equally weighted fields.
            components = {
                "day": _exact_match(candidate.get("day"), self.expected["day"], _normalize_date),
                "start": _exact_match(candidate.get("start"), self.expected["start"], _normalize_time),
                "end": _exact_match(candidate.get("end"), self.expected["end"], _normalize_time),
                "participants": _score_list(candidate.get("participants"), self.expected["participants"]),
                "room": _exact_match(candidate.get("room"), self.expected["room"], _normalize_text),
            }
            weights = {
                "day": 0.2,
                "start": 0.2,
                "end": 0.2,
                "participants": 0.2,
                "room": 0.2,
            }
        else:
            # "hard" (and any other name) uses the data-cleaning rubric.
            components = {
                "valid_rows": _exact_match(candidate.get("valid_rows"), self.expected["valid_rows"], _normalize_text),
                "duplicate_ids": _score_list(candidate.get("duplicate_ids"), self.expected["duplicate_ids"]),
                "invalid_emails": _score_list(candidate.get("invalid_emails"), self.expected["invalid_emails"]),
                "normalized_total": _exact_match(
                    candidate.get("normalized_total"), self.expected["normalized_total"], _normalize_decimal
                ),
                "retained_ids": _score_list(candidate.get("retained_ids"), self.expected["retained_ids"]),
            }
            weights = {
                "valid_rows": 0.2,
                "duplicate_ids": 0.2,
                "invalid_emails": 0.2,
                "normalized_total": 0.2,
                "retained_ids": 0.2,
            }

        # Weighted sum, then round half-up to two decimals so scores are
        # deterministic across float representations.
        score = sum(components[key] * weights[key] for key in weights)
        score = float(Decimal(str(score)).quantize(Decimal("0.01"), rounding=ROUND_HALF_UP))
        return score, components

    def public_payload(self) -> Dict[str, Any]:
        """Deep copy of the payload so callers cannot mutate the spec."""
        return copy.deepcopy(self.payload)

    def public_schema(self) -> Dict[str, str]:
        """Deep copy of the answer schema so callers cannot mutate the spec."""
        return copy.deepcopy(self.schema)
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
# Registry of all benchmark tasks, keyed by difficulty name. Specs are
# frozen dataclasses; the payloads are deterministic fixtures graded
# against the `expected` answers below.
TASKS: Dict[str, TaskSpec] = {
    # easy: single-email classification (label / priority / needs_reply).
    "easy": TaskSpec(
        name="easy",
        difficulty="easy",
        instruction=(
            "Classify the email into a label, priority, and whether it needs a reply. "
            "Output JSON with keys: label, priority, needs_reply. "
            "Allowed labels: work, personal, spam, finance. "
            "Allowed priorities: low, normal, high. "
            "needs_reply must be yes or no."
        ),
        payload={
            "email": {
                "from": "billing@northstarbank.example",
                "subject": "Invoice 88421 available in your portal",
                "body": (
                    "Hello, your March statement is now available. "
                    "No action is needed unless you notice an error."
                ),
            },
            "edge_cases": [
                "Promotional language should not override sender intent.",
                "If no response is requested, needs_reply should be no.",
                "Classification must be based on content, not guesswork.",
            ],
        },
        schema={"label": "string", "priority": "string", "needs_reply": "string"},
        expected={"label": "finance", "priority": "normal", "needs_reply": "no"},
        max_steps=5,
    ),
    # medium: constrained meeting scheduling across availabilities and rooms.
    "medium": TaskSpec(
        name="medium",
        difficulty="medium",
        instruction=(
            "Schedule a 60-minute project sync that includes every required participant. "
            "Avoid blocked windows and lunch hours. "
            "Output JSON with keys: day, start, end, participants, room."
        ),
        payload={
            "duration_minutes": 60,
            "timezone": "Asia/Kolkata",
            "required_participants": ["Alex", "Priya", "Sam"],
            "blocked_windows": [
                {"day": "2026-04-09", "start": "12:00", "end": "13:00", "reason": "lunch"},
                {"day": "2026-04-09", "start": "16:00", "end": "17:00", "reason": "company all-hands"},
            ],
            "availability": {
                "Alex": [
                    {"day": "2026-04-09", "start": "09:00", "end": "11:00"},
                    {"day": "2026-04-09", "start": "14:00", "end": "16:00"},
                ],
                "Priya": [
                    {"day": "2026-04-09", "start": "10:00", "end": "11:00"},
                    {"day": "2026-04-09", "start": "14:00", "end": "15:30"},
                ],
                "Sam": [
                    {"day": "2026-04-09", "start": "09:30", "end": "10:30"},
                    {"day": "2026-04-09", "start": "14:00", "end": "17:00"},
                ],
            },
            "rooms": [
                {"name": "Focus-2", "capacity": 2, "available": [{"day": "2026-04-09", "start": "14:00", "end": "15:00"}]},
                {"name": "Focus-3", "capacity": 3, "available": [{"day": "2026-04-09", "start": "14:00", "end": "15:00"}]},
                {"name": "Board-6", "capacity": 6, "available": [{"day": "2026-04-09", "start": "15:00", "end": "16:00"}]},
            ],
            "edge_cases": [
                "A room with insufficient capacity is invalid even if time matches.",
                "The interval must fit every attendee exactly, not just overlap partially.",
                "Lunch block must be avoided even if users appear available.",
            ],
        },
        schema={
            "day": "YYYY-MM-DD",
            "start": "HH:MM",
            "end": "HH:MM",
            "participants": "list[string]",
            "room": "string",
        },
        # Expected values are pre-normalized (lowercased) forms.
        expected={
            "day": "2026-04-09",
            "start": "14:00",
            "end": "15:00",
            "participants": ["alex", "priya", "sam"],
            "room": "focus-3",
        },
        max_steps=5,
    ),
    # hard: deterministic data cleaning with dedup, email validation and sums.
    "hard": TaskSpec(
        name="hard",
        difficulty="hard",
        instruction=(
            "Clean the dataset deterministically using the provided rules. "
            "Keep the first occurrence of a duplicate id, drop rows with invalid emails, "
            "normalize amount to two decimals, and report summary metrics. "
            "Output JSON with keys: valid_rows, duplicate_ids, invalid_emails, normalized_total, retained_ids."
        ),
        payload={
            "rules": [
                "Trim whitespace from every string field.",
                "Emails must contain one @ and at least one dot after @.",
                "Duplicate ids are counted once per repeated id; keep the first occurrence only.",
                "Rows with invalid emails are removed before summing amounts.",
                "Sum uses the retained rows only and must be rounded to two decimals.",
            ],
            "rows": [
                {"id": "a001", "email": "alice@example.com", "amount": "120"},
                {"id": "b002", "email": "bob@example.com ", "amount": "80.5"},
                {"id": "c003", "email": "bad-email", "amount": "10.00"},
                {"id": "c003", "email": "carol@example.com", "amount": "10.00"},
                {"id": "d004", "email": " dan@example.org", "amount": "200.40"},
                {"id": "e005", "email": "eve@example.org", "amount": "160.50"},
            ],
            "edge_cases": [
                "Whitespace around email fields should be removed before validation.",
                "The second c003 row is discarded because the id is duplicate even though the email is valid.",
                "Amounts may arrive as integers or decimal strings.",
            ],
        },
        schema={
            "valid_rows": "integer",
            "duplicate_ids": "list[string]",
            "invalid_emails": "list[string]",
            "normalized_total": "string or number with two decimals",
            "retained_ids": "list[string]",
        },
        # 120 + 80.5 + 200.40 + 160.50 = 561.40 over the retained rows.
        expected={
            "valid_rows": 4,
            "duplicate_ids": ["c003"],
            "invalid_emails": ["bad-email"],
            "normalized_total": "561.40",
            "retained_ids": ["a001", "b002", "d004", "e005"],
        },
        max_steps=5,
    ),
}
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
def get_task(task_name: str) -> TaskSpec:
    """Look up a TaskSpec by (normalized) name.

    Raises ValueError listing the valid names when the task is unknown.
    """
    key = _normalize_text(task_name)
    spec = TASKS.get(key)
    if spec is None:
        valid = ", ".join(sorted(TASKS.keys()))
        raise ValueError(f"unknown task '{task_name}'. expected one of: {valid}")
    return spec
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
def task_names() -> List[str]:
    """Return the available task names in registry (insertion) order.

    Derived from TASKS instead of a hard-coded list so the names cannot
    drift from the registry when a task is added or renamed.
    """
    return list(TASKS.keys())
|
| 274 |
+
|
| 275 |
+
|
| 276 |
+
def schema_json(task_name: str) -> str:
    """Serialize the named task's public answer schema as deterministic JSON."""
    schema = get_task(task_name).public_schema()
    return json.dumps(schema, sort_keys=True)
|
environment.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from env.environment import ProductivityEnvironment
|
| 2 |
+
|
| 3 |
+
__all__ = ["ProductivityEnvironment"]
|
inference.py
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
import os
|
| 6 |
+
import re
|
| 7 |
+
from typing import Optional, Tuple
|
| 8 |
+
|
| 9 |
+
from openai import OpenAI
|
| 10 |
+
|
| 11 |
+
from env.environment import ProductivityEnvironment
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
MAX_STEPS = 5
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def _compact(text: Optional[str]) -> str:
|
| 18 |
+
if text is None:
|
| 19 |
+
return "null"
|
| 20 |
+
return re.sub(r"\s+", " ", str(text)).strip() or "null"
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def _bool_text(value: bool) -> str:
|
| 24 |
+
return "true" if value else "false"
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def _print_start(task_name: str, env_name: str, model_name: str) -> None:
|
| 28 |
+
print(f"[START] task={task_name} env={env_name} model={model_name}")
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def _print_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    """Emit one [STEP] log line summarizing a single environment step."""
    line = (
        f"[STEP] step={step} action={_compact(action)} reward={reward:.2f} "
        f"done={_bool_text(done)} error={_compact(error)}"
    )
    print(line)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def _print_end(success: bool, steps: int, rewards: list[float]) -> None:
    """Emit the final [END] log line including the full reward trace."""
    formatted = [f"{item:.2f}" for item in rewards]
    print(f"[END] success={_bool_text(success)} steps={steps} rewards={','.join(formatted)}")
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def _build_client() -> Tuple[Optional[OpenAI], Optional[str], Optional[str]]:
|
| 44 |
+
api_base_url = os.getenv("API_BASE_URL")
|
| 45 |
+
model_name = os.getenv("MODEL_NAME")
|
| 46 |
+
token = os.getenv("HF_TOKEN")
|
| 47 |
+
|
| 48 |
+
if not api_base_url:
|
| 49 |
+
return None, model_name, "missing API_BASE_URL"
|
| 50 |
+
if not model_name:
|
| 51 |
+
return None, None, "missing MODEL_NAME"
|
| 52 |
+
if not token:
|
| 53 |
+
return None, model_name, "missing HF_TOKEN"
|
| 54 |
+
|
| 55 |
+
try:
|
| 56 |
+
client = OpenAI(base_url=api_base_url, api_key=token)
|
| 57 |
+
except Exception as exc:
|
| 58 |
+
return None, model_name, f"client_initialization_failed:{_compact(exc)}"
|
| 59 |
+
return client, model_name, None
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def _extract_action(content: str) -> str:
|
| 63 |
+
text = content.strip()
|
| 64 |
+
if text.startswith("```"):
|
| 65 |
+
text = re.sub(r"^```[a-zA-Z0-9_-]*", "", text).strip()
|
| 66 |
+
text = re.sub(r"```$", "", text).strip()
|
| 67 |
+
return text.splitlines()[0].strip() if text else ""
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def _query_model(client: OpenAI, model_name: str, observation_json: str) -> Tuple[Optional[str], Optional[str]]:
    """Ask the model for the next action given the serialized observation.

    Returns (action, error); exactly one of the two is None. Errors are
    encoded as "<kind>:<detail>" tokens suitable for the [STEP] log line.
    """
    system_prompt = (
        "You are solving a deterministic RL benchmark. "
        "Reply with exactly one line and no explanation. "
        "Allowed formats are inspect, propose:{...}, or final:{...}. "
        "Use compact JSON. Prefer final:{...} once confident."
    )
    try:
        response = client.chat.completions.create(
            model=model_name,
            temperature=0,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": observation_json},
            ],
        )
    except Exception as exc:
        return None, f"api_error:{_compact(exc)}"

    try:
        content = response.choices[0].message.content
    except Exception as exc:
        return None, f"malformed_response:{_compact(exc)}"

    if not content or not str(content).strip():
        return None, "empty_response"

    action = _extract_action(str(content))
    return (action, None) if action else (None, "empty_action")
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def main() -> None:
    """Run one benchmark episode: reset the env, loop model->step, log results.

    Emits exactly one [START] line, one [STEP] line per attempted step, and
    one [END] line — including on reset/client/model/step failures.
    """
    parser = argparse.ArgumentParser()
    # CLI flag wins; falls back to TASK_NAME env var, then "easy".
    parser.add_argument("--task", default=os.getenv("TASK_NAME", "easy"))
    args = parser.parse_args()

    env = ProductivityEnvironment(max_steps=MAX_STEPS)
    task_name = args.task
    model_name_for_log = os.getenv("MODEL_NAME") or "unknown"
    rewards: list[float] = []
    success = False
    steps_taken = 0

    _print_start(task_name, env.benchmark_name, model_name_for_log)

    try:
        observation = env.reset(task_name=task_name)
    except Exception as exc:
        # Reset failures still produce a complete (failed) log trace.
        _print_step(1, "inspect", 0.00, True, f"reset_failed:{_compact(exc)}")
        _print_end(False, 1, [0.00])
        return

    client, model_name, init_error = _build_client()
    if init_error is not None or client is None or model_name is None:
        rewards.append(0.00)
        _print_step(1, "inspect", 0.00, True, init_error)
        _print_end(False, 1, rewards)
        return

    done = False
    last_error: Optional[str] = None

    for step_number in range(1, MAX_STEPS + 1):
        steps_taken = step_number
        # Observations are serialized compactly and key-sorted so the
        # prompt is deterministic across runs.
        action, model_error = _query_model(
            client,
            model_name,
            json.dumps(observation.model_dump(), separators=(",", ":"), sort_keys=True),
        )

        if model_error is not None or action is None:
            # Model-side failure ends the episode with a zero-reward step.
            rewards.append(0.00)
            _print_step(step_number, "inspect", 0.00, True, model_error)
            done = True
            last_error = model_error
            break

        try:
            observation, reward, done, info = env.step(action)
            error = info.get("error")
            rewards.append(reward.value)
            _print_step(step_number, action, reward.value, done, error)
            last_error = error
        except Exception as exc:
            # Environment-side failure also ends the episode.
            rewards.append(0.00)
            _print_step(step_number, action, 0.00, True, f"step_failed:{_compact(exc)}")
            done = True
            last_error = str(exc)
            break

        if done:
            break

    # Success requires a finished episode, a perfect best score, and no
    # error on the final step.
    if rewards:
        success = done and env.state().best_score >= 1.0 and last_error in (None, "")
    _print_end(success, max(steps_taken, 1), rewards if rewards else [0.00])
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
if __name__ == "__main__":
|
| 173 |
+
main()
|
models.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from env.models import Action, Observation, Reward, StepInfo
|
| 2 |
+
|
| 3 |
+
__all__ = ["Action", "Observation", "Reward", "StepInfo"]
|
| 4 |
+
|
openenv.yaml
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: openenv-productivity
|
| 2 |
+
version: "1.0.0"
|
| 3 |
+
description: Deterministic productivity benchmark with three tasks for the OpenEnv RL Challenge.
|
| 4 |
+
entrypoint: env.environment:ProductivityEnvironment
|
| 5 |
+
inference: python inference.py
|
| 6 |
+
max_steps: 5
|
| 7 |
+
tasks:
|
| 8 |
+
- easy
|
| 9 |
+
- medium
|
| 10 |
+
- hard
|
pyproject.toml
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["setuptools>=68", "wheel"]
|
| 3 |
+
build-backend = "setuptools.build_meta"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "openenv-productivity"
|
| 7 |
+
version = "1.0.0"
|
| 8 |
+
description = "Deterministic productivity benchmark for the OpenEnv RL Challenge."
|
| 9 |
+
readme = "README.md"
|
| 10 |
+
requires-python = ">=3.10"
|
| 11 |
+
dependencies = [
|
| 12 |
+
"openenv-core>=0.2.0",
|
| 13 |
+
"openai>=1.30.0,<3.0.0",
|
| 14 |
+
"pydantic>=2.7.0,<3.0.0",
|
| 15 |
+
"fastapi>=0.110.0,<1.0.0",
|
| 16 |
+
"uvicorn>=0.30.0,<1.0.0",
|
| 17 |
+
]
|
| 18 |
+
|
| 19 |
+
[project.scripts]
|
| 20 |
+
server = "server.app:main"
|
| 21 |
+
|
| 22 |
+
[tool.setuptools]
|
| 23 |
+
packages = ["env", "server"]
|
requirements.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openai>=1.30.0,<3.0.0
|
| 2 |
+
pydantic>=2.7.0,<3.0.0
|
server/__init__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Server package for OpenEnv deployment entrypoints."""
|
| 2 |
+
|
server/app.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
|
| 5 |
+
from fastapi import FastAPI
|
| 6 |
+
import uvicorn
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
app = FastAPI(title="openenv-productivity", version="1.0.0")
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@app.get("/health")
def health() -> dict[str, str]:
    """Liveness probe endpoint for the hosting platform."""
    return dict(status="ok")
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def main() -> None:
    """Launch the FastAPI app via uvicorn, honoring HOST/PORT overrides."""
    bind_host = os.getenv("HOST", "0.0.0.0")
    bind_port = int(os.getenv("PORT", "7860"))
    uvicorn.run("server.app:app", host=bind_host, port=bind_port, reload=False)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
if __name__ == "__main__":
|
| 24 |
+
main()
|
tasks.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from env.tasks import TASKS, TaskSpec, get_task, schema_json, task_names
|
| 2 |
+
|
| 3 |
+
__all__ = ["TASKS", "TaskSpec", "get_task", "schema_json", "task_names"]
|
| 4 |
+
|
uv.lock
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version = 1
|
| 2 |
+
revision = 1
|
| 3 |
+
requires-python = ">=3.10"
|
| 4 |
+
|
| 5 |
+
[[package]]
|
| 6 |
+
name = "openenv-productivity"
|
| 7 |
+
version = "1.0.0"
|
| 8 |
+
source = { virtual = "." }
|
| 9 |
+
dependencies = [
|
| 10 |
+
{ name = "fastapi", specifier = ">=0.110.0,<1.0.0" },
|
| 11 |
+
{ name = "openai", specifier = ">=1.30.0,<3.0.0" },
|
| 12 |
+
{ name = "openenv-core", specifier = ">=0.2.0" },
|
| 13 |
+
{ name = "pydantic", specifier = ">=2.7.0,<3.0.0" },
|
| 14 |
+
{ name = "uvicorn", specifier = ">=0.30.0,<1.0.0" },
|
| 15 |
+
]
|