Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files- README.md +78 -134
- baseline.py +29 -0
- env/tasks.py +102 -7
- tests/test_environment.py +59 -0
README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
---
|
| 2 |
title: openenv-productivity
|
| 3 |
-
emoji: 🚀
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: green
|
| 6 |
sdk: docker
|
|
@@ -10,133 +10,72 @@ base_path: /web
|
|
| 10 |
|
| 11 |
# OpenEnv Productivity Benchmark
|
| 12 |
|
| 13 |
-
`openenv-productivity` is a deterministic
|
| 14 |
|
| 15 |
-
##
|
| 16 |
|
| 17 |
-
|
|
|
|
|
|
|
| 18 |
|
| 19 |
-
|
| 20 |
|
| 21 |
- `reset(task_name="easy") -> Observation`
|
| 22 |
- `step(action) -> (Observation, Reward, done, info)`
|
| 23 |
- `state() -> Observation`
|
| 24 |
|
| 25 |
-
|
| 26 |
|
| 27 |
-
-
|
| 28 |
-
-
|
| 29 |
-
- Reward
|
| 30 |
|
| 31 |
-
##
|
| 32 |
|
| 33 |
-
|
|
|
|
|
|
|
| 34 |
|
| 35 |
-
|
| 36 |
|
| 37 |
-
|
| 38 |
-
- `task_name`: `easy`, `medium`, or `hard`
|
| 39 |
-
- `instruction`: natural-language task instruction
|
| 40 |
-
- `payload`: task data and target schema
|
| 41 |
-
- `action_format`: supported action patterns
|
| 42 |
-
- `step_count`: current step index
|
| 43 |
-
- `max_steps`: maximum allowed steps
|
| 44 |
-
- `best_score`: best score seen so far in the episode
|
| 45 |
-
- `last_action`: previous action string
|
| 46 |
-
- `last_feedback`: deterministic grader feedback
|
| 47 |
-
- `done`: terminal flag
|
| 48 |
|
| 49 |
-
|
| 50 |
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
-
|
| 54 |
|
| 55 |
-
|
| 56 |
-
- `propose:{"field":"value"}`
|
| 57 |
-
- `final:{"field":"value"}`
|
| 58 |
|
| 59 |
-
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
-
|
| 62 |
-
- `propose:` is useful for incremental reward collection.
|
| 63 |
-
- `final:` ends the episode immediately, even if the answer is incomplete.
|
| 64 |
-
- Malformed actions and malformed JSON receive deterministic penalties.
|
| 65 |
-
|
| 66 |
-
## Reward Logic
|
| 67 |
-
|
| 68 |
-
Rewards are represented by the Pydantic `Reward` model.
|
| 69 |
-
|
| 70 |
-
Per-step reward includes:
|
| 71 |
-
|
| 72 |
-
- positive delta when a proposal improves over the previous best score
|
| 73 |
-
- partial credit from field-level grading
|
| 74 |
-
- `-0.02` step penalty on every step
|
| 75 |
-
- wrong-answer penalty when a submission regresses or scores zero
|
| 76 |
-
- malformed action penalty for invalid actions or invalid JSON
|
| 77 |
-
- loop penalty when the same action is repeated consecutively
|
| 78 |
-
|
| 79 |
-
This shaping discourages loops, rewards iterative progress, and stays deterministic.
|
| 80 |
-
|
| 81 |
-
## Tasks
|
| 82 |
-
|
| 83 |
-
Exactly three tasks are included.
|
| 84 |
-
|
| 85 |
-
### 1. Easy: Email Classification
|
| 86 |
-
|
| 87 |
-
Goal:
|
| 88 |
-
|
| 89 |
-
- classify an email into `label`
|
| 90 |
-
- assign `priority`
|
| 91 |
-
- determine `needs_reply`
|
| 92 |
-
|
| 93 |
-
Edge cases handled:
|
| 94 |
-
|
| 95 |
-
- sender intent outweighs superficial phrasing
|
| 96 |
-
- reply detection is based on explicit requested action
|
| 97 |
-
- constrained label and priority vocabularies
|
| 98 |
-
|
| 99 |
-
Expected strong baseline score:
|
| 100 |
-
|
| 101 |
-
- `1.00` within 1 to 2 steps
|
| 102 |
-
|
| 103 |
-
### 2. Medium: Calendar Scheduling
|
| 104 |
-
|
| 105 |
-
Goal:
|
| 106 |
-
|
| 107 |
-
- schedule a 60-minute meeting for all required participants
|
| 108 |
-
- avoid lunch and blocked windows
|
| 109 |
-
- select a room with enough capacity
|
| 110 |
-
|
| 111 |
-
Edge cases handled:
|
| 112 |
-
|
| 113 |
-
- partial overlap is not sufficient
|
| 114 |
-
- room capacity must satisfy participant count
|
| 115 |
-
- blocked windows override individual availability
|
| 116 |
-
|
| 117 |
-
Expected strong baseline score:
|
| 118 |
-
|
| 119 |
-
- `1.00` within 1 to 3 steps
|
| 120 |
-
|
| 121 |
-
### 3. Hard: Data Cleaning
|
| 122 |
-
|
| 123 |
-
Goal:
|
| 124 |
-
|
| 125 |
-
- clean a tabular dataset with duplicate IDs and malformed emails
|
| 126 |
-
- keep first duplicate occurrence only
|
| 127 |
-
- compute a normalized total from retained rows
|
| 128 |
|
| 129 |
-
|
| 130 |
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
|
|
|
|
|
|
| 134 |
|
| 135 |
-
|
| 136 |
|
| 137 |
-
-
|
|
|
|
|
|
|
|
|
|
| 138 |
|
| 139 |
-
##
|
| 140 |
|
| 141 |
### Local
|
| 142 |
|
|
@@ -144,41 +83,38 @@ Expected strong baseline score:
|
|
| 144 |
python -m venv .venv
|
| 145 |
. .venv/bin/activate
|
| 146 |
pip install -r requirements.txt
|
| 147 |
-
export API_BASE_URL="https://
|
| 148 |
-
export MODEL_NAME="
|
| 149 |
-
export HF_TOKEN="
|
| 150 |
python inference.py --task easy
|
| 151 |
```
|
| 152 |
|
| 153 |
-
|
| 154 |
|
| 155 |
-
```
|
| 156 |
-
python -m venv .venv
|
| 157 |
-
.venv\Scripts\Activate.ps1
|
| 158 |
pip install -r requirements.txt
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
python inference.py --task easy
|
| 163 |
```
|
| 164 |
|
| 165 |
-
##
|
| 166 |
-
|
| 167 |
-
`inference.py` emits only these lines:
|
| 168 |
|
| 169 |
-
```
|
| 170 |
-
|
| 171 |
-
[STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
|
| 172 |
-
[END] success=<true|false> steps=<n> rewards=<r1,r2,...,rn>
|
| 173 |
```
|
| 174 |
|
| 175 |
-
|
| 176 |
|
| 177 |
-
-
|
| 178 |
-
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
|
|
|
|
|
|
|
|
|
| 182 |
|
| 183 |
## Docker
|
| 184 |
|
|
@@ -188,14 +124,22 @@ Build:
|
|
| 188 |
docker build -t openenv-productivity .
|
| 189 |
```
|
| 190 |
|
| 191 |
-
Run:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
|
| 193 |
```bash
|
| 194 |
-
|
| 195 |
-
-e API_BASE_URL="https://your-openai-compatible-endpoint/v1" \
|
| 196 |
-
-e MODEL_NAME="your-model" \
|
| 197 |
-
-e HF_TOKEN="your-token" \
|
| 198 |
-
openenv-productivity
|
| 199 |
```
|
| 200 |
|
| 201 |
-
The
|
|
|
|
| 1 |
---
|
| 2 |
title: openenv-productivity
|
| 3 |
+
emoji: "🚀"
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: green
|
| 6 |
sdk: docker
|
|
|
|
| 10 |
|
| 11 |
# OpenEnv Productivity Benchmark
|
| 12 |
|
| 13 |
+
`openenv-productivity` is a deterministic RL benchmark for real operational assistant workflows. It includes exactly three tasks with increasing difficulty and deterministic 0.00-1.00 grading.
|
| 14 |
|
| 15 |
+
## Why This Is Useful
|
| 16 |
|
| 17 |
+
- Email triage mirrors real inbox operations.
|
| 18 |
+
- Calendar scheduling includes true resource constraints (people, time windows, rooms, lunch blocks).
|
| 19 |
+
- Data cleaning captures production ETL quality checks and audit-style outputs.
|
| 20 |
|
| 21 |
+
## Environment API
|
| 22 |
|
| 23 |
- `reset(task_name="easy") -> Observation`
|
| 24 |
- `step(action) -> (Observation, Reward, done, info)`
|
| 25 |
- `state() -> Observation`
|
| 26 |
|
| 27 |
+
Pydantic models:
|
| 28 |
|
| 29 |
+
- `Action`
|
| 30 |
+
- `Observation`
|
| 31 |
+
- `Reward`
|
| 32 |
|
| 33 |
+
## Task Set (Exactly 3)
|
| 34 |
|
| 35 |
+
1. `easy` - email classification
|
| 36 |
+
2. `medium` - calendar scheduling
|
| 37 |
+
3. `hard` - data cleaning
|
| 38 |
|
| 39 |
+
All graders are deterministic, reproducible, and return bounded scores in `0.00-1.00`.
|
| 40 |
|
| 41 |
+
## Reward Design
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
+
Each step includes:
|
| 44 |
|
| 45 |
+
- incremental improvement reward (`delta` on better submissions)
|
| 46 |
+
- partial credit from component-level grading
|
| 47 |
+
- wrong-answer penalties for regressions / zero-quality answers
|
| 48 |
+
- malformed action penalty for invalid action or invalid JSON
|
| 49 |
+
- anti-loop penalty for repeated actions
|
| 50 |
+
- fixed step penalty to discourage long trajectories
|
| 51 |
|
| 52 |
+
This keeps rewards dense, stable, and useful for policy learning.
|
| 53 |
|
| 54 |
+
## Determinism & Reproducibility
|
|
|
|
|
|
|
| 55 |
|
| 56 |
+
- no randomness in task payloads or grading
|
| 57 |
+
- fixed expected outputs with deterministic normalization
|
| 58 |
+
- reproducible baseline script for all tasks
|
| 59 |
+
- deterministic unit tests included
|
| 60 |
|
| 61 |
+
## Inference Output Contract
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
+
`inference.py` emits only lines in the following three formats:
|
| 64 |
|
| 65 |
+
```text
|
| 66 |
+
[START] task=<task_name> env=<benchmark> model=<model_name>
|
| 67 |
+
[STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
|
| 68 |
+
[END] success=<true|false> steps=<n> rewards=<r1,r2,...,rn>
|
| 69 |
+
```
|
| 70 |
|
| 71 |
+
Guarantees:
|
| 72 |
|
| 73 |
+
- reward formatted to two decimals
|
| 74 |
+
- lowercase booleans
|
| 75 |
+
- max 5 steps
|
| 76 |
+
- `[END]` always printed (including error paths)
|
| 77 |
|
| 78 |
+
## Quickstart
|
| 79 |
|
| 80 |
### Local
|
| 81 |
|
|
|
|
| 83 |
python -m venv .venv
|
| 84 |
. .venv/bin/activate
|
| 85 |
pip install -r requirements.txt
|
| 86 |
+
export API_BASE_URL="https://router.huggingface.co/v1"
|
| 87 |
+
export MODEL_NAME="zai-org/GLM-5.1"
|
| 88 |
+
export HF_TOKEN="hf_xxx"
|
| 89 |
python inference.py --task easy
|
| 90 |
```
|
| 91 |
|
| 92 |
+
Windows cmd:
|
| 93 |
|
| 94 |
+
```cmd
|
|
|
|
|
|
|
| 95 |
pip install -r requirements.txt
|
| 96 |
+
set API_BASE_URL=https://router.huggingface.co/v1
|
| 97 |
+
set MODEL_NAME=zai-org/GLM-5.1
|
| 98 |
+
set HF_TOKEN=hf_xxx
|
| 99 |
python inference.py --task easy
|
| 100 |
```
|
| 101 |
|
| 102 |
+
### Deterministic Baseline
|
|
|
|
|
|
|
| 103 |
|
| 104 |
+
```bash
|
| 105 |
+
python baseline.py
|
|
|
|
|
|
|
| 106 |
```
|
| 107 |
|
| 108 |
+
Expected pattern:
|
| 109 |
|
| 110 |
+
- each task score = `1.0`
|
| 111 |
+
- each task reward = `0.98` (a perfect score delta of `1.00` minus the `0.02` step penalty)
|
| 112 |
+
|
| 113 |
+
### Tests
|
| 114 |
+
|
| 115 |
+
```bash
|
| 116 |
+
python -m unittest discover -s tests -p "test_*.py"
|
| 117 |
+
```
|
| 118 |
|
| 119 |
## Docker
|
| 120 |
|
|
|
|
| 124 |
docker build -t openenv-productivity .
|
| 125 |
```
|
| 126 |
|
| 127 |
+
Run server mode:
|
| 128 |
+
|
| 129 |
+
```bash
|
| 130 |
+
docker run --rm -p 7860:7860 openenv-productivity
|
| 131 |
+
```
|
| 132 |
+
|
| 133 |
+
Health check:
|
| 134 |
+
|
| 135 |
+
```bash
|
| 136 |
+
curl http://localhost:7860/health
|
| 137 |
+
```
|
| 138 |
+
|
| 139 |
+
## OpenEnv Validation
|
| 140 |
|
| 141 |
```bash
|
| 142 |
+
openenv validate
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
```
|
| 144 |
|
| 145 |
+
The project is designed to pass OpenEnv structure checks and deploy cleanly to Hugging Face Spaces.
|
baseline.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
|
| 5 |
+
from env.environment import ProductivityEnvironment
|
| 6 |
+
from env.tasks import get_task, task_names
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def run_baseline() -> None:
    """Submit every task's reference answer once and print a JSON summary.

    For each name yielded by ``task_names()`` the environment is reset, the
    task's ``expected`` mapping is sent as a single ``final:`` action, and the
    resulting done flag, best score and reward value are recorded. The summary
    is printed as indented, key-sorted JSON.
    """
    environment = ProductivityEnvironment(max_steps=5)
    results: dict[str, dict[str, float | int | bool]] = {}

    for name in task_names():
        spec = get_task(name)
        environment.reset(name)
        # Compact, key-sorted JSON keeps the submitted action string stable.
        answer_json = json.dumps(spec.expected, separators=(",", ":"), sort_keys=True)
        _, reward, done, info = environment.step(f"final:{answer_json}")
        results[name] = {
            "done": bool(done),
            "score": float(info["best_score"]),
            "reward": float(reward.value),
            "steps": 1,
        }

    print(json.dumps(results, indent=2, sort_keys=True))


if __name__ == "__main__":
    run_baseline()
|
env/tasks.py
CHANGED
|
@@ -31,12 +31,37 @@ def _normalize_time(value: Any) -> str:
|
|
| 31 |
return text
|
| 32 |
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
def _normalize_list(values: Any) -> List[str]:
    """Return the distinct normalised entries of ``values`` in sorted order.

    Anything that is not a ``list`` yields ``[]``. Items whose ``str()`` form
    is blank are skipped; the remaining items are passed through
    ``_normalize_text`` and de-duplicated.
    """
    if not isinstance(values, list):
        return []
    unique_entries = set()
    for entry in values:
        if str(entry).strip():
            unique_entries.add(_normalize_text(entry))
    return sorted(unique_entries)
|
| 38 |
|
| 39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
def _normalize_decimal(value: Any) -> str:
|
| 41 |
try:
|
| 42 |
decimal = Decimal(str(value)).quantize(Decimal("0.01"), rounding=ROUND_HALF_UP)
|
|
@@ -57,6 +82,19 @@ def _score_list(candidate: Any, expected: List[str]) -> float:
|
|
| 57 |
return len(actual.intersection(target)) / len(target)
|
| 58 |
|
| 59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
@dataclass(frozen=True)
|
| 61 |
class TaskSpec:
|
| 62 |
name: str
|
|
@@ -81,19 +119,70 @@ class TaskSpec:
|
|
| 81 |
}
|
| 82 |
weights = {"label": 0.6, "priority": 0.2, "needs_reply": 0.2}
|
| 83 |
elif self.name == "medium":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
components = {
|
| 85 |
"day": _exact_match(candidate.get("day"), self.expected["day"], _normalize_date),
|
| 86 |
"start": _exact_match(candidate.get("start"), self.expected["start"], _normalize_time),
|
| 87 |
"end": _exact_match(candidate.get("end"), self.expected["end"], _normalize_time),
|
| 88 |
"participants": _score_list(candidate.get("participants"), self.expected["participants"]),
|
| 89 |
"room": _exact_match(candidate.get("room"), self.expected["room"], _normalize_text),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
}
|
| 91 |
weights = {
|
| 92 |
-
"day": 0.
|
| 93 |
-
"start": 0.
|
| 94 |
-
"end": 0.
|
| 95 |
-
"participants": 0.
|
| 96 |
-
"room": 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
}
|
| 98 |
else:
|
| 99 |
components = {
|
|
@@ -104,13 +193,19 @@ class TaskSpec:
|
|
| 104 |
candidate.get("normalized_total"), self.expected["normalized_total"], _normalize_decimal
|
| 105 |
),
|
| 106 |
"retained_ids": _score_list(candidate.get("retained_ids"), self.expected["retained_ids"]),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
}
|
| 108 |
weights = {
|
| 109 |
"valid_rows": 0.2,
|
| 110 |
-
"duplicate_ids": 0.
|
| 111 |
-
"invalid_emails": 0.
|
| 112 |
"normalized_total": 0.2,
|
| 113 |
"retained_ids": 0.2,
|
|
|
|
| 114 |
}
|
| 115 |
|
| 116 |
score = sum(components[key] * weights[key] for key in weights)
|
|
|
|
| 31 |
return text
|
| 32 |
|
| 33 |
|
| 34 |
+
def _minutes_since_midnight(value: Any) -> int:
    """Convert a clock time to minutes after midnight, or ``-1`` if invalid.

    The value is first passed through ``_normalize_time``. The result must
    consist of exactly two ``:``-separated decimal fields with an hour of
    0-23 and a minute of 0-59; anything else returns ``-1``.
    """
    parts = _normalize_time(value).split(":")
    if len(parts) != 2:
        return -1
    hour_text, minute_text = parts
    # str.isdigit() also accepts characters such as superscript digits, which
    # int() rejects with ValueError. str.isdecimal() only accepts characters
    # that int() can parse, so malformed input yields -1 instead of raising.
    if not (hour_text.isdecimal() and minute_text.isdecimal()):
        return -1
    hour = int(hour_text)
    minute = int(minute_text)
    # Decimal-only fields can never be negative, so only upper bounds matter.
    if hour > 23 or minute > 59:
        return -1
    return hour * 60 + minute
|
| 46 |
+
|
| 47 |
+
|
| 48 |
def _normalize_list(values: Any) -> List[str]:
    """Return the distinct normalised entries of ``values`` in sorted order.

    Anything that is not a ``list`` yields ``[]``. Items whose ``str()`` form
    is blank are skipped; the remaining items are passed through
    ``_normalize_text`` and de-duplicated.
    """
    if not isinstance(values, list):
        return []
    unique_entries = set()
    for entry in values:
        if str(entry).strip():
            unique_entries.add(_normalize_text(entry))
    return sorted(unique_entries)
|
| 52 |
|
| 53 |
|
| 54 |
+
def _normalize_list_in_order(values: Any) -> List[str]:
    """Normalise each entry of ``values`` while keeping the original order.

    Anything that is not a ``list`` yields ``[]``. Every item is passed
    through ``_normalize_text``; results that are falsy (for example an empty
    string) are dropped. Duplicates are kept.
    """
    if not isinstance(values, list):
        return []
    normalized_items = (_normalize_text(entry) for entry in values)
    return [text for text in normalized_items if text]
|
| 63 |
+
|
| 64 |
+
|
| 65 |
def _normalize_decimal(value: Any) -> str:
|
| 66 |
try:
|
| 67 |
decimal = Decimal(str(value)).quantize(Decimal("0.01"), rounding=ROUND_HALF_UP)
|
|
|
|
| 82 |
return len(actual.intersection(target)) / len(target)
|
| 83 |
|
| 84 |
|
| 85 |
+
def _in_any_window(day: str, start_minute: int, end_minute: int, windows: List[Dict[str, str]]) -> bool:
|
| 86 |
+
for window in windows:
|
| 87 |
+
if _normalize_date(window.get("day", "")) != day:
|
| 88 |
+
continue
|
| 89 |
+
ws = _minutes_since_midnight(window.get("start", ""))
|
| 90 |
+
we = _minutes_since_midnight(window.get("end", ""))
|
| 91 |
+
if ws < 0 or we < 0 or ws >= we:
|
| 92 |
+
continue
|
| 93 |
+
if start_minute >= ws and end_minute <= we:
|
| 94 |
+
return True
|
| 95 |
+
return False
|
| 96 |
+
|
| 97 |
+
|
| 98 |
@dataclass(frozen=True)
|
| 99 |
class TaskSpec:
|
| 100 |
name: str
|
|
|
|
| 119 |
}
|
| 120 |
weights = {"label": 0.6, "priority": 0.2, "needs_reply": 0.2}
|
| 121 |
elif self.name == "medium":
|
| 122 |
+
day = _normalize_date(candidate.get("day", ""))
|
| 123 |
+
start_minute = _minutes_since_midnight(candidate.get("start", ""))
|
| 124 |
+
end_minute = _minutes_since_midnight(candidate.get("end", ""))
|
| 125 |
+
required_duration = int(self.payload.get("duration_minutes", 60))
|
| 126 |
+
duration_ok = 1.0 if start_minute >= 0 and end_minute - start_minute == required_duration else 0.0
|
| 127 |
+
no_blocked_overlap = 1.0
|
| 128 |
+
if day and start_minute >= 0 and end_minute > start_minute:
|
| 129 |
+
for block in self.payload.get("blocked_windows", []):
|
| 130 |
+
if _normalize_date(block.get("day", "")) != day:
|
| 131 |
+
continue
|
| 132 |
+
bs = _minutes_since_midnight(block.get("start", ""))
|
| 133 |
+
be = _minutes_since_midnight(block.get("end", ""))
|
| 134 |
+
if bs < 0 or be < 0:
|
| 135 |
+
continue
|
| 136 |
+
overlap = max(start_minute, bs) < min(end_minute, be)
|
| 137 |
+
if overlap:
|
| 138 |
+
no_blocked_overlap = 0.0
|
| 139 |
+
break
|
| 140 |
+
else:
|
| 141 |
+
no_blocked_overlap = 0.0
|
| 142 |
+
|
| 143 |
+
room_ok = 0.0
|
| 144 |
+
room_name = _normalize_text(candidate.get("room", ""))
|
| 145 |
+
participants = _normalize_list(candidate.get("participants", []))
|
| 146 |
+
for room in self.payload.get("rooms", []):
|
| 147 |
+
if _normalize_text(room.get("name", "")) != room_name:
|
| 148 |
+
continue
|
| 149 |
+
capacity_ok = int(room.get("capacity", 0)) >= len(participants)
|
| 150 |
+
slot_ok = _in_any_window(day, start_minute, end_minute, room.get("available", []))
|
| 151 |
+
room_ok = 1.0 if capacity_ok and slot_ok else 0.0
|
| 152 |
+
break
|
| 153 |
+
|
| 154 |
+
participant_availability = 1.0
|
| 155 |
+
required_people = _normalize_list(self.payload.get("required_participants", []))
|
| 156 |
+
if not set(required_people).issubset(set(participants)):
|
| 157 |
+
participant_availability = 0.0
|
| 158 |
+
else:
|
| 159 |
+
for person in required_people:
|
| 160 |
+
availability = self.payload.get("availability", {}).get(person.title(), [])
|
| 161 |
+
if not _in_any_window(day, start_minute, end_minute, availability):
|
| 162 |
+
participant_availability = 0.0
|
| 163 |
+
break
|
| 164 |
+
|
| 165 |
components = {
|
| 166 |
"day": _exact_match(candidate.get("day"), self.expected["day"], _normalize_date),
|
| 167 |
"start": _exact_match(candidate.get("start"), self.expected["start"], _normalize_time),
|
| 168 |
"end": _exact_match(candidate.get("end"), self.expected["end"], _normalize_time),
|
| 169 |
"participants": _score_list(candidate.get("participants"), self.expected["participants"]),
|
| 170 |
"room": _exact_match(candidate.get("room"), self.expected["room"], _normalize_text),
|
| 171 |
+
"duration_ok": duration_ok,
|
| 172 |
+
"no_blocked_overlap": no_blocked_overlap,
|
| 173 |
+
"participant_availability": participant_availability,
|
| 174 |
+
"room_valid": room_ok,
|
| 175 |
}
|
| 176 |
weights = {
|
| 177 |
+
"day": 0.1,
|
| 178 |
+
"start": 0.1,
|
| 179 |
+
"end": 0.1,
|
| 180 |
+
"participants": 0.15,
|
| 181 |
+
"room": 0.1,
|
| 182 |
+
"duration_ok": 0.15,
|
| 183 |
+
"no_blocked_overlap": 0.1,
|
| 184 |
+
"participant_availability": 0.1,
|
| 185 |
+
"room_valid": 0.1,
|
| 186 |
}
|
| 187 |
else:
|
| 188 |
components = {
|
|
|
|
| 193 |
candidate.get("normalized_total"), self.expected["normalized_total"], _normalize_decimal
|
| 194 |
),
|
| 195 |
"retained_ids": _score_list(candidate.get("retained_ids"), self.expected["retained_ids"]),
|
| 196 |
+
"retained_ids_order": _exact_match(
|
| 197 |
+
candidate.get("retained_ids"),
|
| 198 |
+
self.expected["retained_ids"],
|
| 199 |
+
lambda x: json.dumps(_normalize_list_in_order(x), separators=(",", ":")),
|
| 200 |
+
),
|
| 201 |
}
|
| 202 |
weights = {
|
| 203 |
"valid_rows": 0.2,
|
| 204 |
+
"duplicate_ids": 0.15,
|
| 205 |
+
"invalid_emails": 0.15,
|
| 206 |
"normalized_total": 0.2,
|
| 207 |
"retained_ids": 0.2,
|
| 208 |
+
"retained_ids_order": 0.1,
|
| 209 |
}
|
| 210 |
|
| 211 |
score = sum(components[key] * weights[key] for key in weights)
|
tests/test_environment.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import unittest
|
| 5 |
+
|
| 6 |
+
from env.environment import ProductivityEnvironment
|
| 7 |
+
from env.tasks import get_task
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class EnvironmentDeterminismTests(unittest.TestCase):
    """Checks of ``ProductivityEnvironment`` scoring, rewards and repeatability."""

    def setUp(self) -> None:
        # A fresh environment per test, capped at five steps.
        self.env = ProductivityEnvironment(max_steps=5)

    @staticmethod
    def _final_action(answer: dict, *, sort_keys: bool = False) -> str:
        """Build a ``final:`` action string carrying ``answer`` as compact JSON."""
        return "final:" + json.dumps(answer, separators=(",", ":"), sort_keys=sort_keys)

    def test_easy_perfect_answer_scores_one(self) -> None:
        """Submitting the easy task's expected answer ends the episode at 1.0."""
        expected_answer = get_task("easy").expected
        self.env.reset("easy")
        _, reward, done, info = self.env.step(self._final_action(expected_answer, sort_keys=True))
        self.assertTrue(done)
        self.assertEqual(info["best_score"], 1.0)
        self.assertAlmostEqual(reward.value, 0.98, places=2)

    def test_medium_invalid_room_capacity_penalized(self) -> None:
        """A medium answer using room ``Focus-2`` scores below a perfect run."""
        self.env.reset("medium")
        proposal = {
            "day": "2026-04-09",
            "start": "14:00",
            "end": "15:00",
            "participants": ["Alex", "Priya", "Sam"],
            "room": "Focus-2",
        }
        _, reward, done, info = self.env.step(self._final_action(proposal))
        self.assertTrue(done)
        self.assertLess(info["best_score"], 1.0)
        self.assertLess(reward.value, 0.98)

    def test_hard_deterministic(self) -> None:
        """The same hard answer yields the same best score across two resets."""
        submission = {
            "valid_rows": 4,
            "duplicate_ids": ["c003"],
            "invalid_emails": ["bad-email"],
            "normalized_total": "561.40",
            "retained_ids": ["a001", "b002", "d004", "e005"],
        }
        self.env.reset("hard")
        _, _, _, first_info = self.env.step(self._final_action(submission))
        self.env.reset("hard")
        _, _, _, second_info = self.env.step(self._final_action(submission))
        self.assertEqual(first_info["best_score"], second_info["best_score"])

    def test_loop_penalty(self) -> None:
        """Repeating the same action gives a reward below -0.02."""
        self.env.reset("easy")
        self.env.step("inspect")
        _, repeated_reward, _, _ = self.env.step("inspect")
        self.assertLess(repeated_reward.value, -0.02)


if __name__ == "__main__":
    unittest.main()
|