Spaces:
Sleeping
Sleeping
Commit ·
1b64cba
1
Parent(s): 0fca933
initial deployment
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitignore +1 -0
- Dockerfile +18 -0
- README.md +231 -1
- __pycache__/normaltest.cpython-313.pyc +0 -0
- app/__init__.py +0 -0
- app/__pycache__/__init__.cpython-313.pyc +0 -0
- app/__pycache__/actions.cpython-313.pyc +0 -0
- app/__pycache__/env.cpython-313.pyc +0 -0
- app/__pycache__/observation.cpython-313.pyc +0 -0
- app/__pycache__/reward.cpython-313.pyc +0 -0
- app/__pycache__/state.cpython-313.pyc +0 -0
- app/__pycache__/transition.cpython-313.pyc +0 -0
- app/actions.py +18 -0
- app/env.py +67 -0
- app/observation.py +11 -0
- app/reward.py +29 -0
- app/state.py +55 -0
- app/transition.py +77 -0
- app/utils.py +0 -0
- app_server.py +22 -0
- baseline/__init__.py +0 -0
- baseline/__pycache__/__init__.cpython-313.pyc +0 -0
- baseline/__pycache__/policy.cpython-313.pyc +0 -0
- baseline/__pycache__/run_baseline.cpython-313.pyc +0 -0
- baseline/policy.py +30 -0
- baseline/run_baseline.py +54 -0
- graders/__init__.py +0 -0
- graders/__pycache__/__init__.cpython-313.pyc +0 -0
- graders/__pycache__/base.cpython-313.pyc +0 -0
- graders/__pycache__/easy_grader.cpython-313.pyc +0 -0
- graders/__pycache__/hard_grader.cpython-313.pyc +0 -0
- graders/__pycache__/medium_grader.cpython-313.pyc +0 -0
- graders/base.py +3 -0
- graders/easy_grader.py +17 -0
- graders/hard_grader.py +26 -0
- graders/medium_grader.py +22 -0
- inference.py +114 -0
- normaltest.py +21 -0
- openenv.yaml +53 -0
- requirements.txt +8 -0
- scripts/__init__.py +0 -0
- scripts/__pycache__/__init__.cpython-313.pyc +0 -0
- scripts/__pycache__/validate_env.cpython-313.pyc +0 -0
- scripts/run_all_tasks.py +0 -0
- scripts/validate_env.py +35 -0
- tasks/__init__.py +0 -0
- tasks/__pycache__/__init__.cpython-313.pyc +0 -0
- tasks/__pycache__/easy.cpython-313.pyc +0 -0
- tasks/__pycache__/hard.cpython-313.pyc +0 -0
- tasks/__pycache__/medium.cpython-313.pyc +0 -0
.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
.env
|
Dockerfile
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim

WORKDIR /app

# Copy the full project into the image.
COPY . .

# Install pinned dependencies plus pyyaml in a single layer to keep the
# image small; --no-cache-dir (previously missing for pyyaml) avoids
# shipping pip's download cache in the layer.
RUN pip install --no-cache-dir -r requirements.txt pyyaml

# Make project-root imports (app/, tasks/, graders/, ...) resolvable.
ENV PYTHONPATH=/app

# Document the port the server binds; HF Spaces expects 7860.
EXPOSE 7860

# Default command: serve the FastAPI app.
CMD ["uvicorn", "app_server:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
|
@@ -7,5 +7,235 @@ sdk: docker
|
|
| 7 |
pinned: false
|
| 8 |
license: mit
|
| 9 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 7 |
pinned: false
|
| 8 |
license: mit
|
| 9 |
---
|
| 10 |
+
# 🧠 OpenEnv Workflow Agent — Decision-Making Under Uncertainty
|
| 11 |
+
|
| 12 |
+
## 🚀 Overview
|
| 13 |
+
|
| 14 |
+
We present a **real-world OpenEnv environment** that simulates workflow management tasks such as email triage, scheduling, and task handling under **partial observability**.
|
| 15 |
+
|
| 16 |
+
Unlike typical environments, this benchmark focuses on a critical but underexplored capability:
|
| 17 |
+
|
| 18 |
+
> 🔥 **Cost-aware information gathering in sequential decision-making**
|
| 19 |
+
|
| 20 |
+
Agents must decide:
|
| 21 |
+
- When to act immediately
|
| 22 |
+
- When to request additional information
|
| 23 |
+
- Whether the cost of uncertainty reduction is justified
|
| 24 |
+
|
| 25 |
+
---
|
| 26 |
+
|
| 27 |
+
## 🎯 Why This Matters
|
| 28 |
+
|
| 29 |
+
Modern AI agents (LLMs, assistants, copilots) operate in **uncertain environments**:
|
| 30 |
+
- Emails are ambiguous
|
| 31 |
+
- User intent is hidden
|
| 32 |
+
- Context is incomplete
|
| 33 |
+
|
| 34 |
+
Our environment models this realistically by enforcing:
|
| 35 |
+
|
| 36 |
+
- ❗ Incorrect actions under uncertainty → penalized
|
| 37 |
+
- ❗ Information gathering → beneficial but costly
|
| 38 |
+
- ❗ Multi-step reasoning required for optimal decisions
|
| 39 |
+
|
| 40 |
+
---
|
| 41 |
+
|
| 42 |
+
## 🧠 Core Idea
|
| 43 |
+
|
| 44 |
+
We introduce a **POMDP-style workflow environment** where:
|
| 45 |
+
|
| 46 |
+
- The true state is partially hidden
|
| 47 |
+
- Agents must **actively reduce uncertainty**
|
| 48 |
+
- Information acquisition has a **non-zero cost**
|
| 49 |
+
|
| 50 |
+
### Key Property:
|
| 51 |
+
|
| 52 |
+
> An optimal agent follows:
|
| 53 |
+
>
|
| 54 |
+
> **“Request information only when expected benefit exceeds cost.”**
|
| 55 |
+
|
| 56 |
+
---
|
| 57 |
+
|
| 58 |
+
## ⚙️ Environment Design
|
| 59 |
+
|
| 60 |
+
### 🔹 State
|
| 61 |
+
|
| 62 |
+
- Emails (observed)
|
| 63 |
+
- Tasks & calendar (observed)
|
| 64 |
+
- Hidden attributes:
|
| 65 |
+
- true intent
|
| 66 |
+
- urgency
|
| 67 |
+
- missing information
|
| 68 |
+
|
| 69 |
+
---
|
| 70 |
+
|
| 71 |
+
### 🔹 Actions
|
| 72 |
+
|
| 73 |
+
- `classify`
|
| 74 |
+
- `reply`
|
| 75 |
+
- `schedule`
|
| 76 |
+
- `request_info`
|
| 77 |
+
- `archive`
|
| 78 |
+
- `prioritize`
|
| 79 |
+
|
| 80 |
+
---
|
| 81 |
+
|
| 82 |
+
### 🔹 Reward Function
|
| 83 |
+
|
| 84 |
+
\[
|
| 85 |
+
r_t = r_{correct} + r_{progress} - r_{cost} - r_{penalty}
|
| 86 |
+
\]
|
| 87 |
+
|
| 88 |
+
- Correct action → +0.3
|
| 89 |
+
- Task progress → +0.2
|
| 90 |
+
- Step penalty → −0.01
|
| 91 |
+
- Information request cost → −0.05
|
| 92 |
+
- Incorrect action → −0.2
|
| 93 |
+
|
| 94 |
+
---
|
| 95 |
+
|
| 96 |
+
## 🧪 Tasks
|
| 97 |
+
|
| 98 |
+
### 🟢 Easy
|
| 99 |
+
- Clear intent
|
| 100 |
+
- Single-step decision
|
| 101 |
+
|
| 102 |
+
### 🟡 Medium
|
| 103 |
+
- Multi-step workflow
|
| 104 |
+
- Requires sequencing
|
| 105 |
+
|
| 106 |
+
### 🔴 Hard
|
| 107 |
+
- Ambiguous input
|
| 108 |
+
- Requires **information gathering before acting**
|
| 109 |
+
|
| 110 |
+
---
|
| 111 |
+
|
| 112 |
+
## 📊 Baseline Results
|
| 113 |
+
|
| 114 |
+
```
|
| 115 |
+
|
| 116 |
+
easy: 1.00
|
| 117 |
+
medium: 0.50
|
| 118 |
+
hard: 0.13
|
| 119 |
+
|
| 120 |
+
```
|
| 121 |
+
|
| 122 |
+
### 🔍 Interpretation
|
| 123 |
+
|
| 124 |
+
- Baseline performs well on simple tasks
|
| 125 |
+
- Fails on ambiguous scenarios
|
| 126 |
+
- Demonstrates need for **information-aware policies**
|
| 127 |
+
|
| 128 |
+
---
|
| 129 |
+
|
| 130 |
+
## 🔥 Key Insight
|
| 131 |
+
|
| 132 |
+
Standard agents fail because they **act too early under uncertainty**.
|
| 133 |
+
|
| 134 |
+
Agents that act immediately under uncertainty fail.
|
| 135 |
+
Agents that strategically gather information succeed.
|
| 136 |
+
|
| 137 |
+
This environment makes that tradeoff explicit and measurable.
|
| 138 |
+
|
| 139 |
+
Our environment exposes this failure mode clearly.
|
| 140 |
+
|
| 141 |
+
---
|
| 142 |
+
|
| 143 |
+
## 🧩 Novel Contribution
|
| 144 |
+
|
| 145 |
+
We introduce:
|
| 146 |
+
|
| 147 |
+
### ✅ Cost-sensitive information gathering
|
| 148 |
+
- Asking questions is beneficial but not free
|
| 149 |
+
|
| 150 |
+
### ✅ Enforced uncertainty
|
| 151 |
+
- Actions without information are penalized
|
| 152 |
+
|
| 153 |
+
### ✅ Sequential dependency
|
| 154 |
+
- Early decisions affect future rewards
|
| 155 |
+
|
| 156 |
+
---
|
| 157 |
+
|
| 158 |
+
## 🧪 Validation
|
| 159 |
+
|
| 160 |
+
We verify:
|
| 161 |
+
|
| 162 |
+
- ✔ Classification fails under missing information
|
| 163 |
+
- ✔ Requesting info enables correct decisions
|
| 164 |
+
- ✔ Tradeoff emerges between cost and accuracy
|
| 165 |
+
|
| 166 |
+
---
|
| 167 |
+
|
| 168 |
+
## 📦 Project Structure
|
| 169 |
+
|
| 170 |
+
```
|
| 171 |
+
|
| 172 |
+
app/
|
| 173 |
+
tasks/
|
| 174 |
+
graders/
|
| 175 |
+
baseline/
|
| 176 |
+
scripts/
|
| 177 |
+
openenv.yaml
|
| 178 |
+
Dockerfile
|
| 179 |
+
inference.py
|
| 180 |
+
|
| 181 |
+
```
|
| 182 |
+
|
| 183 |
+
---
|
| 184 |
+
|
| 185 |
+
## ▶️ Run Locally
|
| 186 |
+
|
| 187 |
+
You can pull the pre-built Docker image directly from Docker Hub and run it:
|
| 188 |
+
|
| 189 |
+
```bash
|
| 190 |
+
docker pull imsachin010/openenv-workflow-agent:latest
|
| 191 |
+
docker run -d -p 7860:7860 --name openenv-agent imsachin010/openenv-workflow-agent:latest
|
| 192 |
+
```
|
| 193 |
+
|
| 194 |
+
Test endpoint:
|
| 195 |
+
|
| 196 |
+
```bash
|
| 197 |
+
curl -X POST http://localhost:7860/reset
|
| 198 |
+
```
|
| 199 |
+
|
| 200 |
+
---
|
| 201 |
+
|
| 202 |
+
## 🤖 Inference
|
| 203 |
+
|
| 204 |
+
Run the inference script inside the environment:
|
| 205 |
+
|
| 206 |
+
```bash
|
| 207 |
+
python -m inference
|
| 208 |
+
```
|
| 209 |
+
|
| 210 |
+
Outputs:
|
| 211 |
+
|
| 212 |
+
```
|
| 213 |
+
[START]
|
| 214 |
+
[STEP]
|
| 215 |
+
[END]
|
| 216 |
+
```
|
| 217 |
+
|
| 218 |
+
---
|
| 219 |
+
|
| 220 |
+
## 🧠 Conclusion
|
| 221 |
+
|
| 222 |
+
This environment highlights a key gap in current agents:
|
| 223 |
+
|
| 224 |
+
> ❗ They do not reason about **when to gather information**
|
| 225 |
+
|
| 226 |
+
We provide a benchmark to evaluate and improve:
|
| 227 |
+
|
| 228 |
+
* decision-making under uncertainty
|
| 229 |
+
* information-seeking behavior
|
| 230 |
+
* sequential reasoning
|
| 231 |
+
|
| 232 |
+
---
|
| 233 |
+
|
| 234 |
+
## 🏁 Submission Notes
|
| 235 |
+
|
| 236 |
+
* ✔ Fully OpenEnv compliant
|
| 237 |
+
* ✔ Deterministic graders
|
| 238 |
+
* ✔ Reproducible via Docker
|
| 239 |
+
* ✔ HF Space endpoint available
|
| 240 |
+
|
| 241 |
|
|
|
__pycache__/normaltest.cpython-313.pyc
ADDED
|
Binary file (431 Bytes). View file
|
|
|
app/__init__.py
ADDED
|
File without changes
|
app/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (176 Bytes). View file
|
|
|
app/__pycache__/actions.cpython-313.pyc
ADDED
|
Binary file (764 Bytes). View file
|
|
|
app/__pycache__/env.cpython-313.pyc
ADDED
|
Binary file (3.16 kB). View file
|
|
|
app/__pycache__/observation.cpython-313.pyc
ADDED
|
Binary file (806 Bytes). View file
|
|
|
app/__pycache__/reward.cpython-313.pyc
ADDED
|
Binary file (1.13 kB). View file
|
|
|
app/__pycache__/state.cpython-313.pyc
ADDED
|
Binary file (2.51 kB). View file
|
|
|
app/__pycache__/transition.cpython-313.pyc
ADDED
|
Binary file (2.16 kB). View file
|
|
|
app/actions.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel
|
| 2 |
+
from typing import Optional, Literal, Dict
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
# Closed set of actions an agent may take against an email or task.
ActionType = Literal[
    "classify",
    "reply",
    "schedule",
    "prioritize",
    "request_info",
    "archive"
]


class Action(BaseModel):
    """A single agent action directed at one email or task."""

    type: ActionType
    target_id: str  # email/task id
    payload: Optional[Dict] = None  # flexible for different actions
|
app/env.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Tuple, Dict, Any
|
| 2 |
+
from copy import deepcopy
|
| 3 |
+
|
| 4 |
+
from .state import EnvironmentState
|
| 5 |
+
from .observation import Observation
|
| 6 |
+
from .actions import Action
|
| 7 |
+
from .transition import apply_action
|
| 8 |
+
from .reward import compute_reward
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class WorkflowEnv:
    """Episodic wrapper around an ``EnvironmentState``.

    Keeps a pristine deep copy of the initial state so :meth:`reset`
    restores the exact starting conditions of the task.
    """

    def __init__(self, initial_state: EnvironmentState, max_steps: int = 10):
        """Create the environment.

        Args:
            initial_state: Starting state (deep-copied; caller's object
                is never mutated).
            max_steps: Episode length limit; was previously hard-coded
                to 10, so the default preserves existing behavior.
        """
        self.initial_state = deepcopy(initial_state)
        self._state = deepcopy(initial_state)
        self.max_steps = max_steps

    # -----------------------------
    # RESET
    # -----------------------------
    def reset(self) -> Observation:
        """Restore the initial state and return the first observation."""
        self._state = deepcopy(self.initial_state)
        return self._get_observation()

    # -----------------------------
    # STEP
    # -----------------------------
    def step(self, action: Action) -> Tuple[Observation, float, bool, Dict[str, Any]]:
        """Apply *action* and return ``(observation, reward, done, info)``.

        Raises:
            RuntimeError: if called after the episode has finished
                (more specific than the bare ``Exception`` raised before;
                still caught by any ``except Exception`` caller).
        """
        if self._state.done:
            raise RuntimeError("Episode already finished. Call reset().")

        # Log the action first so graders can replay the full trajectory.
        self._state.history.append({
            "timestep": self._state.timestep,
            "action": action.model_dump()
        })

        # Apply the hidden-state-aware transition.
        self._state, info = apply_action(self._state, action)

        # Reward depends on the outcome flags produced by the transition.
        reward = compute_reward(self._state, action.type, info)

        # Increment timestep
        self._state.timestep += 1

        # Terminate once the step budget is exhausted.
        if self._state.timestep >= self.max_steps:
            self._state.done = True

        return self._get_observation(), reward, self._state.done, {}

    # -----------------------------
    # STATE ACCESS
    # -----------------------------
    def state(self) -> EnvironmentState:
        """Expose the full state, including hidden components (for grading)."""
        return self._state

    # -----------------------------
    # OBSERVATION
    # -----------------------------
    def _get_observation(self) -> Observation:
        # Only the observable slice; hidden_email_states stays private.
        return Observation(
            emails=self._state.emails,
            tasks=self._state.tasks,
            calendar=self._state.calendar,
            history=self._state.history,
            timestep=self._state.timestep
        )
|
app/observation.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel
|
| 2 |
+
from typing import List, Dict
|
| 3 |
+
from .state import Email, Task, CalendarEvent
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class Observation(BaseModel):
    """The agent-visible slice of the environment state.

    Excludes ``EnvironmentState.hidden_email_states`` (true intent,
    urgency, deadlines), enforcing partial observability.
    """

    emails: List[Email]
    tasks: List[Task]
    calendar: List[CalendarEvent]
    history: List[Dict]  # past actions as logged by WorkflowEnv.step
    timestep: int  # current environment timestep
|
app/reward.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from app.state import EnvironmentState
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def compute_reward(state: EnvironmentState, action_type: str, info: dict) -> float:
    """Compute the scalar reward for the action just applied.

    Combines a correctness bonus, an information-request cost, a
    progress bonus, a flat per-step penalty, and a penalty for every
    email whose hidden deadline has passed.
    """
    reward = 0.0

    # --- Correctness ---
    # NOTE(review): README advertises +0.3 for a correct action but the
    # code awards +0.2 — confirm which value is intended.
    if info.get("correct_action"):
        reward += 0.2

    # Cost for asking info (tradeoff)
    # NOTE: because of the elif, an *incorrect* request_info pays only
    # the -0.05 query cost and never the -0.2 wrong-action penalty.
    if action_type == "request_info":
        reward -= 0.05  # cost for querying
    elif info.get("incorrect_action"):
        reward -= 0.2

    # --- Progress ---
    if info.get("task_progress"):
        reward += 0.2

    # --- Step penalty (efficiency)
    reward -= 0.01

    # --- Deadline penalty ---
    # Charged again on every step while an email remains overdue.
    for hidden in state.hidden_email_states:
        if hidden.deadline and state.timestep > hidden.deadline:
            reward -= 0.5

    return reward
|
app/state.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel, Field
|
| 2 |
+
from typing import List, Optional, Dict
|
| 3 |
+
from enum import Enum
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class EmailPriority(str, Enum):
    """Urgency levels used in an email's hidden ground-truth state."""
    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"


class Email(BaseModel):
    """The agent-visible portion of an email."""
    id: str
    sender: str
    subject: str
    body: str


class HiddenEmailState(BaseModel):
    """Ground truth about an email that the agent cannot observe."""
    email_id: str
    true_intent: str  # e.g., "meeting_request", "spam", "task"
    urgency: EmailPriority
    requires_response: bool
    deadline: Optional[int]  # timestep deadline
    missing_information: bool  # does agent need to ask clarification?


class Task(BaseModel):
    """A to-do item visible to the agent."""
    id: str
    description: str
    completed: bool = False
    # NOTE(review): Optional but no default, so pydantic treats this as
    # a required field — confirm whether `= None` was intended.
    deadline: Optional[int]


class CalendarEvent(BaseModel):
    """A scheduled event on the agent's calendar."""
    id: str
    title: str
    time: int  # presumably the timestep of the event — TODO confirm


class EnvironmentState(BaseModel):
    """Full environment state: observable parts plus hidden ground truth."""
    # Observed components
    emails: List[Email]
    tasks: List[Task]
    calendar: List[CalendarEvent]
    history: List[Dict] = Field(default_factory=list)

    # Hidden components (NOT exposed to agent)
    hidden_email_states: List[HiddenEmailState]

    # Global timestep
    timestep: int = 0

    # Episode termination
    done: bool = False
|
app/transition.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from app.state import EnvironmentState
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def apply_action(state: EnvironmentState, action):
    """Apply *action* to *state* in place and classify its outcome.

    Returns ``(state, info)`` where ``info`` flags whether the action
    was correct, incorrect, and/or made task progress; the reward
    function consumes these flags.

    (A large commented-out duplicate of the classify branch was removed
    as dead code; the live logic is unchanged.)
    """
    info = {
        "correct_action": False,
        "incorrect_action": False,
        "task_progress": False
    }

    # Ground truth for the targeted email, if any.
    hidden = next(
        (h for h in state.hidden_email_states if h.email_id == action.target_id),
        None
    )

    # ----------------------------
    # CLASSIFY
    # ----------------------------
    if action.type == "classify":
        predicted = action.payload.get("label") if action.payload else None

        if not hidden:
            # Unknown target id: cannot be a correct classification.
            info["incorrect_action"] = True

        elif hidden.missing_information:
            # Guessing under uncertainty is always penalized; the agent
            # must request_info first.
            info["incorrect_action"] = True

        else:
            # Classification is allowed once uncertainty is resolved.
            if predicted == hidden.true_intent:
                info["correct_action"] = True
                info["task_progress"] = True
            else:
                info["incorrect_action"] = True

    # ----------------------------
    # ARCHIVE
    # ----------------------------
    elif action.type == "archive":
        # Archiving always counts as progress, even for an unknown id.
        state.emails = [e for e in state.emails if e.id != action.target_id]
        info["task_progress"] = True

    # ----------------------------
    # REQUEST INFO
    # ----------------------------
    elif action.type == "request_info":
        if hidden and hidden.missing_information:
            # Resolves the uncertainty so a later classify can succeed.
            hidden.missing_information = False
            info["correct_action"] = True
        else:
            info["incorrect_action"] = True

    # ----------------------------
    # REPLY
    # ----------------------------
    elif action.type == "reply":
        if hidden and hidden.requires_response:
            hidden.requires_response = False
            info["correct_action"] = True
        else:
            info["incorrect_action"] = True

    # NOTE(review): "schedule" and "prioritize" are valid ActionTypes but
    # fall through here with no state change and neutral flags — confirm
    # that this no-op behavior is intended.
    return state, info
|
app/utils.py
ADDED
|
File without changes
|
app_server.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI
from app.env import WorkflowEnv
from tasks.easy import create_easy_task

app = FastAPI()


@app.post("/reset")
def reset():
    """Build a fresh easy-task environment and reset it.

    The environment is created per-request and discarded; only a status
    flag is returned (matching the original endpoint contract).
    """
    state, _ = create_easy_task()
    env = WorkflowEnv(state)
    env.reset()

    return {"status": "ok"}


@app.get("/")
def root():
    """Health-check endpoint.

    BUG FIX: a second, duplicate ``@app.get("/")`` handler (``home``)
    was registered below; FastAPI matches the first registered route,
    so the duplicate was dead code and has been removed.
    """
    return {"message": "Workflow Env is running"}
|
baseline/__init__.py
ADDED
|
File without changes
|
baseline/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (181 Bytes). View file
|
|
|
baseline/__pycache__/policy.cpython-313.pyc
ADDED
|
Binary file (1.25 kB). View file
|
|
|
baseline/__pycache__/run_baseline.cpython-313.pyc
ADDED
|
Binary file (2.19 kB). View file
|
|
|
baseline/policy.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from app.actions import Action
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class BaselinePolicy:
    """Keyword-heuristic policy that always handles the first inbox email."""

    def act(self, observation):
        """Return an Action for the first email, or None if the inbox is empty."""
        if not observation.emails:
            return None

        first = observation.emails[0]
        haystack = " ".join((first.subject, first.body)).lower()

        # Keyword heuristics decide the classification label.
        if "meet" in haystack:
            label = "meeting_request"
        elif "report" in haystack or "update" in haystack:
            label = "task_request"
        else:
            # No keyword matched: fall back to archiving the email.
            return Action(type="archive", target_id=first.id)

        return Action(
            type="classify",
            target_id=first.id,
            payload={"label": label},
        )
|
baseline/run_baseline.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from tasks.easy import create_easy_task
|
| 2 |
+
from tasks.medium import create_medium_task
|
| 3 |
+
from tasks.hard import create_hard_task
|
| 4 |
+
|
| 5 |
+
from graders.easy_grader import EasyGrader
|
| 6 |
+
from graders.medium_grader import MediumGrader
|
| 7 |
+
from graders.hard_grader import HardGrader
|
| 8 |
+
|
| 9 |
+
from app.env import WorkflowEnv
|
| 10 |
+
from baseline.policy import BaselinePolicy
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def run_task(task_name, create_task_fn, grader_cls):
    """Run the baseline policy on one task and return its graded score."""
    initial_state, ground_truth = create_task_fn()
    env = WorkflowEnv(initial_state)
    policy = BaselinePolicy()

    observation = env.reset()

    finished = False
    step_count = 0

    # Roll out at most 10 steps, stopping early if the policy abstains.
    while not finished and step_count < 10:
        chosen = policy.act(observation)

        if chosen is None:
            break

        observation, _reward, finished, _ = env.step(chosen)
        step_count += 1

    trajectory = env.state().history
    print(f"{task_name} trajectory:", trajectory)

    # Grade the logged action sequence against the task's ground truth.
    return grader_cls().grade(trajectory, ground_truth)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def main():
    """Run the baseline policy on all three tasks and print their scores."""
    task_specs = [
        ("easy", create_easy_task, EasyGrader),
        ("medium", create_medium_task, MediumGrader),
        ("hard", create_hard_task, HardGrader),
    ]

    results = {
        name: run_task(name, factory, grader)
        for name, factory, grader in task_specs
    }

    print("\n===== BASELINE RESULTS =====")
    for task_name, score in results.items():
        print(f"{task_name}: {round(score, 3)}")


if __name__ == "__main__":
    main()
|
graders/__init__.py
ADDED
|
File without changes
|
graders/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (180 Bytes). View file
|
|
|
graders/__pycache__/base.cpython-313.pyc
ADDED
|
Binary file (609 Bytes). View file
|
|
|
graders/__pycache__/easy_grader.cpython-313.pyc
ADDED
|
Binary file (948 Bytes). View file
|
|
|
graders/__pycache__/hard_grader.cpython-313.pyc
ADDED
|
Binary file (1.2 kB). View file
|
|
|
graders/__pycache__/medium_grader.cpython-313.pyc
ADDED
|
Binary file (1.07 kB). View file
|
|
|
graders/base.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
class BaseGrader:
    """Interface for trajectory graders; subclasses implement grade()."""

    def grade(self, trajectory, ground_truth) -> float:
        """Score a logged action trajectory against ground truth (0.0–1.0)."""
        raise NotImplementedError
|
graders/easy_grader.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from graders.base import BaseGrader
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class EasyGrader(BaseGrader):
    """Grades the easy task: did the first classify use the right label?"""

    def grade(self, trajectory, ground_truth) -> float:
        """Return 1.0 iff the first classify action matches the true label."""
        expected = ground_truth["label"]

        for entry in trajectory:
            act = entry["action"]
            if act["type"] != "classify":
                continue
            # Only the first classification counts; later attempts are ignored.
            predicted = act.get("payload", {}).get("label")
            return 1.0 if predicted == expected else 0.0

        # No classification was ever attempted.
        return 0.0
|
graders/hard_grader.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from graders.base import BaseGrader
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class HardGrader(BaseGrader):
    """Grades the hard task against an expected action-type sequence."""

    def grade(self, trajectory, ground_truth) -> float:
        """Fraction of the sequence matched, minus 0.1 per mismatch, clamped to [0, 1]."""
        expected_sequence = ground_truth["sequence"]

        hits = 0
        misses = 0

        # Compare action types position-by-position; extra trajectory
        # steps beyond the expected length are ignored.
        for actual, wanted in zip(trajectory, expected_sequence):
            if actual["action"]["type"] == wanted["type"]:
                hits += 1
            else:
                misses += 1

        raw = hits / len(expected_sequence) - 0.1 * misses
        return max(0.0, min(1.0, raw))
|
graders/medium_grader.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from graders.base import BaseGrader
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class MediumGrader(BaseGrader):
    """Grades the medium task: fraction of expected action types matched."""

    def grade(self, trajectory, ground_truth) -> float:
        """Return the matched fraction of the expected sequence."""
        expected_sequence = ground_truth["sequence"]

        # Position-by-position comparison; extra trajectory steps ignored.
        matched = sum(
            1
            for actual, wanted in zip(trajectory, expected_sequence)
            if actual["action"]["type"] == wanted["type"]
        )

        # Zero matches (including an empty trajectory or empty expected
        # sequence) scores 0.0 without risking a division.
        if matched == 0:
            return 0.0
        return matched / len(expected_sequence)
|
inference.py
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
from openai import OpenAI
|
| 4 |
+
|
| 5 |
+
load_dotenv()
|
| 6 |
+
|
| 7 |
+
from app.env import WorkflowEnv
|
| 8 |
+
from app.actions import Action
|
| 9 |
+
from tasks.hard import create_hard_task
|
| 10 |
+
from graders.hard_grader import HardGrader
|
| 11 |
+
# ---------------- ENV CONFIG ----------------
|
| 12 |
+
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
|
| 13 |
+
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
|
| 14 |
+
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 15 |
+
LOCAL_IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME")
|
| 16 |
+
|
| 17 |
+
client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# ---------------- LOGGING ----------------
|
| 21 |
+
def log_start(task, env, model):
    """Emit the single [START] marker line expected by the harness."""
    message = f"[START] task={task} env={env} model={model}"
    # flush=True so the harness sees the line immediately, even when piped.
    print(message, flush=True)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def log_step(step, action, reward, done, error):
    """Emit one [STEP] marker line for the harness."""
    done_text = str(done).lower()
    error_text = error or "null"
    line = (
        f"[STEP] step={step} action={action} "
        f"reward={reward:.2f} done={done_text} error={error_text}"
    )
    print(line, flush=True)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def log_end(success, steps, score, rewards):
    """Emit the final [END] marker line with the per-step reward trace."""
    reward_trace = ",".join(f"{r:.2f}" for r in rewards)
    summary = (
        f"[END] success={str(success).lower()} steps={steps} "
        f"score={score:.2f} rewards={reward_trace}"
    )
    print(summary, flush=True)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# ---------------- SIMPLE POLICY ----------------
|
| 41 |
+
def get_action(obs):
    """Rule-based policy over the first inbox email.

    Returns None when the inbox is empty. Otherwise: if a request_info
    action already appears in the history, classify the email; if the text
    looks ambiguous, request more info; else archive it.
    """
    if not obs.emails:
        return None

    first = obs.emails[0]
    combined = " ".join([first.subject, first.body]).lower()

    # Have we already asked the sender for details earlier in this episode?
    info_requested = any(
        entry["action"]["type"] == "request_info" for entry in obs.history
    )

    if info_requested:
        # Second pass: details were requested once; commit to a label now
        # instead of asking again.
        return Action(
            type="classify",
            target_id=first.id,
            payload={"label": "meeting_request"},
        )

    if "sometime" in combined or "next week" in combined:
        # Vague scheduling language -> ask for specifics first.
        return Action(type="request_info", target_id=first.id)

    return Action(type="archive", target_id=first.id)
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
# ---------------- MAIN ----------------
|
| 71 |
+
def main():
    """Run the rule-based policy on the hard task, grade it, and log results.

    Emits [START], per-step [STEP], and a final [END] line (the latter even
    on failure, via the finally block).
    """
    state, gt = create_hard_task()
    env = WorkflowEnv(state)
    grader = HardGrader()

    obs = env.reset()

    rewards = []
    steps = 0
    # BUG FIX: initialize defaults so the finally-block call to log_end()
    # cannot raise NameError (masking the real exception) when env.step()
    # or the grader fails before `score`/`success` are assigned.
    success = False
    score = 0.0

    log_start("hard", "workflow-env", MODEL_NAME)

    try:
        done = False

        # Hard cap of 10 steps to guarantee termination.
        while not done and steps < 10:
            action = get_action(obs)
            if action is None:
                break

            obs, reward, done, _ = env.step(action)

            rewards.append(reward)
            steps += 1

            log_step(steps, action.type, reward, done, None)

            # Stop once we commit to a classification: for grading purposes
            # the episode is over even if the env has not flagged done.
            if action.type == "classify":
                break

        trajectory = env.state().history
        score = grader.grade(trajectory, gt)

        # Clamp in case a grader returns a value outside [0, 1].
        score = max(0.0, min(1.0, score))

        success = score > 0.3

    finally:
        log_end(success, steps, score, rewards)
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
# Script entry point.
if __name__ == "__main__":
    main()
|
normaltest.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Manual smoke test: reset the easy-task environment, submit one classify
action, and print the observation and reward at each point."""
from tasks.easy import create_easy_task
from app.env import WorkflowEnv
from app.actions import Action

state, gt = create_easy_task()
env = WorkflowEnv(state)

obs = env.reset()
print("Initial:", obs)

# Try correct classify
# NOTE(review): assumes the easy task contains an email with id "1" whose
# correct label is "meeting_request" — confirm against tasks/easy.py.
action = Action(
    type="classify",
    target_id="1",
    payload={"label": "meeting_request"}
)

obs, reward, done, _ = env.step(action)

print("After step:", obs)
print("Reward:", reward)
|
openenv.yaml
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: workflow-agent-env
|
| 2 |
+
description: >
|
| 3 |
+
A real-world environment simulating email and workflow management under partial observability.
|
| 4 |
+
Agents must classify, respond, and manage tasks with incomplete information.
|
| 5 |
+
|
| 6 |
+
version: "1.0"
|
| 7 |
+
|
| 8 |
+
entry_point: app.env:WorkflowEnv
|
| 9 |
+
|
| 10 |
+
observation_space:
|
| 11 |
+
type: object
|
| 12 |
+
properties:
|
| 13 |
+
emails:
|
| 14 |
+
type: array
|
| 15 |
+
description: List of emails in inbox
|
| 16 |
+
tasks:
|
| 17 |
+
type: array
|
| 18 |
+
calendar:
|
| 19 |
+
type: array
|
| 20 |
+
history:
|
| 21 |
+
type: array
|
| 22 |
+
timestep:
|
| 23 |
+
type: integer
|
| 24 |
+
|
| 25 |
+
action_space:
|
| 26 |
+
type: object
|
| 27 |
+
properties:
|
| 28 |
+
type:
|
| 29 |
+
type: string
|
| 30 |
+
enum:
|
| 31 |
+
- classify
|
| 32 |
+
- reply
|
| 33 |
+
- schedule
|
| 34 |
+
- prioritize
|
| 35 |
+
- request_info
|
| 36 |
+
- archive
|
| 37 |
+
target_id:
|
| 38 |
+
type: string
|
| 39 |
+
payload:
|
| 40 |
+
type: object
|
| 41 |
+
|
| 42 |
+
tasks:
|
| 43 |
+
- name: easy
|
| 44 |
+
generator: tasks.easy:create_easy_task
|
| 45 |
+
grader: graders.easy_grader:EasyGrader
|
| 46 |
+
|
| 47 |
+
- name: medium
|
| 48 |
+
generator: tasks.medium:create_medium_task
|
| 49 |
+
grader: graders.medium_grader:MediumGrader
|
| 50 |
+
|
| 51 |
+
- name: hard
|
| 52 |
+
generator: tasks.hard:create_hard_task
|
| 53 |
+
grader: graders.hard_grader:HardGrader
|
requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pydantic==2.7.1
|
| 2 |
+
typing-extensions
|
| 3 |
+
python-dotenv
|
| 4 |
+
pytest
|
| 5 |
+
pyyaml
|
| 6 |
+
fastapi
|
| 7 |
+
uvicorn
|
| 8 |
+
openai
|
scripts/__init__.py
ADDED
|
File without changes
|
scripts/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (180 Bytes). View file
|
|
|
scripts/__pycache__/validate_env.cpython-313.pyc
ADDED
|
Binary file (1.59 kB). View file
|
|
|
scripts/run_all_tasks.py
ADDED
|
File without changes
|
scripts/validate_env.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import importlib
|
| 2 |
+
import yaml
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def validate_yaml():
    """Validate openenv.yaml: parse it, then import every referenced symbol.

    Checks the `entry_point` and every task's `generator`/`grader` (all in
    "module.path:attr" form). Raises on the first problem — missing file,
    invalid YAML, unimportable module, or missing attribute — and prints a
    checkmark per passing stage.
    """
    # BUG FIX: explicit encoding — the file is UTF-8 and must not depend on
    # the platform's default locale encoding.
    with open("openenv.yaml", "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)

    print("✔ YAML loaded")

    # Check entry point: resolve "module:ClassName".
    module_name, class_name = config["entry_point"].split(":")
    module = importlib.import_module(module_name)
    getattr(module, class_name)

    print("✔ Entry point valid")

    # Check tasks: each declares a generator function and a grader class.
    for task in config["tasks"]:
        gen_module, gen_fn = task["generator"].split(":")
        grader_module, grader_cls = task["grader"].split(":")

        gen_mod = importlib.import_module(gen_module)
        getattr(gen_mod, gen_fn)

        grader_mod = importlib.import_module(grader_module)
        getattr(grader_mod, grader_cls)

        print(f"✔ Task validated: {task['name']}")

    print("\n✅ All validations passed!")
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
# Allow running the validator directly: `python scripts/validate_env.py`.
if __name__ == "__main__":
    validate_yaml()
|
tasks/__init__.py
ADDED
|
File without changes
|
tasks/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (178 Bytes). View file
|
|
|
tasks/__pycache__/easy.cpython-313.pyc
ADDED
|
Binary file (928 Bytes). View file
|
|
|
tasks/__pycache__/hard.cpython-313.pyc
ADDED
|
Binary file (996 Bytes). View file
|
|
|
tasks/__pycache__/medium.cpython-313.pyc
ADDED
|
Binary file (992 Bytes). View file
|
|
|