Spaces:
Sleeping
Sleeping
Upload 53 files
Browse files. This view is limited to 50 files because it contains too many changes. See raw diff.
- Dockerfile +14 -0
- README.md +127 -10
- __pycache__/inference.cpython-313.pyc +0 -0
- __pycache__/inference.cpython-314.pyc +0 -0
- inference.py +260 -0
- openenv.yaml +84 -0
- openenv_support_ops_env.egg-info/PKG-INFO +11 -0
- openenv_support_ops_env.egg-info/SOURCES.txt +18 -0
- openenv_support_ops_env.egg-info/dependency_links.txt +1 -0
- openenv_support_ops_env.egg-info/entry_points.txt +2 -0
- openenv_support_ops_env.egg-info/requires.txt +7 -0
- openenv_support_ops_env.egg-info/top_level.txt +2 -0
- pyproject.toml +28 -0
- server/__init__.py +1 -0
- server/app.py +10 -0
- tool_use_env/README.md +256 -0
- tool_use_env/__init__.py +17 -0
- tool_use_env/__pycache__/__init__.cpython-312.pyc +0 -0
- tool_use_env/__pycache__/__init__.cpython-313.pyc +0 -0
- tool_use_env/__pycache__/__init__.cpython-314.pyc +0 -0
- tool_use_env/__pycache__/client.cpython-312.pyc +0 -0
- tool_use_env/__pycache__/client.cpython-313.pyc +0 -0
- tool_use_env/__pycache__/client.cpython-314.pyc +0 -0
- tool_use_env/__pycache__/grader.cpython-312.pyc +0 -0
- tool_use_env/__pycache__/models.cpython-312.pyc +0 -0
- tool_use_env/__pycache__/models.cpython-313.pyc +0 -0
- tool_use_env/agents/__pycache__/baseline.cpython-313.pyc +0 -0
- tool_use_env/agents/baseline.py +267 -0
- tool_use_env/client.py +165 -0
- tool_use_env/grader.py +48 -0
- tool_use_env/models.py +86 -0
- tool_use_env/openenv_tool_use_env.egg-info/PKG-INFO +9 -0
- tool_use_env/openenv_tool_use_env.egg-info/SOURCES.txt +20 -0
- tool_use_env/openenv_tool_use_env.egg-info/dependency_links.txt +1 -0
- tool_use_env/openenv_tool_use_env.egg-info/entry_points.txt +2 -0
- tool_use_env/openenv_tool_use_env.egg-info/requires.txt +5 -0
- tool_use_env/openenv_tool_use_env.egg-info/top_level.txt +1 -0
- tool_use_env/pyproject.toml +45 -0
- tool_use_env/server/Dockerfile +80 -0
- tool_use_env/server/__init__.py +11 -0
- tool_use_env/server/__pycache__/__init__.cpython-312.pyc +0 -0
- tool_use_env/server/__pycache__/__init__.cpython-313.pyc +0 -0
- tool_use_env/server/__pycache__/app.cpython-312.pyc +0 -0
- tool_use_env/server/__pycache__/app.cpython-313.pyc +0 -0
- tool_use_env/server/__pycache__/tool_use_env_environment.cpython-312.pyc +0 -0
- tool_use_env/server/__pycache__/tool_use_env_environment.cpython-313.pyc +0 -0
- tool_use_env/server/app.py +29 -0
- tool_use_env/server/requirements.txt +7 -0
- tool_use_env/server/tool_use_env_environment.py +351 -0
- tool_use_env/tasks.py +141 -0
Dockerfile
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
ENV PYTHONUNBUFFERED=1
|
| 6 |
+
|
| 7 |
+
COPY . /app
|
| 8 |
+
|
| 9 |
+
RUN pip install --no-cache-dir --upgrade pip && \
|
| 10 |
+
pip install --no-cache-dir .
|
| 11 |
+
|
| 12 |
+
EXPOSE 8000
|
| 13 |
+
|
| 14 |
+
CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000"]
|
README.md
CHANGED
|
@@ -1,10 +1,127 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
## Why this environment
|
| 3 |
+
|
| 4 |
+
Modern tool-using agents often fail on operational workflows that require evidence gathering, policy compliance, and safe escalation. This environment targets that gap with deterministic tasks that resemble what ecommerce support, trust-and-safety, and operations agents do every day.
|
| 5 |
+
|
| 6 |
+
## Task set
|
| 7 |
+
|
| 8 |
+
The benchmark ships with three deterministic tasks and matching deterministic graders:
|
| 9 |
+
|
| 10 |
+
1. `damaged-mug-replacement` (`easy`)
|
| 11 |
+
Resolve a damaged-item replacement request.
|
| 12 |
+
2. `duplicate-charge-refund` (`medium`)
|
| 13 |
+
Investigate a duplicate billing complaint and refund the extra capture.
|
| 14 |
+
3. `account-takeover-fraud` (`hard`)
|
| 15 |
+
Handle a suspected account takeover with a security-first fraud escalation.
|
| 16 |
+
|
| 17 |
+
Each task has a fixed expected resolution, required evidence, and reply keywords. The grader returns a score in `[0.0, 1.0]` from weighted resolution accuracy, evidence coverage, reply quality, and efficiency.
|
| 18 |
+
|
| 19 |
+
## Action space
|
| 20 |
+
|
| 21 |
+
The environment uses a typed `ToolUseAction` model with these actions:
|
| 22 |
+
|
| 23 |
+
- `review_ticket`
|
| 24 |
+
- `inspect_artifact`
|
| 25 |
+
- `search_policy`
|
| 26 |
+
- `draft_reply`
|
| 27 |
+
- `submit_resolution`
|
| 28 |
+
|
| 29 |
+
Optional fields on the action are `artifact_id`, `query`, `message`, and `resolution_code`.
|
| 30 |
+
|
| 31 |
+
## Observation space
|
| 32 |
+
|
| 33 |
+
The typed `ToolUseObservation` includes:
|
| 34 |
+
|
| 35 |
+
- `task_id`, `difficulty`, `objective`
|
| 36 |
+
- `customer_message`
|
| 37 |
+
- `workspace_summary`
|
| 38 |
+
- `available_actions`
|
| 39 |
+
- `available_resolution_codes`
|
| 40 |
+
- `collected_evidence`
|
| 41 |
+
- `last_tool_result`
|
| 42 |
+
- `last_action_error`
|
| 43 |
+
- `remaining_steps`
|
| 44 |
+
- `current_score`
|
| 45 |
+
|
| 46 |
+
The typed `ToolUseState` exposes internal progress such as `final_score`, `drafted_reply`, `resolution_code`, `required_evidence`, `collected_evidence`, and action history.
|
| 47 |
+
|
| 48 |
+
## Reward design
|
| 49 |
+
|
| 50 |
+
The reward is shaped over the full trajectory:
|
| 51 |
+
|
| 52 |
+
- Positive reward for first-time collection of relevant artifacts and policies
|
| 53 |
+
- Smaller reward for drafting a reply that includes required customer-facing details
|
| 54 |
+
- Very small or zero reward for repeated or invalid actions
|
| 55 |
+
- Final step reward equal to the deterministic grader score
|
| 56 |
+
|
| 57 |
+
This gives agents signal before the final submission while still anchoring the episode outcome to task completion quality.
|
| 58 |
+
|
| 59 |
+
## Setup
|
| 60 |
+
|
| 61 |
+
### Local Python
|
| 62 |
+
|
| 63 |
+
```bash
|
| 64 |
+
UV_CACHE_DIR=/tmp/uv-cache uv sync
|
| 65 |
+
.venv/bin/pip install -e .
|
| 66 |
+
```
|
| 67 |
+
|
| 68 |
+
### Run the server
|
| 69 |
+
|
| 70 |
+
```bash
|
| 71 |
+
.venv/bin/python -m uvicorn server.app:app --host 0.0.0.0 --port 8000
|
| 72 |
+
```
|
| 73 |
+
|
| 74 |
+
### Docker
|
| 75 |
+
|
| 76 |
+
```bash
|
| 77 |
+
docker build -t support-ops-openenv .
|
| 78 |
+
docker run --rm -p 8000:8000 support-ops-openenv
|
| 79 |
+
```
|
| 80 |
+
|
| 81 |
+
## Baseline inference
|
| 82 |
+
|
| 83 |
+
The required root `inference.py` uses the OpenAI client for model calls and emits the mandatory `[START]`, `[STEP]`, and `[END]` logs.
|
| 84 |
+
|
| 85 |
+
Environment variables:
|
| 86 |
+
|
| 87 |
+
- `HF_TOKEN` or `OPENAI_API_KEY`
|
| 88 |
+
- `API_BASE_URL`
|
| 89 |
+
- `MODEL_NAME`
|
| 90 |
+
- `LOCAL_IMAGE_NAME` if you want to run via `from_docker_image()`
|
| 91 |
+
- `ENV_BASE_URL` if you want to connect to a running server
|
| 92 |
+
|
| 93 |
+
Example:
|
| 94 |
+
|
| 95 |
+
```bash
|
| 96 |
+
export HF_TOKEN=...
|
| 97 |
+
export API_BASE_URL=https://router.huggingface.co/v1
|
| 98 |
+
export MODEL_NAME=meta-llama/Llama-3.1-8B-Instruct
|
| 99 |
+
python inference.py
|
| 100 |
+
```
|
| 101 |
+
|
| 102 |
+
The script evaluates all three tasks in a fixed order for reproducible scoring. If no API key is available, it falls back to a deterministic scripted policy so the benchmark remains runnable offline.
|
| 103 |
+
|
| 104 |
+
## Expected baseline behavior
|
| 105 |
+
|
| 106 |
+
The bundled fallback policy should solve all three tasks with high scores because it follows the intended evidence path exactly. Frontier LLMs should also perform well on the easy and medium tasks and show larger variance on the hard fraud-escalation task if they over-index on issuing refunds instead of following policy.
|
| 107 |
+
|
| 108 |
+
## Project structure
|
| 109 |
+
|
| 110 |
+
```text
|
| 111 |
+
.
|
| 112 |
+
├── Dockerfile
|
| 113 |
+
├── README.md
|
| 114 |
+
├── inference.py
|
| 115 |
+
├── openenv.yaml
|
| 116 |
+
├── pyproject.toml
|
| 117 |
+
├── server/
|
| 118 |
+
│ └── app.py
|
| 119 |
+
└── tool_use_env/
|
| 120 |
+
├── client.py
|
| 121 |
+
├── grader.py
|
| 122 |
+
├── models.py
|
| 123 |
+
├── tasks.py
|
| 124 |
+
└── server/
|
| 125 |
+
├── app.py
|
| 126 |
+
└── tool_use_env_environment.py
|
| 127 |
+
```
|
__pycache__/inference.cpython-313.pyc
ADDED
|
Binary file (6.03 kB). View file
|
|
|
__pycache__/inference.cpython-314.pyc
ADDED
|
Binary file (6.74 kB). View file
|
|
|
inference.py
ADDED
|
@@ -0,0 +1,260 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
import textwrap
|
| 5 |
+
from typing import Any, List, Optional
|
| 6 |
+
|
| 7 |
+
from openai import OpenAI
|
| 8 |
+
|
| 9 |
+
from tool_use_env.client import ToolUseEnv
|
| 10 |
+
from tool_use_env.models import ToolUseAction
|
| 11 |
+
from tool_use_env.tasks import TASK_SEQUENCE
|
| 12 |
+
|
| 13 |
+
# Optional local Docker image name; when set, the env is started via from_docker_image().
LOCAL_IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME")
# OpenAI-compatible endpoint; defaults to the Hugging Face router.
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
MODEL_NAME = os.getenv("MODEL_NAME", "meta-llama/Llama-3.1-8B-Instruct")
# First available credential wins; None triggers the scripted fallback policy.
API_KEY = os.getenv("HF_TOKEN") or os.getenv("OPENAI_API_KEY") or os.getenv("API_KEY")
# Base URL of an already-running environment server.
ENV_BASE_URL = os.getenv("ENV_BASE_URL", "http://127.0.0.1:8000")
BENCHMARK = os.getenv("MY_ENV_V4_BENCHMARK", "support_ops_env")
MAX_STEPS = 6        # hard cap on env steps per episode
TEMPERATURE = 0.0    # deterministic decoding for reproducible scoring
MAX_TOKENS = 220

# System prompt sent on every model call; the schema below must stay in sync
# with ToolUseAction's fields.
SYSTEM_PROMPT = textwrap.dedent(
    """
    You are operating a customer-support workflow environment.
    Your job is to gather the minimum necessary evidence, draft a short customer reply,
    and submit the correct final resolution code.

    Reply with JSON only using this schema:
    {
      "action_type": "review_ticket|inspect_artifact|search_policy|draft_reply|submit_resolution",
      "artifact_id": "optional string",
      "query": "optional string",
      "message": "optional string",
      "resolution_code": "optional string"
    }

    Use concise messages. Prefer exact artifact ids and exact resolution codes shown in the observation.
    """
).strip()
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def log_start(task: str, env: str, model: str) -> None:
    """Emit the mandatory [START] marker for one task run."""
    line = "[START] task={} env={} model={}".format(task, env, model)
    print(line, flush=True)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    """Emit the mandatory [STEP] marker for a single environment step."""
    parts = [
        f"[STEP] step={step}",
        f"action={action}",
        f"reward={reward:.2f}",
        f"done={str(done).lower()}",
        f"error={error or 'null'}",
    ]
    print(" ".join(parts), flush=True)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Emit the mandatory [END] marker summarising one task run."""
    formatted = []
    for value in rewards:
        formatted.append("%.2f" % value)
    msg = (
        f"[END] success={str(success).lower()} steps={steps} "
        f"score={score:.3f} rewards={','.join(formatted)}"
    )
    print(msg, flush=True)
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def _serialize_action(action: ToolUseAction) -> str:
|
| 65 |
+
payload = {"action_type": action.action_type}
|
| 66 |
+
if action.artifact_id:
|
| 67 |
+
payload["artifact_id"] = action.artifact_id
|
| 68 |
+
if action.query:
|
| 69 |
+
payload["query"] = action.query
|
| 70 |
+
if action.message:
|
| 71 |
+
payload["message"] = action.message.replace("\n", " ").strip()
|
| 72 |
+
if action.resolution_code:
|
| 73 |
+
payload["resolution_code"] = action.resolution_code
|
| 74 |
+
return json.dumps(payload, ensure_ascii=True, separators=(",", ":"))
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def _fallback_action(observation: Any) -> ToolUseAction:
    """Deterministic scripted policy used when no API key is configured.

    Follows a fixed per-task plan: review the ticket first, gather each
    missing artifact/policy exactly once, draft the reply, then submit the
    task's expected resolution code.
    """
    evidence = set(observation.collected_evidence)
    task_id = observation.task_id

    if "ticket" not in evidence:
        return ToolUseAction(action_type="review_ticket")

    task_plans = {
        "damaged-mug-replacement": [
            ToolUseAction(action_type="inspect_artifact", artifact_id="order"),
            ToolUseAction(action_type="search_policy", query="damaged_items"),
            ToolUseAction(
                action_type="draft_reply",
                message=(
                    "We are sending a replacement within 48 hours. "
                    "There is no need to return the broken mug."
                ),
            ),
            ToolUseAction(action_type="submit_resolution", resolution_code="send_replacement"),
        ],
        "duplicate-charge-refund": [
            ToolUseAction(action_type="inspect_artifact", artifact_id="order"),
            ToolUseAction(action_type="inspect_artifact", artifact_id="payment"),
            ToolUseAction(action_type="search_policy", query="duplicate_charge"),
            ToolUseAction(
                action_type="draft_reply",
                message=(
                    "We confirmed the duplicate charge and issued a refund. "
                    "You should see the refund in 3-5 business days."
                ),
            ),
            ToolUseAction(
                action_type="submit_resolution",
                resolution_code="refund_duplicate_charge",
            ),
        ],
        "account-takeover-fraud": [
            ToolUseAction(action_type="inspect_artifact", artifact_id="account"),
            ToolUseAction(action_type="inspect_artifact", artifact_id="risk_log"),
            ToolUseAction(action_type="search_policy", query="account_takeover"),
            ToolUseAction(
                action_type="draft_reply",
                message=(
                    "We locked your account immediately and escalated this to our fraud team. "
                    "You will receive an update within 24 hours."
                ),
            ),
            ToolUseAction(
                action_type="submit_resolution",
                resolution_code="lock_account_and_escalate_fraud",
            ),
        ],
    }

    # BUG FIX: indexing task_plans[task_id] raised KeyError for any task
    # outside the three scripted plans; degrade gracefully to the safety net.
    plan = task_plans.get(task_id, [])
    # BUG FIX: last_tool_result is nullable (None before any tool runs per
    # the openenv.yaml observation schema); guard before calling startswith.
    last_result = observation.last_tool_result or ""
    for candidate in plan:
        if candidate.action_type == "inspect_artifact":
            if f"artifact:{candidate.artifact_id}" not in evidence:
                return candidate
        elif candidate.action_type == "search_policy":
            if f"policy:{candidate.query}" not in evidence:
                return candidate
        # assumes the env reports a "Draft saved..." tool result after a
        # successful draft_reply — TODO confirm against the environment impl
        elif candidate.action_type == "draft_reply" and not last_result.startswith("Draft saved"):
            return candidate
        elif candidate.action_type == "submit_resolution":
            return candidate

    # Safety net: submit the first advertised resolution code.
    return ToolUseAction(action_type="submit_resolution", resolution_code=observation.available_resolution_codes[0])
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def _prompt_for_observation(step: int, observation: Any) -> str:
|
| 148 |
+
return textwrap.dedent(
|
| 149 |
+
f"""
|
| 150 |
+
Step: {step}
|
| 151 |
+
Task ID: {observation.task_id}
|
| 152 |
+
Difficulty: {observation.difficulty}
|
| 153 |
+
Objective: {observation.objective}
|
| 154 |
+
Customer message: {observation.customer_message}
|
| 155 |
+
Workspace summary: {observation.workspace_summary}
|
| 156 |
+
Collected evidence: {observation.collected_evidence}
|
| 157 |
+
Available resolution codes: {observation.available_resolution_codes}
|
| 158 |
+
Last tool result: {observation.last_tool_result}
|
| 159 |
+
Last action error: {observation.last_action_error}
|
| 160 |
+
Remaining steps: {observation.remaining_steps}
|
| 161 |
+
|
| 162 |
+
Return the single best next action as JSON.
|
| 163 |
+
"""
|
| 164 |
+
).strip()
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def _model_action(client: OpenAI, step: int, observation: Any) -> ToolUseAction:
    """Ask the LLM for the next action; fall back to the scripted policy.

    The fallback is returned when no API key is configured or when anything
    in the model call / JSON parse / action construction fails.
    """
    fallback = _fallback_action(observation)

    if not API_KEY:
        return fallback

    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": _prompt_for_observation(step, observation)},
            ],
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
            response_format={"type": "json_object"},
        )
        content = response.choices[0].message.content or ""
        parsed = json.loads(content.strip())
        return ToolUseAction(
            action_type=parsed.get("action_type", fallback.action_type),
            artifact_id=parsed.get("artifact_id"),
            query=parsed.get("query"),
            message=parsed.get("message"),
            resolution_code=parsed.get("resolution_code"),
        )
    except Exception:
        # Any failure path degrades to the deterministic policy.
        return fallback
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
async def _connect_env() -> ToolUseEnv:
    """Return a connected environment.

    Launches a Docker container when LOCAL_IMAGE_NAME is set; otherwise
    connects to an already-running server at ENV_BASE_URL.
    """
    if LOCAL_IMAGE_NAME:
        return await ToolUseEnv.from_docker_image(LOCAL_IMAGE_NAME)

    env = ToolUseEnv(base_url=ENV_BASE_URL)
    await env.connect()
    return env
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
async def run_task(client: OpenAI, env: ToolUseEnv, task_id: str) -> float:
    """Run one episode of *task_id* and return the grader's final score.

    Emits the mandatory [START]/[STEP]/[END] log lines; the [END] line is
    produced even when the episode raises, via the finally block.
    """
    rewards: List[float] = []
    steps_taken = 0
    score = 0.0
    success = False

    log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)

    try:
        # Fixed seed keeps episodes deterministic across runs.
        result = await env.reset(task_id=task_id, seed=7)
        observation = result.observation

        for step in range(1, MAX_STEPS + 1):
            if result.done:
                break

            action = _model_action(client, step, observation)
            action_str = _serialize_action(action)
            result = await env.step(action)
            observation = result.observation

            reward = float(result.reward or 0.0)
            done = bool(result.done)
            error = observation.last_action_error
            rewards.append(reward)
            steps_taken = step

            log_step(step=step, action=action_str, reward=reward, done=done, error=error)

            if done:
                break

        # assumes env.state() exposes the grader's final_score — per the
        # ToolUseState description in the README
        state = await env.state()
        score = float(state.final_score)
        # Success threshold: 0.8 of the grader's [0, 1] score range.
        success = score >= 0.8
    finally:
        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)

    return score
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
async def main() -> None:
    """Evaluate every benchmark task in fixed order and report the mean score.

    Tasks are run in TASK_SEQUENCE order for reproducible scoring; the
    environment is always closed, even when a task raises.
    """
    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY or "missing")
    env = await _connect_env()
    scores: List[float] = []
    try:
        for task_id in TASK_SEQUENCE:
            score = await run_task(client, env, task_id)
            scores.append(score)
    finally:
        await env.close()

    # IMPROVEMENT: the collected per-task scores were previously discarded;
    # surface an aggregate so runs are comparable at a glance.
    if scores:
        mean_score = sum(scores) / len(scores)
        print(f"[SUMMARY] tasks={len(scores)} mean_score={mean_score:.3f}", flush=True)


if __name__ == "__main__":
    asyncio.run(main())
|
openenv.yaml
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: support_ops_env
|
| 2 |
+
description: Customer support operations environment for multi-step tool-using agents
|
| 3 |
+
version: 1.0.0
|
| 4 |
+
|
| 5 |
+
entrypoint: server.app:app
|
| 6 |
+
|
| 7 |
+
actions:
|
| 8 |
+
type: object
|
| 9 |
+
properties:
|
| 10 |
+
action_type:
|
| 11 |
+
type: string
|
| 12 |
+
enum:
|
| 13 |
+
- review_ticket
|
| 14 |
+
- inspect_artifact
|
| 15 |
+
- search_policy
|
| 16 |
+
- draft_reply
|
| 17 |
+
- submit_resolution
|
| 18 |
+
artifact_id:
|
| 19 |
+
type: string
|
| 20 |
+
nullable: true
|
| 21 |
+
query:
|
| 22 |
+
type: string
|
| 23 |
+
nullable: true
|
| 24 |
+
message:
|
| 25 |
+
type: string
|
| 26 |
+
nullable: true
|
| 27 |
+
resolution_code:
|
| 28 |
+
type: string
|
| 29 |
+
nullable: true
|
| 30 |
+
required:
|
| 31 |
+
- action_type
|
| 32 |
+
|
| 33 |
+
observations:
|
| 34 |
+
type: object
|
| 35 |
+
properties:
|
| 36 |
+
task_id:
|
| 37 |
+
type: string
|
| 38 |
+
difficulty:
|
| 39 |
+
type: string
|
| 40 |
+
enum: [easy, medium, hard]
|
| 41 |
+
objective:
|
| 42 |
+
type: string
|
| 43 |
+
customer_message:
|
| 44 |
+
type: string
|
| 45 |
+
workspace_summary:
|
| 46 |
+
type: string
|
| 47 |
+
available_actions:
|
| 48 |
+
type: array
|
| 49 |
+
items:
|
| 50 |
+
type: string
|
| 51 |
+
available_resolution_codes:
|
| 52 |
+
type: array
|
| 53 |
+
items:
|
| 54 |
+
type: string
|
| 55 |
+
collected_evidence:
|
| 56 |
+
type: array
|
| 57 |
+
items:
|
| 58 |
+
type: string
|
| 59 |
+
last_tool_result:
|
| 60 |
+
type: string
|
| 61 |
+
nullable: true
|
| 62 |
+
last_action_error:
|
| 63 |
+
type: string
|
| 64 |
+
nullable: true
|
| 65 |
+
remaining_steps:
|
| 66 |
+
type: integer
|
| 67 |
+
current_score:
|
| 68 |
+
type: number
|
| 69 |
+
|
| 70 |
+
reward_range: [0.0, 1.0]
|
| 71 |
+
|
| 72 |
+
metadata:
|
| 73 |
+
benchmark: support_ops_env
|
| 74 |
+
domain: customer_support
|
| 75 |
+
difficulty_levels:
|
| 76 |
+
- easy
|
| 77 |
+
- medium
|
| 78 |
+
- hard
|
| 79 |
+
features:
|
| 80 |
+
- multi_step_reasoning
|
| 81 |
+
- tool_selection
|
| 82 |
+
- policy_lookup
|
| 83 |
+
- customer_support_triage
|
| 84 |
+
- shaped_rewards
|
openenv_support_ops_env.egg-info/PKG-INFO
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.4
|
| 2 |
+
Name: openenv-support-ops-env
|
| 3 |
+
Version: 0.1.0
|
| 4 |
+
Summary: OpenEnv customer support operations environment
|
| 5 |
+
Requires-Python: >=3.10
|
| 6 |
+
Requires-Dist: openenv-core[core]>=0.2.1
|
| 7 |
+
Requires-Dist: openai>=1.40.0
|
| 8 |
+
Requires-Dist: python-dotenv>=1.0.1
|
| 9 |
+
Requires-Dist: uvicorn>=0.30.0
|
| 10 |
+
Provides-Extra: dev
|
| 11 |
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
openenv_support_ops_env.egg-info/SOURCES.txt
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
README.md
|
| 2 |
+
pyproject.toml
|
| 3 |
+
openenv_support_ops_env.egg-info/PKG-INFO
|
| 4 |
+
openenv_support_ops_env.egg-info/SOURCES.txt
|
| 5 |
+
openenv_support_ops_env.egg-info/dependency_links.txt
|
| 6 |
+
openenv_support_ops_env.egg-info/entry_points.txt
|
| 7 |
+
openenv_support_ops_env.egg-info/requires.txt
|
| 8 |
+
openenv_support_ops_env.egg-info/top_level.txt
|
| 9 |
+
server/__init__.py
|
| 10 |
+
server/app.py
|
| 11 |
+
tool_use_env/__init__.py
|
| 12 |
+
tool_use_env/client.py
|
| 13 |
+
tool_use_env/grader.py
|
| 14 |
+
tool_use_env/models.py
|
| 15 |
+
tool_use_env/tasks.py
|
| 16 |
+
tool_use_env/server/__init__.py
|
| 17 |
+
tool_use_env/server/app.py
|
| 18 |
+
tool_use_env/server/tool_use_env_environment.py
|
openenv_support_ops_env.egg-info/dependency_links.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
openenv_support_ops_env.egg-info/entry_points.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[console_scripts]
|
| 2 |
+
server = server.app:main
|
openenv_support_ops_env.egg-info/requires.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openenv-core[core]>=0.2.1
|
| 2 |
+
openai>=1.40.0
|
| 3 |
+
python-dotenv>=1.0.1
|
| 4 |
+
uvicorn>=0.30.0
|
| 5 |
+
|
| 6 |
+
[dev]
|
| 7 |
+
pytest>=8.0.0
|
openenv_support_ops_env.egg-info/top_level.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
server
|
| 2 |
+
tool_use_env
|
pyproject.toml
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["setuptools>=68", "wheel"]
|
| 3 |
+
build-backend = "setuptools.build_meta"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "openenv-support-ops-env"
|
| 7 |
+
version = "0.1.0"
|
| 8 |
+
description = "OpenEnv customer support operations environment"
|
| 9 |
+
requires-python = ">=3.10"
|
| 10 |
+
dependencies = [
|
| 11 |
+
"openenv-core[core]>=0.2.1",
|
| 12 |
+
"openai>=1.40.0",
|
| 13 |
+
"python-dotenv>=1.0.1",
|
| 14 |
+
"uvicorn>=0.30.0",
|
| 15 |
+
]
|
| 16 |
+
|
| 17 |
+
[project.optional-dependencies]
|
| 18 |
+
dev = [
|
| 19 |
+
"pytest>=8.0.0",
|
| 20 |
+
]
|
| 21 |
+
|
| 22 |
+
[project.scripts]
|
| 23 |
+
server = "server.app:main"
|
| 24 |
+
|
| 25 |
+
[tool.setuptools]
|
| 26 |
+
include-package-data = true
|
| 27 |
+
packages = ["tool_use_env", "tool_use_env.server", "server"]
|
| 28 |
+
package-dir = { "tool_use_env" = "tool_use_env", "tool_use_env.server" = "tool_use_env/server", "server" = "server" }
|
server/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Root server shim for OpenEnv validation and uv run server."""
|
server/app.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from tool_use_env.server.app import app as app
|
| 2 |
+
from tool_use_env.server.app import main as _package_main
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def main(host: str = "0.0.0.0", port: int = 8000):
    """Root-level server entry point.

    Thin shim that delegates to the packaged entry point
    (tool_use_env.server.app.main) so `uvicorn server.app:app` and the
    `server` console script both work from the repo root.
    """
    _package_main(host=host, port=port)


if __name__ == "__main__":
    main()
|
tool_use_env/README.md
ADDED
|
@@ -0,0 +1,256 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Tool Use Env Environment Server
|
| 3 |
+
emoji: 📀
|
| 4 |
+
colorFrom: purple
|
| 5 |
+
colorTo: gray
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
+
app_port: 8000
|
| 9 |
+
base_path: /web
|
| 10 |
+
tags:
|
| 11 |
+
- openenv
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
# Tool Use Env Environment
|
| 15 |
+
|
| 16 |
+
A simple test environment that echoes back messages. Perfect for testing the env APIs as well as demonstrating environment usage patterns.
|
| 17 |
+
|
| 18 |
+
## Quick Start
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
The simplest way to use the Tool Use Env environment is through the `ToolUseEnv` class:
|
| 22 |
+
|
| 23 |
+
```python
|
| 24 |
+
from tool_use_env import ToolUseAction, ToolUseEnv
|
| 25 |
+
|
| 26 |
+
try:
|
| 27 |
+
# Create environment from Docker image
|
| 28 |
+
tool_use_envenv = ToolUseEnv.from_docker_image("tool_use_env-env:latest")
|
| 29 |
+
|
| 30 |
+
# Reset
|
| 31 |
+
result = tool_use_envenv.reset()
|
| 32 |
+
print(f"Reset: {result.observation.echoed_message}")
|
| 33 |
+
|
| 34 |
+
# Send multiple messages
|
| 35 |
+
messages = ["Hello, World!", "Testing echo", "Final message"]
|
| 36 |
+
|
| 37 |
+
for msg in messages:
|
| 38 |
+
result = tool_use_envenv.step(ToolUseAction(message=msg))
|
| 39 |
+
print(f"Sent: '{msg}'")
|
| 40 |
+
print(f" → Echoed: '{result.observation.echoed_message}'")
|
| 41 |
+
print(f" → Length: {result.observation.message_length}")
|
| 42 |
+
print(f" → Reward: {result.reward}")
|
| 43 |
+
|
| 44 |
+
finally:
|
| 45 |
+
# Always clean up
|
| 46 |
+
tool_use_envenv.close()
|
| 47 |
+
```
|
| 48 |
+
|
| 49 |
+
That's it! The `ToolUseEnv.from_docker_image()` method handles:
|
| 50 |
+
- Starting the Docker container
|
| 51 |
+
- Waiting for the server to be ready
|
| 52 |
+
- Connecting to the environment
|
| 53 |
+
- Container cleanup when you call `close()`
|
| 54 |
+
|
| 55 |
+
## Building the Docker Image
|
| 56 |
+
|
| 57 |
+
Before using the environment, you need to build the Docker image:
|
| 58 |
+
|
| 59 |
+
```bash
|
| 60 |
+
# From project root
|
| 61 |
+
docker build -t tool_use_env-env:latest -f server/Dockerfile .
|
| 62 |
+
```
|
| 63 |
+
|
| 64 |
+
## Deploying to Hugging Face Spaces
|
| 65 |
+
|
| 66 |
+
You can easily deploy your OpenEnv environment to Hugging Face Spaces using the `openenv push` command:
|
| 67 |
+
|
| 68 |
+
```bash
|
| 69 |
+
# From the environment directory (where openenv.yaml is located)
|
| 70 |
+
openenv push
|
| 71 |
+
|
| 72 |
+
# Or specify options
|
| 73 |
+
openenv push --namespace my-org --private
|
| 74 |
+
```
|
| 75 |
+
|
| 76 |
+
The `openenv push` command will:
|
| 77 |
+
1. Validate that the directory is an OpenEnv environment (checks for `openenv.yaml`)
|
| 78 |
+
2. Prepare a custom build for Hugging Face Docker space (enables web interface)
|
| 79 |
+
3. Upload to Hugging Face (ensuring you're logged in)
|
| 80 |
+
|
| 81 |
+
### Prerequisites
|
| 82 |
+
|
| 83 |
+
- Authenticate with Hugging Face: The command will prompt for login if not already authenticated
|
| 84 |
+
|
| 85 |
+
### Options
|
| 86 |
+
|
| 87 |
+
- `--directory`, `-d`: Directory containing the OpenEnv environment (defaults to current directory)
|
| 88 |
+
- `--repo-id`, `-r`: Repository ID in format 'username/repo-name' (defaults to 'username/env-name' from openenv.yaml)
|
| 89 |
+
- `--base-image`, `-b`: Base Docker image to use (overrides Dockerfile FROM)
|
| 90 |
+
- `--private`: Deploy the space as private (default: public)
|
| 91 |
+
|
| 92 |
+
### Examples
|
| 93 |
+
|
| 94 |
+
```bash
|
| 95 |
+
# Push to your personal namespace (defaults to username/env-name from openenv.yaml)
|
| 96 |
+
openenv push
|
| 97 |
+
|
| 98 |
+
# Push to a specific repository
|
| 99 |
+
openenv push --repo-id my-org/my-env
|
| 100 |
+
|
| 101 |
+
# Push with a custom base image
|
| 102 |
+
openenv push --base-image ghcr.io/meta-pytorch/openenv-base:latest
|
| 103 |
+
|
| 104 |
+
# Push as a private space
|
| 105 |
+
openenv push --private
|
| 106 |
+
|
| 107 |
+
# Combine options
|
| 108 |
+
openenv push --repo-id my-org/my-env --base-image custom-base:latest --private
|
| 109 |
+
```
|
| 110 |
+
|
| 111 |
+
After deployment, your space will be available at:
|
| 112 |
+
`https://huggingface.co/spaces/<repo-id>`
|
| 113 |
+
|
| 114 |
+
The deployed space includes:
|
| 115 |
+
- **Web Interface** at `/web` - Interactive UI for exploring the environment
|
| 116 |
+
- **API Documentation** at `/docs` - Full OpenAPI/Swagger interface
|
| 117 |
+
- **Health Check** at `/health` - Container health monitoring
|
| 118 |
+
- **WebSocket** at `/ws` - Persistent session endpoint for low-latency interactions
|
| 119 |
+
|
| 120 |
+
## Environment Details
|
| 121 |
+
|
| 122 |
+
### Action
|
| 123 |
+
**ToolUseAction**: Contains a single field
|
| 124 |
+
- `message` (str) - The message to echo back
|
| 125 |
+
|
| 126 |
+
### Observation
|
| 127 |
+
**ToolUseObservation**: Contains the echo response and metadata
|
| 128 |
+
- `echoed_message` (str) - The message echoed back
|
| 129 |
+
- `message_length` (int) - Length of the message
|
| 130 |
+
- `reward` (float) - Reward based on message length (length × 0.1)
|
| 131 |
+
- `done` (bool) - Always False for echo environment
|
| 132 |
+
- `metadata` (dict) - Additional info like step count
|
| 133 |
+
|
| 134 |
+
### Reward
|
| 135 |
+
The reward is calculated as: `message_length × 0.1`
|
| 136 |
+
- "Hi" → reward: 0.2
|
| 137 |
+
- "Hello, World!" → reward: 1.3
|
| 138 |
+
- Empty message → reward: 0.0
|
| 139 |
+
|
| 140 |
+
## Advanced Usage
|
| 141 |
+
|
| 142 |
+
### Connecting to an Existing Server
|
| 143 |
+
|
| 144 |
+
If you already have a Tool Use Env environment server running, you can connect directly:
|
| 145 |
+
|
| 146 |
+
```python
|
| 147 |
+
from tool_use_env import ToolUseEnv
|
| 148 |
+
|
| 149 |
+
# Connect to existing server
|
| 150 |
+
tool_use_envenv = ToolUseEnv(base_url="<ENV_HTTP_URL_HERE>")
|
| 151 |
+
|
| 152 |
+
# Use as normal
|
| 153 |
+
result = tool_use_envenv.reset()
|
| 154 |
+
result = tool_use_envenv.step(ToolUseAction(message="Hello!"))
|
| 155 |
+
```
|
| 156 |
+
|
| 157 |
+
Note: When connecting to an existing server, `tool_use_envenv.close()` will NOT stop the server.
|
| 158 |
+
|
| 159 |
+
### Using the Context Manager
|
| 160 |
+
|
| 161 |
+
The client supports context manager usage for automatic connection management:
|
| 162 |
+
|
| 163 |
+
```python
|
| 164 |
+
from tool_use_env import ToolUseAction, ToolUseEnv
|
| 165 |
+
|
| 166 |
+
# Connect with context manager (auto-connects and closes)
|
| 167 |
+
with ToolUseEnv(base_url="http://localhost:8000") as env:
|
| 168 |
+
result = env.reset()
|
| 169 |
+
print(f"Reset: {result.observation.echoed_message}")
|
| 170 |
+
# Multiple steps with low latency
|
| 171 |
+
for msg in ["Hello", "World", "!"]:
|
| 172 |
+
result = env.step(ToolUseAction(message=msg))
|
| 173 |
+
print(f"Echoed: {result.observation.echoed_message}")
|
| 174 |
+
```
|
| 175 |
+
|
| 176 |
+
The client uses WebSocket connections for:
|
| 177 |
+
- **Lower latency**: No HTTP connection overhead per request
|
| 178 |
+
- **Persistent session**: Server maintains your environment state
|
| 179 |
+
- **Efficient for episodes**: Better for many sequential steps
|
| 180 |
+
|
| 181 |
+
### Concurrent WebSocket Sessions
|
| 182 |
+
|
| 183 |
+
The server supports multiple concurrent WebSocket connections. To enable this,
|
| 184 |
+
modify `server/app.py` to use factory mode:
|
| 185 |
+
|
| 186 |
+
```python
|
| 187 |
+
# In server/app.py - use factory mode for concurrent sessions
|
| 188 |
+
app = create_app(
|
| 189 |
+
ToolUseEnvironment, # Pass class, not instance
|
| 190 |
+
ToolUseAction,
|
| 191 |
+
ToolUseObservation,
|
| 192 |
+
max_concurrent_envs=4, # Allow 4 concurrent sessions
|
| 193 |
+
)
|
| 194 |
+
```
|
| 195 |
+
|
| 196 |
+
Then multiple clients can connect simultaneously:
|
| 197 |
+
|
| 198 |
+
```python
|
| 199 |
+
from tool_use_env import ToolUseAction, ToolUseEnv
|
| 200 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 201 |
+
|
| 202 |
+
def run_episode(client_id: int):
|
| 203 |
+
with ToolUseEnv(base_url="http://localhost:8000") as env:
|
| 204 |
+
result = env.reset()
|
| 205 |
+
for i in range(10):
|
| 206 |
+
result = env.step(ToolUseAction(message=f"Client {client_id}, step {i}"))
|
| 207 |
+
return client_id, result.observation.message_length
|
| 208 |
+
|
| 209 |
+
# Run 4 episodes concurrently
|
| 210 |
+
with ThreadPoolExecutor(max_workers=4) as executor:
|
| 211 |
+
results = list(executor.map(run_episode, range(4)))
|
| 212 |
+
```
|
| 213 |
+
|
| 214 |
+
## Development & Testing
|
| 215 |
+
|
| 216 |
+
### Direct Environment Testing
|
| 217 |
+
|
| 218 |
+
Test the environment logic directly without starting the HTTP server:
|
| 219 |
+
|
| 220 |
+
```bash
|
| 221 |
+
# From the environment root directory
|
| 222 |
+
python3 server/tool_use_env_environment.py
|
| 223 |
+
```
|
| 224 |
+
|
| 225 |
+
This verifies that:
|
| 226 |
+
- Environment resets correctly
|
| 227 |
+
- Step executes actions properly
|
| 228 |
+
- State tracking works
|
| 229 |
+
- Rewards are calculated correctly
|
| 230 |
+
|
| 231 |
+
### Running Locally
|
| 232 |
+
|
| 233 |
+
Run the server locally for development:
|
| 234 |
+
|
| 235 |
+
```bash
|
| 236 |
+
uvicorn server.app:app --reload
|
| 237 |
+
```
|
| 238 |
+
|
| 239 |
+
## Project Structure
|
| 240 |
+
|
| 241 |
+
```
|
| 242 |
+
tool_use_env/
|
| 243 |
+
├── .dockerignore # Docker build exclusions
|
| 244 |
+
├── __init__.py # Module exports
|
| 245 |
+
├── README.md # This file
|
| 246 |
+
├── openenv.yaml # OpenEnv manifest
|
| 247 |
+
├── pyproject.toml # Project metadata and dependencies
|
| 248 |
+
├── uv.lock # Locked dependencies (generated)
|
| 249 |
+
├── client.py # ToolUseEnv client
|
| 250 |
+
├── models.py # Action and Observation models
|
| 251 |
+
└── server/
|
| 252 |
+
├── __init__.py # Server module exports
|
| 253 |
+
├── tool_use_env_environment.py # Core environment logic
|
| 254 |
+
├── app.py # FastAPI application (HTTP + WebSocket endpoints)
|
| 255 |
+
└── Dockerfile # Container image definition
|
| 256 |
+
```
|
tool_use_env/__init__.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Tool Use Env Environment."""
|
| 8 |
+
|
| 9 |
+
from .client import ToolUseEnv
|
| 10 |
+
from .models import ToolUseAction, ToolUseObservation, ToolUseState
|
| 11 |
+
|
| 12 |
+
__all__ = [
|
| 13 |
+
"ToolUseAction",
|
| 14 |
+
"ToolUseObservation",
|
| 15 |
+
"ToolUseState",
|
| 16 |
+
"ToolUseEnv",
|
| 17 |
+
]
|
tool_use_env/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (356 Bytes). View file
|
|
|
tool_use_env/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (364 Bytes). View file
|
|
|
tool_use_env/__pycache__/__init__.cpython-314.pyc
ADDED
|
Binary file (392 Bytes). View file
|
|
|
tool_use_env/__pycache__/client.cpython-312.pyc
ADDED
|
Binary file (3.96 kB). View file
|
|
|
tool_use_env/__pycache__/client.cpython-313.pyc
ADDED
|
Binary file (2.26 kB). View file
|
|
|
tool_use_env/__pycache__/client.cpython-314.pyc
ADDED
|
Binary file (4.58 kB). View file
|
|
|
tool_use_env/__pycache__/grader.cpython-312.pyc
ADDED
|
Binary file (2.58 kB). View file
|
|
|
tool_use_env/__pycache__/models.cpython-312.pyc
ADDED
|
Binary file (4.25 kB). View file
|
|
|
tool_use_env/__pycache__/models.cpython-313.pyc
ADDED
|
Binary file (1.41 kB). View file
|
|
|
tool_use_env/agents/__pycache__/baseline.cpython-313.pyc
ADDED
|
Binary file (4.72 kB). View file
|
|
|
tool_use_env/agents/baseline.py
ADDED
|
@@ -0,0 +1,267 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# from tool_use_env.client import ToolUseEnv
|
| 2 |
+
# from tool_use_env.models import ToolUseAction
|
| 3 |
+
# import random
|
| 4 |
+
|
| 5 |
+
# def rule_based_policy(query: str):
|
| 6 |
+
# query = query.lower()
|
| 7 |
+
|
| 8 |
+
# # --- Introduce slight imperfection ---
|
| 9 |
+
# if random.random() < 0.1:
|
| 10 |
+
# return "answer_directly"
|
| 11 |
+
|
| 12 |
+
# if "what is" in query and any(op in query for op in ["+", "-", "*", "/"]):
|
| 13 |
+
# return "use_calculator"
|
| 14 |
+
|
| 15 |
+
# if "capital" in query or "who is" in query:
|
| 16 |
+
# return "use_search"
|
| 17 |
+
|
| 18 |
+
# return "answer_directly"
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# def run_single_episode(env):
|
| 22 |
+
# result = env.reset()
|
| 23 |
+
# obs = result.observation
|
| 24 |
+
|
| 25 |
+
# query = obs.query
|
| 26 |
+
# action_type = rule_based_policy(query)
|
| 27 |
+
|
| 28 |
+
# action = ToolUseAction(action_type=action_type)
|
| 29 |
+
|
| 30 |
+
# result = env.step(action)
|
| 31 |
+
# obs = result.observation
|
| 32 |
+
|
| 33 |
+
# return {
|
| 34 |
+
# "query": query,
|
| 35 |
+
# "action": action_type,
|
| 36 |
+
# "reward": result.reward,
|
| 37 |
+
# "message": obs.message
|
| 38 |
+
# }
|
| 39 |
+
|
| 40 |
+
# def run_evaluation(num_episodes=20):
|
| 41 |
+
# results = []
|
| 42 |
+
|
| 43 |
+
# difficulty_scores = {
|
| 44 |
+
# "easy": [],
|
| 45 |
+
# "medium": [],
|
| 46 |
+
# "hard": []
|
| 47 |
+
# }
|
| 48 |
+
|
| 49 |
+
# total_score = 0
|
| 50 |
+
|
| 51 |
+
# with ToolUseEnv(base_url="http://localhost:8000").sync() as env:
|
| 52 |
+
# for _ in range(num_episodes):
|
| 53 |
+
# result = env.reset()
|
| 54 |
+
# obs = result.observation
|
| 55 |
+
# query = obs.query
|
| 56 |
+
# state = env.state()
|
| 57 |
+
# difficulty = state.difficulty
|
| 58 |
+
|
| 59 |
+
# action_type = rule_based_policy(query)
|
| 60 |
+
# action = ToolUseAction(action_type=action_type)
|
| 61 |
+
|
| 62 |
+
# result = env.step(action)
|
| 63 |
+
|
| 64 |
+
# score = result.reward
|
| 65 |
+
# total_score += score
|
| 66 |
+
|
| 67 |
+
# difficulty_scores[difficulty].append(score)
|
| 68 |
+
|
| 69 |
+
# results.append({
|
| 70 |
+
# "query": query,
|
| 71 |
+
# "difficulty": difficulty,
|
| 72 |
+
# "action": action_type,
|
| 73 |
+
# "score": score,
|
| 74 |
+
# "message": result.observation.message
|
| 75 |
+
# })
|
| 76 |
+
|
| 77 |
+
# avg_score = total_score / num_episodes
|
| 78 |
+
|
| 79 |
+
# print("\n=== OVERALL PERFORMANCE ===")
|
| 80 |
+
# print(f"Average Score: {avg_score:.2f}")
|
| 81 |
+
|
| 82 |
+
# print("\n=== DIFFICULTY BREAKDOWN ===")
|
| 83 |
+
# for level in difficulty_scores:
|
| 84 |
+
# if difficulty_scores[level]:
|
| 85 |
+
# avg = sum(difficulty_scores[level]) / len(difficulty_scores[level])
|
| 86 |
+
# print(f"{level.capitalize()}: {avg:.2f}")
|
| 87 |
+
|
| 88 |
+
# print("\n=== SAMPLE CASES ===")
|
| 89 |
+
# for r in results[:5]:
|
| 90 |
+
# print(f"\nQuery: {r['query']}")
|
| 91 |
+
# print(f"Action: {r['action']}")
|
| 92 |
+
# print(f"Score: {r['score']:.2f}")
|
| 93 |
+
# print(f"Details: {r['message']}")
|
| 94 |
+
|
| 95 |
+
# return results
|
| 96 |
+
|
| 97 |
+
# def analyze_failures(results):
|
| 98 |
+
# wrong_decisions = 0
|
| 99 |
+
# tool_failures = 0
|
| 100 |
+
# total = len(results)
|
| 101 |
+
|
| 102 |
+
# for r in results:
|
| 103 |
+
# msg = r["message"]
|
| 104 |
+
|
| 105 |
+
# if "Correct: False" in msg:
|
| 106 |
+
# if "use_" in msg:
|
| 107 |
+
# tool_failures += 1
|
| 108 |
+
# else:
|
| 109 |
+
# wrong_decisions += 1
|
| 110 |
+
|
| 111 |
+
# print("\n=== FAILURE ANALYSIS ===")
|
| 112 |
+
# print(f"Tool failures: {tool_failures}/{total} ({(tool_failures/total)*100:.1f}%)")
|
| 113 |
+
# print(f"Wrong decisions: {wrong_decisions}/{total} ({(wrong_decisions/total)*100:.1f}%)")
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
# if __name__ == "__main__":
|
| 117 |
+
# results = run_evaluation(50)
|
| 118 |
+
# analyze_failures(results)
|
| 119 |
+
|
| 120 |
+
import os
|
| 121 |
+
import random
|
| 122 |
+
from collections import defaultdict
|
| 123 |
+
|
| 124 |
+
from dotenv import load_dotenv
|
| 125 |
+
from openai import OpenAI
|
| 126 |
+
|
| 127 |
+
from tool_use_env.client import ToolUseEnv
|
| 128 |
+
from tool_use_env.models import ToolUseAction
|
| 129 |
+
|
| 130 |
+
# --- Load environment variables ---
|
| 131 |
+
load_dotenv()
|
| 132 |
+
|
| 133 |
+
# --- Initialize OpenAI client ---
|
| 134 |
+
client = OpenAI()
|
| 135 |
+
|
| 136 |
+
# --- Reproducibility ---
|
| 137 |
+
random.seed(42)
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
# 🧠 LLM Policy (CORE)
|
| 141 |
+
def llm_policy(query: str):
    """Ask the LLM which tool to use for *query*.

    Returns one of "use_calculator", "use_search", or "answer_directly".
    Falls back to "answer_directly" when the API call fails or the model
    replies with anything outside the expected action set.
    """
    prompt = f"""
You are an AI agent choosing the best tool.

Available actions:
- use_calculator (for math problems)
- use_search (for factual questions)
- answer_directly (if neither tool is needed)

Query: {query}

Respond with ONLY one of:
use_calculator
use_search
answer_directly
"""

    try:
        # temperature=0 keeps the routing decision as deterministic as possible.
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=0
        )

        action = response.choices[0].message.content.strip()

        # --- Safety check: coerce any unexpected model output to a safe default ---
        if action not in ["use_calculator", "use_search", "answer_directly"]:
            return "answer_directly"

        return action

    except Exception as e:
        # Best-effort policy: log and degrade to the no-tool action rather than crash.
        print(f"[ERROR] LLM call failed: {e}")
        return "answer_directly"
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
# 🧪 Evaluation Loop
|
| 179 |
+
def run_evaluation(num_episodes=50):
    """Run *num_episodes* episodes against a locally running env server.

    For each episode: reset the env, read the query, let the LLM policy
    pick an action, step once, and record the reward.  Prints a per-episode
    score, an overall average, a per-difficulty breakdown, and a few sample
    cases.  Requires the environment server at http://localhost:8000.

    Returns:
        List of per-episode dicts with query, difficulty, action, score,
        and the environment's result message.
    """
    results = []
    total_score = 0

    # Scores bucketed by task difficulty (keys created lazily).
    difficulty_scores = defaultdict(list)

    with ToolUseEnv(base_url="http://localhost:8000").sync() as env:
        for _ in range(num_episodes):
            # --- Reset ---
            result = env.reset()
            obs = result.observation

            query = obs.query

            # --- Get difficulty (exposed via env state, not the observation) ---
            state = env.state()
            difficulty = state.difficulty

            # --- LLM decides action ---
            action_type = llm_policy(query)
            action = ToolUseAction(action_type=action_type)

            # --- Step ---
            result = env.step(action)
            obs = result.observation

            score = result.reward
            total_score += score

            difficulty_scores[difficulty].append(score)

            results.append({
                "query": query,
                "difficulty": difficulty,
                "action": action_type,
                "score": score,
                "message": obs.message
            })

            print(f"Score: {score:.2f}")

    # --- Overall ---
    avg_score = total_score / num_episodes

    print("\n=== OVERALL PERFORMANCE ===")
    print(f"Average Score: {avg_score:.2f}")

    # --- Breakdown ---
    print("\n=== DIFFICULTY BREAKDOWN ===")
    for level in ["easy", "medium", "hard"]:
        if difficulty_scores[level]:
            avg = sum(difficulty_scores[level]) / len(difficulty_scores[level])
            print(f"{level.capitalize()}: {avg:.2f}")

    # --- Sample Cases ---
    print("\n=== SAMPLE CASES ===")
    for r in results[:5]:
        print(f"\nQuery: {r['query']}")
        print(f"Action: {r['action']}")
        print(f"Score: {r['score']:.2f}")
        print(f"Details: {r['message']}")

    return results
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
# 📊 Failure Analysis
|
| 245 |
+
def analyze_failures(results):
    """Classify failed episodes and print a summary.

    An episode counts as failed when its message contains "Correct: False".
    Failed episodes whose message mentions a tool ("use_") are counted as
    tool failures; the rest are wrong routing decisions.

    Args:
        results: Per-episode dicts as produced by run_evaluation (each must
            have a "message" key).

    Returns:
        Dict with "total", "tool_failures", and "wrong_decisions" counts so
        callers can inspect the breakdown programmatically.
    """
    total = len(results)
    tool_failures = 0
    wrong_decisions = 0

    for r in results:
        msg = r["message"]

        if "Correct: False" in msg:
            if "use_" in msg:
                tool_failures += 1
            else:
                wrong_decisions += 1

    print("\n=== FAILURE ANALYSIS ===")
    if total == 0:
        # Guard: the percentage lines below would divide by zero.
        print("No results to analyze.")
    else:
        print(f"Tool failures: {tool_failures}/{total} ({(tool_failures/total)*100:.1f}%)")
        print(f"Wrong decisions: {wrong_decisions}/{total} ({(wrong_decisions/total)*100:.1f}%)")

    return {
        "total": total,
        "tool_failures": tool_failures,
        "wrong_decisions": wrong_decisions,
    }
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
# 🚀 Main
|
| 265 |
+
if __name__ == "__main__":
    # Run 50 evaluation episodes against the local env server, then
    # summarize how the failed episodes break down.
    results = run_evaluation(50)
    analyze_failures(results)
|
tool_use_env/client.py
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# # Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# # All rights reserved.
|
| 3 |
+
# #
|
| 4 |
+
# # This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# # LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
# """Tool Use Env Environment Client."""
|
| 8 |
+
|
| 9 |
+
# from typing import Dict
|
| 10 |
+
|
| 11 |
+
# from openenv.core import EnvClient
|
| 12 |
+
# from openenv.core.client_types import StepResult
|
| 13 |
+
# from openenv.core.env_server.types import State
|
| 14 |
+
|
| 15 |
+
# from .models import ToolUseAction, ToolUseObservation
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
# class ToolUseEnv(
|
| 19 |
+
# EnvClient[ToolUseAction, ToolUseObservation, State]
|
| 20 |
+
# ):
|
| 21 |
+
# """
|
| 22 |
+
# Client for the Tool Use Env Environment.
|
| 23 |
+
|
| 24 |
+
# This client maintains a persistent WebSocket connection to the environment server,
|
| 25 |
+
# enabling efficient multi-step interactions with lower latency.
|
| 26 |
+
# Each client instance has its own dedicated environment session on the server.
|
| 27 |
+
|
| 28 |
+
# Example:
|
| 29 |
+
# >>> # Connect to a running server
|
| 30 |
+
# >>> with ToolUseEnv(base_url="http://localhost:8000") as client:
|
| 31 |
+
# ... result = client.reset()
|
| 32 |
+
# ... print(result.observation.echoed_message)
|
| 33 |
+
# ...
|
| 34 |
+
# ... result = client.step(ToolUseAction(message="Hello!"))
|
| 35 |
+
# ... print(result.observation.echoed_message)
|
| 36 |
+
|
| 37 |
+
# Example with Docker:
|
| 38 |
+
# >>> # Automatically start container and connect
|
| 39 |
+
# >>> client = ToolUseEnv.from_docker_image("tool_use_env-env:latest")
|
| 40 |
+
# >>> try:
|
| 41 |
+
# ... result = client.reset()
|
| 42 |
+
# ... result = client.step(ToolUseAction(message="Test"))
|
| 43 |
+
# ... finally:
|
| 44 |
+
# ... client.close()
|
| 45 |
+
# """
|
| 46 |
+
|
| 47 |
+
# def _step_payload(self, action: ToolUseAction) -> Dict:
|
| 48 |
+
# """
|
| 49 |
+
# Convert ToolUseAction to JSON payload for step message.
|
| 50 |
+
|
| 51 |
+
# Args:
|
| 52 |
+
# action: ToolUseAction instance
|
| 53 |
+
|
| 54 |
+
# Returns:
|
| 55 |
+
# Dictionary representation suitable for JSON encoding
|
| 56 |
+
# """
|
| 57 |
+
# return {
|
| 58 |
+
# "message": action.message,
|
| 59 |
+
# }
|
| 60 |
+
|
| 61 |
+
# def _parse_result(self, payload: Dict) -> StepResult[ToolUseObservation]:
|
| 62 |
+
# """
|
| 63 |
+
# Parse server response into StepResult[ToolUseObservation].
|
| 64 |
+
|
| 65 |
+
# Args:
|
| 66 |
+
# payload: JSON response data from server
|
| 67 |
+
|
| 68 |
+
# Returns:
|
| 69 |
+
# StepResult with ToolUseObservation
|
| 70 |
+
# """
|
| 71 |
+
# obs_data = payload.get("observation", {})
|
| 72 |
+
# observation = ToolUseObservation(
|
| 73 |
+
# echoed_message=obs_data.get("echoed_message", ""),
|
| 74 |
+
# message_length=obs_data.get("message_length", 0),
|
| 75 |
+
# done=payload.get("done", False),
|
| 76 |
+
# reward=payload.get("reward"),
|
| 77 |
+
# metadata=obs_data.get("metadata", {}),
|
| 78 |
+
# )
|
| 79 |
+
|
| 80 |
+
# return StepResult(
|
| 81 |
+
# observation=observation,
|
| 82 |
+
# reward=payload.get("reward"),
|
| 83 |
+
# done=payload.get("done", False),
|
| 84 |
+
# )
|
| 85 |
+
|
| 86 |
+
# def _parse_state(self, payload: Dict) -> State:
|
| 87 |
+
# """
|
| 88 |
+
# Parse server response into State object.
|
| 89 |
+
|
| 90 |
+
# Args:
|
| 91 |
+
# payload: JSON response from state request
|
| 92 |
+
|
| 93 |
+
# Returns:
|
| 94 |
+
# State object with episode_id and step_count
|
| 95 |
+
# """
|
| 96 |
+
# return State(
|
| 97 |
+
# episode_id=payload.get("episode_id"),
|
| 98 |
+
# step_count=payload.get("step_count", 0),
|
| 99 |
+
# )
|
| 100 |
+
|
| 101 |
+
from openenv.core.env_client import EnvClient
|
| 102 |
+
from openenv.core.client_types import StepResult
|
| 103 |
+
|
| 104 |
+
from tool_use_env.models import ToolUseAction, ToolUseObservation, ToolUseState
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
class ToolUseEnv(EnvClient[ToolUseAction, ToolUseObservation, ToolUseState]):
    """Client for the Tool Use environment server.

    Implements the three (de)serialization hooks EnvClient requires:
    building the JSON step payload from a ToolUseAction, and parsing
    server responses back into ToolUseObservation / ToolUseState objects.
    """

    def _step_payload(self, action: ToolUseAction) -> dict:
        """Serialize *action* into the JSON body sent with a step request."""
        return {
            "action_type": action.action_type,
            "artifact_id": action.artifact_id,
            "query": action.query,
            "message": action.message,
            "resolution_code": action.resolution_code,
        }

    def _parse_result(self, payload: dict) -> StepResult:
        """Build a StepResult[ToolUseObservation] from a server step response.

        Missing observation fields fall back to neutral defaults so a partial
        payload does not raise during parsing.
        """
        obs_data = payload.get("observation", {})

        observation = ToolUseObservation(
            done=payload.get("done", False),
            reward=payload.get("reward"),
            task_id=obs_data.get("task_id", ""),
            difficulty=obs_data.get("difficulty", "easy"),
            objective=obs_data.get("objective", ""),
            customer_message=obs_data.get("customer_message", ""),
            workspace_summary=obs_data.get("workspace_summary", ""),
            available_actions=obs_data.get("available_actions", []),
            available_resolution_codes=obs_data.get("available_resolution_codes", []),
            collected_evidence=obs_data.get("collected_evidence", []),
            last_tool_result=obs_data.get("last_tool_result"),
            last_action_error=obs_data.get("last_action_error"),
            remaining_steps=obs_data.get("remaining_steps", 0),
            current_score=obs_data.get("current_score", 0.0),
            metadata=obs_data.get("metadata", {}),
        )

        return StepResult(
            observation=observation,
            reward=payload.get("reward"),
            done=payload.get("done", False),
        )

    def _parse_state(self, payload: dict) -> ToolUseState:
        """Build a ToolUseState from a state-request response payload."""
        return ToolUseState(
            episode_id=payload.get("episode_id"),
            step_count=payload.get("step_count", 0),
            task_id=payload.get("task_id", ""),
            task_name=payload.get("task_name", ""),
            difficulty=payload.get("difficulty", ""),
            objective=payload.get("objective", ""),
            cumulative_reward=payload.get("cumulative_reward", 0.0),
            final_score=payload.get("final_score", 0.0),
            drafted_reply=payload.get("drafted_reply"),
            resolution_code=payload.get("resolution_code"),
            expected_resolution_code=payload.get("expected_resolution_code", ""),
            required_evidence=payload.get("required_evidence", []),
            collected_evidence=payload.get("collected_evidence", []),
            action_history=payload.get("action_history", []),
            repeat_action_count=payload.get("repeat_action_count", 0),
            last_action_error=payload.get("last_action_error"),
            known_artifacts=payload.get("known_artifacts", {}),
            known_policies=payload.get("known_policies", {}),
        )
|
tool_use_env/grader.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import Any
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def _keyword_score(reply: str | None, keywords: list[str]) -> float:
|
| 7 |
+
if not reply or not keywords:
|
| 8 |
+
return 0.0
|
| 9 |
+
|
| 10 |
+
lowered = reply.lower()
|
| 11 |
+
hits = sum(1 for keyword in keywords if keyword.lower() in lowered)
|
| 12 |
+
return hits / len(keywords)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def grade_task(
    task: dict[str, Any],
    collected_evidence: list[str],
    drafted_reply: str | None,
    resolution_code: str | None,
    step_count: int,
    repeat_action_count: int,
) -> dict[str, float]:
    """Score a finished episode against the task's rubric.

    The final score is a weighted blend of four components, each in [0, 1]:
    resolution correctness (50%), evidence gathering (25%), reply keyword
    coverage (20%), and step efficiency (5%).

    Args:
        task: Task definition with "required_evidence",
            "expected_resolution_code", "reply_keywords", and optionally
            "optimal_steps".
        collected_evidence: Evidence keys the agent actually gathered.
        drafted_reply: The agent's customer-facing reply, if any.
        resolution_code: The agent's submitted resolution code, if any.
        step_count: Number of steps the episode took.
        repeat_action_count: Number of duplicated actions.

    Returns:
        Dict of the four component scores plus "final_score", rounded to 3
        decimal places; "final_score" is clamped to [0, 1].
    """
    required_evidence = task["required_evidence"]
    if required_evidence:
        evidence_hits = sum(1 for key in required_evidence if key in collected_evidence)
        evidence_score = evidence_hits / len(required_evidence)
    else:
        # A task requiring no evidence is trivially satisfied; the previous
        # unconditional division raised ZeroDivisionError here.
        evidence_score = 1.0

    resolution_score = 1.0 if resolution_code == task["expected_resolution_code"] else 0.0
    reply_score = _keyword_score(drafted_reply, task["reply_keywords"])

    # Penalize steps beyond the task's optimal budget plus repeated actions,
    # capped so efficiency can cost at most 0.25 of its component.
    optimal_steps = task.get("optimal_steps", len(required_evidence) + 2)
    extra_steps = max(0, step_count - optimal_steps)
    efficiency_penalty = min(0.25, (extra_steps * 0.05) + (repeat_action_count * 0.04))
    efficiency_score = max(0.0, 1.0 - efficiency_penalty)

    final_score = (
        0.5 * resolution_score
        + 0.25 * evidence_score
        + 0.2 * reply_score
        + 0.05 * efficiency_score
    )

    return {
        "resolution_score": round(resolution_score, 3),
        "evidence_score": round(evidence_score, 3),
        "reply_score": round(reply_score, 3),
        "efficiency_score": round(efficiency_score, 3),
        "final_score": round(min(max(final_score, 0.0), 1.0), 3),
    }
|
tool_use_env/models.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Dict, List, Literal, Optional
|
| 2 |
+
|
| 3 |
+
from openenv.core.env_server import Action, Observation, State
|
| 4 |
+
from pydantic import Field
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class ToolUseAction(Action):
    """One agent action for the support-ops environment.

    ``action_type`` selects the tool to run; the remaining optional fields are
    per-tool payloads (only the field matching the chosen action is consulted).
    """

    action_type: Literal[
        "review_ticket",
        "inspect_artifact",
        "search_policy",
        "draft_reply",
        "submit_resolution",
    ] = Field(..., description="The action the agent wants to execute.")
    # Used only with action_type == "inspect_artifact".
    artifact_id: Optional[str] = Field(
        default=None,
        description="Artifact identifier for inspect_artifact, such as order or risk_log.",
    )
    # Used only with action_type == "search_policy".
    query: Optional[str] = Field(
        default=None,
        description="Policy name or search query for search_policy.",
    )
    # Used only with action_type == "draft_reply".
    message: Optional[str] = Field(
        default=None,
        description="Customer-facing reply draft used with draft_reply.",
    )
    # Used only with action_type == "submit_resolution"; ends the episode.
    resolution_code: Optional[str] = Field(
        default=None,
        description="Final resolution code used with submit_resolution.",
    )
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class ToolUseObservation(Observation):
    """Agent-facing view of the environment after reset() or each step().

    Carries the ticket and objective, the set of valid actions/resolution
    codes, progress information, and the latest tool output or validation
    error.
    """

    task_id: str = Field(..., description="Deterministic task identifier.")
    difficulty: Literal["easy", "medium", "hard"] = Field(
        ..., description="Difficulty tier for the active task."
    )
    objective: str = Field(..., description="Concrete task objective for the agent.")
    customer_message: str = Field(..., description="The raw customer support ticket.")
    workspace_summary: str = Field(
        ..., description="Short summary of known evidence and remaining work."
    )
    available_actions: List[str] = Field(
        default_factory=list, description="Available environment actions."
    )
    available_resolution_codes: List[str] = Field(
        default_factory=list,
        description="Resolution codes accepted by submit_resolution.",
    )
    collected_evidence: List[str] = Field(
        default_factory=list,
        description="Evidence keys collected so far, such as ticket or payment.",
    )
    last_tool_result: Optional[str] = Field(
        default=None,
        description="Most recent tool or grader output shown to the agent.",
    )
    last_action_error: Optional[str] = Field(
        default=None, description="Validation error for the last action, if any."
    )
    remaining_steps: int = Field(
        ..., description="How many steps are left before the episode ends."
    )
    current_score: float = Field(
        default=0.0,
        description="Current deterministic grader score in the [0, 1] range.",
    )
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
class ToolUseState(State):
    """Server-side episode state; a superset of what the agent observes."""

    task_id: str = ""  # identifier of the active task
    task_name: str = ""  # human-readable task name
    difficulty: str = ""  # difficulty tier of the active task
    objective: str = ""  # task objective text shown to the agent
    cumulative_reward: float = 0.0  # running sum of per-step rewards
    final_score: float = 0.0  # grader score once the episode finishes
    drafted_reply: Optional[str] = None  # latest customer reply draft, if any
    resolution_code: Optional[str] = None  # submitted resolution code, if any
    expected_resolution_code: str = ""  # ground-truth code used for grading
    required_evidence: List[str] = Field(default_factory=list)  # evidence keys needed for full credit
    collected_evidence: List[str] = Field(default_factory=list)  # evidence keys gathered so far
    action_history: List[str] = Field(default_factory=list)  # formatted log of actions taken
    repeat_action_count: int = 0  # count of redundant (repeated) actions
    last_action_error: Optional[str] = None  # validation error from the last action, if any
    known_artifacts: Dict[str, str] = Field(default_factory=dict)  # artifact_id -> inspected artifact text
    known_policies: Dict[str, str] = Field(default_factory=dict)  # policy key -> retrieved policy text
|
tool_use_env/openenv_tool_use_env.egg-info/PKG-INFO
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.4
|
| 2 |
+
Name: openenv-tool_use_env
|
| 3 |
+
Version: 0.1.0
|
| 4 |
+
Summary: Tool Use Env environment for OpenEnv
|
| 5 |
+
Requires-Python: >=3.10
|
| 6 |
+
Requires-Dist: openenv-core[core]>=0.2.1
|
| 7 |
+
Provides-Extra: dev
|
| 8 |
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
| 9 |
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
tool_use_env/openenv_tool_use_env.egg-info/SOURCES.txt
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
README.md
|
| 2 |
+
__init__.py
|
| 3 |
+
client.py
|
| 4 |
+
grader.py
|
| 5 |
+
models.py
|
| 6 |
+
pyproject.toml
|
| 7 |
+
./__init__.py
|
| 8 |
+
./client.py
|
| 9 |
+
./grader.py
|
| 10 |
+
./models.py
|
| 11 |
+
openenv_tool_use_env.egg-info/PKG-INFO
|
| 12 |
+
openenv_tool_use_env.egg-info/SOURCES.txt
|
| 13 |
+
openenv_tool_use_env.egg-info/dependency_links.txt
|
| 14 |
+
openenv_tool_use_env.egg-info/entry_points.txt
|
| 15 |
+
openenv_tool_use_env.egg-info/requires.txt
|
| 16 |
+
openenv_tool_use_env.egg-info/top_level.txt
|
| 17 |
+
server/__init__.py
|
| 18 |
+
server/app.py
|
| 19 |
+
server/tool_use_env_environment.py
|
| 20 |
+
tests/test_tools.py
|
tool_use_env/openenv_tool_use_env.egg-info/dependency_links.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
tool_use_env/openenv_tool_use_env.egg-info/entry_points.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[console_scripts]
|
| 2 |
+
server = tool_use_env.server.app:main
|
tool_use_env/openenv_tool_use_env.egg-info/requires.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openenv-core[core]>=0.2.1
|
| 2 |
+
|
| 3 |
+
[dev]
|
| 4 |
+
pytest>=8.0.0
|
| 5 |
+
pytest-cov>=4.0.0
|
tool_use_env/openenv_tool_use_env.egg-info/top_level.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
tool_use_env
|
tool_use_env/pyproject.toml
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
[build-system]
|
| 8 |
+
requires = ["setuptools>=45", "wheel"]
|
| 9 |
+
build-backend = "setuptools.build_meta"
|
| 10 |
+
|
| 11 |
+
[project]
|
| 12 |
+
name = "openenv-tool_use_env"
|
| 13 |
+
version = "0.1.0"
|
| 14 |
+
description = "Tool Use Env environment for OpenEnv"
|
| 15 |
+
requires-python = ">=3.10"
|
| 16 |
+
dependencies = [
|
| 17 |
+
# Core OpenEnv runtime (provides FastAPI server + HTTP client types)
|
| 18 |
+
# install from github
|
| 19 |
+
# "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
|
| 20 |
+
"openenv-core[core]>=0.2.1",
|
| 21 |
+
# Environment-specific dependencies
|
| 22 |
+
# Add all dependencies needed for your environment here
|
| 23 |
+
# Examples:
|
| 24 |
+
# "numpy>=1.19.0",
|
| 25 |
+
# "torch>=2.0.0",
|
| 26 |
+
# "gymnasium>=0.29.0",
|
| 27 |
+
# "openspiel>=1.0.0",
|
| 28 |
+
# "smolagents>=1.22.0,<2",
|
| 29 |
+
]
|
| 30 |
+
|
| 31 |
+
[project.optional-dependencies]
|
| 32 |
+
dev = [
|
| 33 |
+
"pytest>=8.0.0",
|
| 34 |
+
"pytest-cov>=4.0.0",
|
| 35 |
+
]
|
| 36 |
+
|
| 37 |
+
[project.scripts]
|
| 38 |
+
# Server entry point - enables running via: uv run --project . server
|
| 39 |
+
# or: python -m tool_use_env.server.app
|
| 40 |
+
server = "tool_use_env.server.app:main"
|
| 41 |
+
|
| 42 |
+
[tool.setuptools]
|
| 43 |
+
include-package-data = true
|
| 44 |
+
packages = ["tool_use_env", "tool_use_env.server"]
|
| 45 |
+
package-dir = { "tool_use_env" = ".", "tool_use_env.server" = "server" }
|
tool_use_env/server/Dockerfile
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Multi-stage build using openenv-base
# This Dockerfile is flexible and works for both:
# - In-repo environments (with local OpenEnv sources)
# - Standalone environments (with openenv from PyPI/Git)
# The build script (openenv build) handles context detection and sets appropriate build args.

ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
FROM ${BASE_IMAGE} AS builder

WORKDIR /app

# Ensure git is available (required for installing dependencies from VCS)
RUN apt-get update && \
    apt-get install -y --no-install-recommends git && \
    rm -rf /var/lib/apt/lists/*

# Build argument to control whether we're building standalone or in-repo
ARG BUILD_MODE=in-repo
ARG ENV_NAME=tool_use_env

# Copy environment code (always at root of build context)
COPY . /app/env

# For in-repo builds, openenv is already vendored in the build context
# For standalone builds, openenv will be installed via pyproject.toml
WORKDIR /app/env

# Ensure uv is available (for local builds where base image lacks it)
RUN if ! command -v uv >/dev/null 2>&1; then \
    curl -LsSf https://astral.sh/uv/install.sh | sh && \
    mv /root/.local/bin/uv /usr/local/bin/uv && \
    mv /root/.local/bin/uvx /usr/local/bin/uvx; \
    fi

# Install dependencies using uv sync
# If uv.lock exists, use it; otherwise resolve on the fly
# First pass: dependencies only (--no-install-project), so this layer is
# cached and only invalidated when the dependency set changes.
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ -f uv.lock ]; then \
    uv sync --frozen --no-install-project --no-editable; \
    else \
    uv sync --no-install-project --no-editable; \
    fi

# Second pass: installs the project itself on top of the cached dependency layer.
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ -f uv.lock ]; then \
    uv sync --frozen --no-editable; \
    else \
    uv sync --no-editable; \
    fi

# Final runtime stage
FROM ${BASE_IMAGE}

WORKDIR /app

# Copy the virtual environment from builder
COPY --from=builder /app/env/.venv /app/.venv

# Copy the environment code
COPY --from=builder /app/env /app/env

# Set PATH to use the virtual environment
ENV PATH="/app/.venv/bin:$PATH"

# Set PYTHONPATH so imports work correctly
ENV PYTHONPATH="/app/env:$PYTHONPATH"

# Health check
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

# Run the FastAPI server
# The module path is constructed to work with the /app/env structure
CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
|
tool_use_env/server/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Tool Use Env environment server components."""
|
| 8 |
+
|
| 9 |
+
from .tool_use_env_environment import ToolUseEnvironment
|
| 10 |
+
|
| 11 |
+
__all__ = ["ToolUseEnvironment"]
|
tool_use_env/server/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (328 Bytes). View file
|
|
|
tool_use_env/server/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (400 Bytes). View file
|
|
|
tool_use_env/server/__pycache__/app.cpython-312.pyc
ADDED
|
Binary file (1.05 kB). View file
|
|
|
tool_use_env/server/__pycache__/app.cpython-313.pyc
ADDED
|
Binary file (2.8 kB). View file
|
|
|
tool_use_env/server/__pycache__/tool_use_env_environment.cpython-312.pyc
ADDED
|
Binary file (15.3 kB). View file
|
|
|
tool_use_env/server/__pycache__/tool_use_env_environment.cpython-313.pyc
ADDED
|
Binary file (3.83 kB). View file
|
|
|
tool_use_env/server/app.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FastAPI entry point for the support-ops (tool-use) environment server.

Builds the standard OpenEnv HTTP app around the environment, adds a trivial
root route for liveness checks, and exposes ``main`` as the console-script
entry point that runs the app under uvicorn.
"""

import uvicorn
from openenv.core.env_server.http_server import create_app

from tool_use_env.models import ToolUseAction, ToolUseObservation
from tool_use_env.server.tool_use_env_environment import ToolUseEnvironment

# OpenEnv wires /reset, /step, /health etc. around these types.
app = create_app(
    ToolUseEnvironment,
    ToolUseAction,
    ToolUseObservation,
    env_name="support_ops_env",
    max_concurrent_envs=4,
)


@app.get("/")
def root():
    """Simple liveness probe for the hosting platform."""
    return {"status": "running"}


def main(host: str = "0.0.0.0", port: int = 8000) -> None:
    """Run the server with uvicorn (used by the ``server`` console script)."""
    uvicorn.run("tool_use_env.server.app:app", host=host, port=port)


if __name__ == "__main__":
    main()
|
tool_use_env/server/requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openenv
|
| 2 |
+
fastapi
|
| 3 |
+
python-dotenv
|
| 4 |
+
uvicorn
|
| 5 |
+
pydantic
|
| 6 |
+
python-dotenv
|
| 7 |
+
openai
|
tool_use_env/server/tool_use_env_environment.py
ADDED
|
@@ -0,0 +1,351 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import random
|
| 4 |
+
import uuid
|
| 5 |
+
from typing import Any
|
| 6 |
+
|
| 7 |
+
from openenv.core.env_server import Environment
|
| 8 |
+
|
| 9 |
+
from tool_use_env.grader import grade_task
|
| 10 |
+
from tool_use_env.models import ToolUseAction, ToolUseObservation, ToolUseState
|
| 11 |
+
from tool_use_env.tasks import TASKS, TASK_SEQUENCE
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class ToolUseEnvironment(Environment):
    """Deterministic customer-support "tool use" environment.

    An episode presents one support ticket (a task from ``TASKS``). The agent
    gathers evidence via tool actions (review_ticket, inspect_artifact,
    search_policy), optionally drafts a customer reply, and ends the episode
    by submitting a resolution code (or by hitting the step limit). All
    scoring is delegated to ``grade_task``.
    """

    SUPPORTS_CONCURRENT_SESSIONS = True
    # Hard cap on steps per episode; reaching it force-finishes the episode.
    MAX_STEPS = 6

    def __init__(self) -> None:
        super().__init__()
        self._state = ToolUseState()
        # Full task-definition dict for the running episode; None until reset().
        self._active_task: dict[str, Any] | None = None
        # Round-robin pointer into TASK_SEQUENCE for un-seeded resets.
        self._task_cursor = 0

    def _select_task(self, seed: int | None = None, task_id: str | None = None) -> dict[str, Any]:
        """Pick a task: explicit ``task_id`` > seeded random pick > round-robin.

        Raises:
            ValueError: if ``task_id`` is given but not present in ``TASKS``.
        """
        if task_id:
            if task_id not in TASKS:
                raise ValueError(f"Unknown task_id '{task_id}'")
            return TASKS[task_id]

        if seed is not None:
            # A fresh Random(seed) makes the same seed always pick the same task.
            rng = random.Random(seed)
            return TASKS[TASK_SEQUENCE[rng.randrange(len(TASK_SEQUENCE))]]

        # No seed or task_id: cycle deterministically through the task list.
        selected = TASKS[TASK_SEQUENCE[self._task_cursor % len(TASK_SEQUENCE)]]
        self._task_cursor += 1
        return selected

    def reset(
        self,
        seed: int | None = None,
        episode_id: str | None = None,
        **kwargs: Any,
    ) -> ToolUseObservation:
        """Start a new episode and return the initial observation.

        Accepts an optional ``task_id`` through ``kwargs`` to pin a specific
        task; otherwise the task comes from the seed / round-robin selection.
        """
        task = self._select_task(seed=seed, task_id=kwargs.get("task_id"))
        self._active_task = task

        # Rebuild state from scratch; the ticket itself counts as the first
        # piece of collected evidence.
        self._state = ToolUseState(
            episode_id=episode_id or str(uuid.uuid4()),
            step_count=0,
            task_id=task["task_id"],
            task_name=task["task_name"],
            difficulty=task["difficulty"],
            objective=task["objective"],
            cumulative_reward=0.0,
            final_score=0.0,
            drafted_reply=None,
            resolution_code=None,
            expected_resolution_code=task["expected_resolution_code"],
            required_evidence=list(task["required_evidence"]),
            collected_evidence=["ticket"],
            action_history=[],
            repeat_action_count=0,
            last_action_error=None,
            known_artifacts={},
            known_policies={},
        )

        return self._build_observation(
            reward=0.0,
            done=False,
            last_tool_result=(
                "Ticket loaded. Start by reviewing the ticket, then inspect the most relevant "
                "artifacts and policy before submitting a resolution."
            ),
        )

    def _normalize_artifact_id(self, artifact_id: str | None) -> str | None:
        """Canonicalize a user-supplied artifact id (case, spaces, aliases)."""
        if not artifact_id:
            return None
        normalized = artifact_id.strip().lower().replace(" ", "_")
        # Common synonyms are mapped onto the canonical artifact keys.
        aliases = {
            "payments": "payment",
            "billing": "payment",
            "risk": "risk_log",
            "risklog": "risk_log",
            "profile": "account",
        }
        return aliases.get(normalized, normalized)

    def _resolve_policy_key(self, query: str | None) -> str | None:
        """Map a free-text policy query onto one of the task's policy keys.

        Resolution order: exact key match, then alias table, then substring
        match against the known keys. Returns None when nothing matches.
        """
        if not query or not self._active_task:
            return None

        normalized = query.strip().lower().replace(" ", "_")
        policies = self._active_task["policies"]

        if normalized in policies:
            return normalized

        alias_map = {
            "damaged": "damaged_items",
            "damage": "damaged_items",
            "replacement": "damaged_items",
            "duplicate": "duplicate_charge",
            "duplicate_charge": "duplicate_charge",
            "billing": "duplicate_charge",
            "fraud": "account_takeover",
            "takeover": "account_takeover",
            "account_takeover": "account_takeover",
            "security": "account_takeover",
        }
        mapped = alias_map.get(normalized)
        if mapped in policies:
            return mapped

        # Last resort: treat the query as a substring of a policy key.
        for key in policies:
            if normalized in key:
                return key
        return None

    def _record_repeat_if_needed(self, evidence_key: str) -> bool:
        """Return True (and bump the repeat counter) if evidence was already collected."""
        if evidence_key in self._state.collected_evidence:
            self._state.repeat_action_count += 1
            return True
        return False

    def _partial_score(self) -> float:
        """Grade the episode as it currently stands (used for mid-episode observations)."""
        if not self._active_task:
            return 0.0
        return grade_task(
            self._active_task,
            self._state.collected_evidence,
            self._state.drafted_reply,
            self._state.resolution_code,
            self._state.step_count,
            self._state.repeat_action_count,
        )["final_score"]

    def _append_history(self, action: ToolUseAction) -> None:
        """Append a compact, human-readable record of the action to the history log."""
        parts = [action.action_type]
        if action.artifact_id:
            parts.append(f"artifact={action.artifact_id}")
        if action.query:
            parts.append(f"query={action.query}")
        if action.resolution_code:
            parts.append(f"resolution={action.resolution_code}")
        self._state.action_history.append(" | ".join(parts))

    def _build_observation(
        self,
        reward: float,
        done: bool,
        last_tool_result: str | None,
        last_action_error: str | None = None,
    ) -> ToolUseObservation:
        """Assemble the agent-facing observation from the current state.

        Raises:
            RuntimeError: if called before reset() set an active task.
        """
        task = self._active_task
        if not task:
            raise RuntimeError("Environment has no active task.")

        # While running, show the live partial grade; once done, the frozen final grade.
        score = self._state.final_score if done else self._partial_score()
        remaining_steps = max(0, self.MAX_STEPS - self._state.step_count)
        known_items = self._state.collected_evidence or ["ticket"]
        draft_status = "present" if self._state.drafted_reply else "missing"
        resolution_status = self._state.resolution_code or "not submitted"

        summary = (
            f"Known evidence: {', '.join(known_items)}. "
            f"Draft reply: {draft_status}. "
            f"Resolution: {resolution_status}. "
            f"Submit the best supported resolution before steps run out."
        )

        return ToolUseObservation(
            done=done,
            # Rewards are clamped to [0, 1] and rounded for stable wire output.
            reward=round(min(max(reward, 0.0), 1.0), 3),
            task_id=task["task_id"],
            difficulty=task["difficulty"],
            objective=task["objective"],
            customer_message=task["customer_message"],
            workspace_summary=summary,
            available_actions=[
                "review_ticket",
                "inspect_artifact",
                "search_policy",
                "draft_reply",
                "submit_resolution",
            ],
            available_resolution_codes=list(task["available_resolution_codes"]),
            collected_evidence=list(self._state.collected_evidence),
            last_tool_result=last_tool_result,
            last_action_error=last_action_error,
            remaining_steps=remaining_steps,
            current_score=round(score, 3),
            metadata={
                "task_name": task["task_name"],
                "action_history": list(self._state.action_history),
            },
        )

    def _finish_episode(self, resolution_code: str | None, feedback: str) -> ToolUseObservation:
        """Run the final grader, freeze the score, and return the terminal observation."""
        if not self._active_task:
            raise RuntimeError("Environment has no active task.")

        self._state.resolution_code = resolution_code
        breakdown = grade_task(
            self._active_task,
            self._state.collected_evidence,
            self._state.drafted_reply,
            self._state.resolution_code,
            self._state.step_count,
            self._state.repeat_action_count,
        )
        self._state.final_score = breakdown["final_score"]
        self._state.last_action_error = None

        result_text = (
            f"{feedback} | final_score={breakdown['final_score']:.3f} | "
            f"resolution_score={breakdown['resolution_score']:.3f} | "
            f"evidence_score={breakdown['evidence_score']:.3f} | "
            f"reply_score={breakdown['reply_score']:.3f} | "
            f"efficiency_score={breakdown['efficiency_score']:.3f}"
        )

        # The final reward equals the grader's final score.
        return self._build_observation(
            reward=breakdown["final_score"],
            done=True,
            last_tool_result=result_text,
        )

    def step(
        self,
        action: ToolUseAction,
        timeout_s: float | None = None,
        **kwargs: Any,
    ) -> ToolUseObservation:
        """Apply one action and return the next observation.

        Dispatches on ``action.action_type``; invalid payloads produce an
        error observation rather than an exception. The episode ends when a
        valid resolution is submitted or MAX_STEPS is reached.

        Raises:
            RuntimeError: if called before reset().
        """
        if not self._active_task:
            raise RuntimeError("Call reset() before step().")

        # Finished-episode guard. NOTE(review): this relies on final_score
        # being nonzero after grading — plausible since grade_task caps the
        # efficiency penalty at 0.25 (so 0.05 * efficiency_score > 0), but
        # worth confirming if the grading weights ever change.
        if self._state.final_score > 0 and self._state.resolution_code:
            return self._build_observation(
                reward=0.0,
                done=True,
                last_tool_result="Episode already finished.",
                last_action_error="episode_already_done",
            )

        self._state.step_count += 1
        self._append_history(action)

        reward = 0.0
        last_tool_result = None
        error = None

        if action.action_type == "review_ticket":
            # Re-reading the ticket is allowed but earns a reduced reward.
            repeated = self._record_repeat_if_needed("ticket")
            reward = 0.02 if repeated else 0.10
            last_tool_result = self._active_task["customer_message"]

        elif action.action_type == "inspect_artifact":
            artifact_id = self._normalize_artifact_id(action.artifact_id)
            artifacts = self._active_task["artifacts"]
            if not artifact_id or artifact_id not in artifacts:
                error = "invalid_artifact_id"
                last_tool_result = (
                    "Unknown artifact. Valid artifacts: "
                    + ", ".join(sorted(artifacts.keys()))
                )
            else:
                evidence_key = f"artifact:{artifact_id}"
                repeated = self._record_repeat_if_needed(evidence_key)
                if not repeated:
                    self._state.collected_evidence.append(evidence_key)
                    self._state.known_artifacts[artifact_id] = artifacts[artifact_id]
                    # Required evidence pays more than incidental evidence.
                    reward = 0.14 if evidence_key in self._state.required_evidence else 0.04
                else:
                    reward = 0.01
                last_tool_result = artifacts[artifact_id]

        elif action.action_type == "search_policy":
            policy_key = self._resolve_policy_key(action.query)
            policies = self._active_task["policies"]
            if not policy_key:
                error = "policy_not_found"
                last_tool_result = (
                    "No matching policy found. Available policies: "
                    + ", ".join(sorted(policies.keys()))
                )
            else:
                evidence_key = f"policy:{policy_key}"
                repeated = self._record_repeat_if_needed(evidence_key)
                if not repeated:
                    self._state.collected_evidence.append(evidence_key)
                    self._state.known_policies[policy_key] = policies[policy_key]
                    reward = 0.14 if evidence_key in self._state.required_evidence else 0.04
                else:
                    reward = 0.01
                last_tool_result = policies[policy_key]

        elif action.action_type == "draft_reply":
            if not action.message or not action.message.strip():
                error = "empty_reply"
                last_tool_result = "Draft reply cannot be empty."
            else:
                # Re-drafting overwrites the previous draft; only the latest counts.
                self._state.drafted_reply = action.message.strip()
                keywords = self._active_task["reply_keywords"]
                hits = sum(
                    1 for keyword in keywords if keyword.lower() in self._state.drafted_reply.lower()
                )
                # Base reward for drafting plus a bonus scaled by keyword coverage.
                reward = round(0.05 + (0.15 * (hits / len(keywords))), 3)
                last_tool_result = (
                    f"Draft saved. Included {hits}/{len(keywords)} required reply cues."
                )

        elif action.action_type == "submit_resolution":
            if not action.resolution_code:
                error = "missing_resolution_code"
                last_tool_result = "submit_resolution requires a resolution_code."
            elif action.resolution_code not in self._active_task["available_resolution_codes"]:
                error = "invalid_resolution_code"
                last_tool_result = (
                    "Unsupported resolution code. Valid codes: "
                    + ", ".join(self._active_task["available_resolution_codes"])
                )
            else:
                # A valid submission immediately ends and grades the episode.
                return self._finish_episode(
                    resolution_code=action.resolution_code,
                    feedback=f"Resolution submitted: {action.resolution_code}",
                )

        else:
            error = "invalid_action_type"
            last_tool_result = "Unsupported action_type."

        self._state.last_action_error = error
        # Out of steps: grade whatever has been done so far (possibly no resolution).
        if self._state.step_count >= self.MAX_STEPS:
            return self._finish_episode(
                resolution_code=self._state.resolution_code,
                feedback="Episode ended because the step limit was reached.",
            )

        self._state.cumulative_reward = round(self._state.cumulative_reward + reward, 3)
        return self._build_observation(
            reward=reward,
            done=False,
            last_tool_result=last_tool_result,
            last_action_error=error,
        )

    @property
    def state(self) -> ToolUseState:
        """The full server-side state of the current episode."""
        return self._state
|
tool_use_env/tasks.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Scenario catalog for the support-ops tool-use environment, keyed by task_id.
#
# Fields used by the environment code visible in this file:
#   - "reply_keywords": cues counted (case-insensitively) against the drafted
#     reply to compute the draft reward.
#   - "available_resolution_codes": the set accepted by submit_resolution;
#     anything else yields an invalid_resolution_code error.
# Fields whose consumers are elsewhere (hedged — confirm against the grader):
#   - "required_evidence": presumably the lookups the agent must perform
#     ("ticket", "artifact:<name>", "policy:<name>") before resolving.
#   - "expected_resolution_code": presumably compared to the submitted code
#     when scoring the episode.
#   - "optimal_steps": presumably a step-efficiency baseline for grading.
TASKS = {
    "damaged-mug-replacement": {
        "task_id": "damaged-mug-replacement",
        "task_name": "Damaged Mug Replacement",
        "difficulty": "easy",
        "objective": (
            "Resolve a damaged-item support ticket by gathering the right evidence, "
            "drafting a concise customer reply, and submitting the correct resolution."
        ),
        "customer_message": (
            "Hi support, my Northwind ceramic mug from order O-1001 arrived shattered. "
            "I uploaded a photo. I still want the mug if you can replace it quickly."
        ),
        # Free-text lookup results served for "artifact:<key>" evidence requests.
        "artifacts": {
            "order": (
                "Order O-1001 | Item: Northwind ceramic mug | Delivered: 2026-04-01 | "
                "Photo evidence attached: yes | Carrier note: box dented on arrival."
            ),
            "account": (
                "Customer since 2023 | No prior claims abuse | Shipping address verified."
            ),
        },
        # Policy snippets served for "policy:<key>" evidence requests.
        "policies": {
            "damaged_items": (
                "Damaged items reported within 7 days with photo evidence qualify for a "
                "free replacement. Low-cost broken items do not need to be returned."
            )
        },
        "required_evidence": ["ticket", "artifact:order", "policy:damaged_items"],
        "expected_resolution_code": "send_replacement",
        "available_resolution_codes": [
            "send_replacement",
            "issue_refund",
            "request_more_info",
            "deny_request",
        ],
        "reply_keywords": ["replacement", "48 hours", "no need to return"],
        "optimal_steps": 4,
    },
    "duplicate-charge-refund": {
        "task_id": "duplicate-charge-refund",
        "task_name": "Duplicate Charge Refund",
        "difficulty": "medium",
        "objective": (
            "Investigate a billing complaint, confirm whether a duplicate charge occurred, "
            "and choose the correct refund resolution."
        ),
        "customer_message": (
            "I was charged twice for the same blender order and only received one item. "
            "Please fix this. Order number is O-2044."
        ),
        "artifacts": {
            "order": (
                "Order O-2044 | Item: Pulse blender | Fulfilled once on 2026-03-28 | "
                "Only one shipment and one invoice should exist."
            ),
            # The payment record is the key evidence: it shows the double capture.
            "payment": (
                "Payments for O-2044 | Charge 1: CAPTURED $89.00 | Charge 2: CAPTURED $89.00 | "
                "Gateway note: retry after timeout caused duplicate capture."
            ),
            "account": "Customer in good standing | No prior charge disputes.",
        },
        "policies": {
            "duplicate_charge": (
                "If a duplicate capture is confirmed, refund the extra charge to the original "
                "payment method within 3-5 business days."
            )
        },
        "required_evidence": [
            "ticket",
            "artifact:order",
            "artifact:payment",
            "policy:duplicate_charge",
        ],
        # Note the distractor: generic "issue_refund" is available but the
        # expected code is the more specific "refund_duplicate_charge".
        "expected_resolution_code": "refund_duplicate_charge",
        "available_resolution_codes": [
            "refund_duplicate_charge",
            "issue_refund",
            "request_more_info",
            "deny_request",
        ],
        "reply_keywords": ["duplicate charge", "refund", "3-5 business days"],
        "optimal_steps": 5,
    },
    "account-takeover-fraud": {
        "task_id": "account-takeover-fraud",
        "task_name": "Account Takeover Fraud Escalation",
        "difficulty": "hard",
        "objective": (
            "Handle a potential account takeover involving digital gift cards. Gather fraud "
            "signals, apply the correct security-first resolution, and communicate next steps."
        ),
        "customer_message": (
            "I did not place the gift card order on my account last night. Please stop any "
            "more charges and secure my account immediately."
        ),
        "artifacts": {
            "account": (
                "Account A-77 | Password reset disabled | New login from unfamiliar IP at 02:14 UTC | "
                "2FA not enabled."
            ),
            "order": (
                "Order O-7781 | Digital gift cards totaling $250 | Delivered instantly by email | "
                "Marked high-risk by payments."
            ),
            "risk_log": (
                "Risk engine score: 0.98 | Device mismatch: true | Velocity spike: true | "
                "Recommendation: lock account and escalate to fraud operations."
            ),
        },
        "policies": {
            "account_takeover": (
                "Suspected account takeover with digital goods requires immediate account lock, "
                "fraud team escalation, and customer follow-up within 24 hours. Do not promise "
                "an automatic refund before investigation."
            )
        },
        "required_evidence": [
            "ticket",
            "artifact:account",
            "artifact:risk_log",
            "policy:account_takeover",
        ],
        "expected_resolution_code": "lock_account_and_escalate_fraud",
        "available_resolution_codes": [
            "lock_account_and_escalate_fraud",
            "issue_refund",
            "request_more_info",
            "deny_request",
        ],
        "reply_keywords": ["account locked", "fraud team", "24 hours"],
        "optimal_steps": 5,
    },
}
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
# Canonical serving order for the scenarios above, arranged easiest-first.
TASK_SEQUENCE = ["damaged-mug-replacement", "duplicate-charge-refund", "account-takeover-fraud"]
|