Clove25 committed
Commit 18feac5 · verified · 1 Parent(s): a7d2d15

Upload 53 files

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the rest.
Files changed (50)
  1. Dockerfile +14 -0
  2. README.md +127 -10
  3. __pycache__/inference.cpython-313.pyc +0 -0
  4. __pycache__/inference.cpython-314.pyc +0 -0
  5. inference.py +260 -0
  6. openenv.yaml +84 -0
  7. openenv_support_ops_env.egg-info/PKG-INFO +11 -0
  8. openenv_support_ops_env.egg-info/SOURCES.txt +18 -0
  9. openenv_support_ops_env.egg-info/dependency_links.txt +1 -0
  10. openenv_support_ops_env.egg-info/entry_points.txt +2 -0
  11. openenv_support_ops_env.egg-info/requires.txt +7 -0
  12. openenv_support_ops_env.egg-info/top_level.txt +2 -0
  13. pyproject.toml +28 -0
  14. server/__init__.py +1 -0
  15. server/app.py +10 -0
  16. tool_use_env/README.md +256 -0
  17. tool_use_env/__init__.py +17 -0
  18. tool_use_env/__pycache__/__init__.cpython-312.pyc +0 -0
  19. tool_use_env/__pycache__/__init__.cpython-313.pyc +0 -0
  20. tool_use_env/__pycache__/__init__.cpython-314.pyc +0 -0
  21. tool_use_env/__pycache__/client.cpython-312.pyc +0 -0
  22. tool_use_env/__pycache__/client.cpython-313.pyc +0 -0
  23. tool_use_env/__pycache__/client.cpython-314.pyc +0 -0
  24. tool_use_env/__pycache__/grader.cpython-312.pyc +0 -0
  25. tool_use_env/__pycache__/models.cpython-312.pyc +0 -0
  26. tool_use_env/__pycache__/models.cpython-313.pyc +0 -0
  27. tool_use_env/agents/__pycache__/baseline.cpython-313.pyc +0 -0
  28. tool_use_env/agents/baseline.py +267 -0
  29. tool_use_env/client.py +165 -0
  30. tool_use_env/grader.py +48 -0
  31. tool_use_env/models.py +86 -0
  32. tool_use_env/openenv_tool_use_env.egg-info/PKG-INFO +9 -0
  33. tool_use_env/openenv_tool_use_env.egg-info/SOURCES.txt +20 -0
  34. tool_use_env/openenv_tool_use_env.egg-info/dependency_links.txt +1 -0
  35. tool_use_env/openenv_tool_use_env.egg-info/entry_points.txt +2 -0
  36. tool_use_env/openenv_tool_use_env.egg-info/requires.txt +5 -0
  37. tool_use_env/openenv_tool_use_env.egg-info/top_level.txt +1 -0
  38. tool_use_env/pyproject.toml +45 -0
  39. tool_use_env/server/Dockerfile +80 -0
  40. tool_use_env/server/__init__.py +11 -0
  41. tool_use_env/server/__pycache__/__init__.cpython-312.pyc +0 -0
  42. tool_use_env/server/__pycache__/__init__.cpython-313.pyc +0 -0
  43. tool_use_env/server/__pycache__/app.cpython-312.pyc +0 -0
  44. tool_use_env/server/__pycache__/app.cpython-313.pyc +0 -0
  45. tool_use_env/server/__pycache__/tool_use_env_environment.cpython-312.pyc +0 -0
  46. tool_use_env/server/__pycache__/tool_use_env_environment.cpython-313.pyc +0 -0
  47. tool_use_env/server/app.py +29 -0
  48. tool_use_env/server/requirements.txt +7 -0
  49. tool_use_env/server/tool_use_env_environment.py +351 -0
  50. tool_use_env/tasks.py +141 -0
Dockerfile ADDED
@@ -0,0 +1,14 @@
+ FROM python:3.11-slim
+
+ WORKDIR /app
+
+ ENV PYTHONUNBUFFERED=1
+
+ COPY . /app
+
+ RUN pip install --no-cache-dir --upgrade pip && \
+     pip install --no-cache-dir .
+
+ EXPOSE 8000
+
+ CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000"]
README.md CHANGED
@@ -1,10 +1,127 @@
- ---
- title: ToolUseEnv
- emoji: 📊
- colorFrom: gray
- colorTo: yellow
- sdk: docker
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+ ## Why this environment
+
+ Modern tool-using agents often fail on operational workflows that require evidence gathering, policy compliance, and safe escalation. This environment targets that gap with deterministic tasks that resemble what ecommerce support, trust-and-safety, and operations agents do every day.
+
+ ## Task set
+
+ The benchmark ships with three deterministic tasks and matching deterministic graders:
+
+ 1. `damaged-mug-replacement` (`easy`)
+    Resolve a damaged-item replacement request.
+ 2. `duplicate-charge-refund` (`medium`)
+    Investigate a duplicate billing complaint and refund the extra capture.
+ 3. `account-takeover-fraud` (`hard`)
+    Handle a suspected account takeover with a security-first fraud escalation.
+
+ Each task has a fixed expected resolution, required evidence, and reply keywords. The grader returns a score in `[0.0, 1.0]` from weighted resolution accuracy, evidence coverage, reply quality, and efficiency.
+
+ ## Action space
+
+ The environment uses a typed `ToolUseAction` model with these actions:
+
+ - `review_ticket`
+ - `inspect_artifact`
+ - `search_policy`
+ - `draft_reply`
+ - `submit_resolution`
+
+ Optional fields on the action are `artifact_id`, `query`, `message`, and `resolution_code`.
+
+ ## Observation space
+
+ The typed `ToolUseObservation` includes:
+
+ - `task_id`, `difficulty`, `objective`
+ - `customer_message`
+ - `workspace_summary`
+ - `available_actions`
+ - `available_resolution_codes`
+ - `collected_evidence`
+ - `last_tool_result`
+ - `last_action_error`
+ - `remaining_steps`
+ - `current_score`
+
+ The typed `ToolUseState` exposes internal progress such as `final_score`, `drafted_reply`, `resolution_code`, `required_evidence`, `collected_evidence`, and action history.
+
+ ## Reward design
+
+ The reward is shaped over the full trajectory:
+
+ - Positive reward for first-time collection of relevant artifacts and policies
+ - Smaller reward for drafting a reply that includes required customer-facing details
+ - Very small or zero reward for repeated or invalid actions
+ - Final step reward equal to the deterministic grader score
+
+ This gives agents signal before the final submission while still anchoring the episode outcome to task completion quality.
+
+ ## Setup
+
+ ### Local Python
+
+ ```bash
+ UV_CACHE_DIR=/tmp/uv-cache uv sync
+ .venv/bin/pip install -e .
+ ```
+
+ ### Run the server
+
+ ```bash
+ .venv/bin/python -m uvicorn server.app:app --host 0.0.0.0 --port 8000
+ ```
+
+ ### Docker
+
+ ```bash
+ docker build -t support-ops-openenv .
+ docker run --rm -p 8000:8000 support-ops-openenv
+ ```
+
+ ## Baseline inference
+
+ The required root `inference.py` uses the OpenAI client for model calls and emits the mandatory `[START]`, `[STEP]`, and `[END]` logs.
+
+ Environment variables:
+
+ - `HF_TOKEN` or `OPENAI_API_KEY`
+ - `API_BASE_URL`
+ - `MODEL_NAME`
+ - `LOCAL_IMAGE_NAME` if you want to run via `from_docker_image()`
+ - `ENV_BASE_URL` if you want to connect to a running server
+
+ Example:
+
+ ```bash
+ export HF_TOKEN=...
+ export API_BASE_URL=https://router.huggingface.co/v1
+ export MODEL_NAME=meta-llama/Llama-3.1-8B-Instruct
+ python inference.py
+ ```
+
+ The script evaluates all three tasks in a fixed order for reproducible scoring. If no API key is available, it falls back to a deterministic scripted policy so the benchmark remains runnable offline.
+
+ ## Expected baseline behavior
+
+ The bundled fallback policy should solve all three tasks with high scores because it follows the intended evidence path exactly. Frontier LLMs should also perform well on the easy and medium tasks and show larger variance on the hard fraud-escalation task if they over-index on issuing refunds instead of following policy.
+
+ ## Project structure
+
+ ```text
+ .
+ ├── Dockerfile
+ ├── README.md
+ ├── inference.py
+ ├── openenv.yaml
+ ├── pyproject.toml
+ ├── server/
+ │   └── app.py
+ └── tool_use_env/
+     ├── client.py
+     ├── grader.py
+     ├── models.py
+     ├── tasks.py
+     └── server/
+         ├── app.py
+         └── tool_use_env_environment.py
+ ```
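
For a concrete sense of how the grader blends the four components named above: the `grade_task` function added in `tool_use_env/grader.py` (later in this diff) weights resolution 0.5, evidence 0.25, reply 0.2, and efficiency 0.05. A quick worked example of that blend; the component values below are hypothetical, chosen only to illustrate the arithmetic:

```python
# Weighted blend used by tool_use_env/grader.py (weights taken from this upload).
# The component values are hypothetical, for illustration only.
resolution_score = 1.0   # submitted code matched expected_resolution_code
evidence_score = 2 / 3   # 2 of 3 required evidence keys collected
reply_score = 0.5        # half of the required reply keywords present
efficiency_score = 0.91  # one extra step (0.05) + one repeat (0.04) penalty

final_score = (
    0.5 * resolution_score
    + 0.25 * evidence_score
    + 0.2 * reply_score
    + 0.05 * efficiency_score
)
print(round(final_score, 3))  # 0.812
```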
__pycache__/inference.cpython-313.pyc ADDED
Binary file (6.03 kB).

__pycache__/inference.cpython-314.pyc ADDED
Binary file (6.74 kB).
 
inference.py ADDED
@@ -0,0 +1,260 @@
+ import asyncio
+ import json
+ import os
+ import textwrap
+ from typing import Any, List, Optional
+
+ from openai import OpenAI
+
+ from tool_use_env.client import ToolUseEnv
+ from tool_use_env.models import ToolUseAction
+ from tool_use_env.tasks import TASK_SEQUENCE
+
+ LOCAL_IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME")
+ API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
+ MODEL_NAME = os.getenv("MODEL_NAME", "meta-llama/Llama-3.1-8B-Instruct")
+ API_KEY = os.getenv("HF_TOKEN") or os.getenv("OPENAI_API_KEY") or os.getenv("API_KEY")
+ ENV_BASE_URL = os.getenv("ENV_BASE_URL", "http://127.0.0.1:8000")
+ BENCHMARK = os.getenv("MY_ENV_V4_BENCHMARK", "support_ops_env")
+ MAX_STEPS = 6
+ TEMPERATURE = 0.0
+ MAX_TOKENS = 220
+
+ SYSTEM_PROMPT = textwrap.dedent(
+     """
+     You are operating a customer-support workflow environment.
+     Your job is to gather the minimum necessary evidence, draft a short customer reply,
+     and submit the correct final resolution code.
+
+     Reply with JSON only using this schema:
+     {
+       "action_type": "review_ticket|inspect_artifact|search_policy|draft_reply|submit_resolution",
+       "artifact_id": "optional string",
+       "query": "optional string",
+       "message": "optional string",
+       "resolution_code": "optional string"
+     }
+
+     Use concise messages. Prefer exact artifact ids and exact resolution codes shown in the observation.
+     """
+ ).strip()
+
+
+ def log_start(task: str, env: str, model: str) -> None:
+     print(f"[START] task={task} env={env} model={model}", flush=True)
+
+
+ def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
+     error_val = error if error else "null"
+     print(
+         f"[STEP] step={step} action={action} reward={reward:.2f} "
+         f"done={str(done).lower()} error={error_val}",
+         flush=True,
+     )
+
+
+ def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
+     rewards_str = ",".join(f"{reward:.2f}" for reward in rewards)
+     print(
+         f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}",
+         flush=True,
+     )
+
+
+ def _serialize_action(action: ToolUseAction) -> str:
+     payload = {"action_type": action.action_type}
+     if action.artifact_id:
+         payload["artifact_id"] = action.artifact_id
+     if action.query:
+         payload["query"] = action.query
+     if action.message:
+         payload["message"] = action.message.replace("\n", " ").strip()
+     if action.resolution_code:
+         payload["resolution_code"] = action.resolution_code
+     return json.dumps(payload, ensure_ascii=True, separators=(",", ":"))
+
+
+ def _fallback_action(observation: Any) -> ToolUseAction:
+     evidence = set(observation.collected_evidence)
+     task_id = observation.task_id
+
+     if "ticket" not in evidence:
+         return ToolUseAction(action_type="review_ticket")
+
+     task_plans = {
+         "damaged-mug-replacement": [
+             ToolUseAction(action_type="inspect_artifact", artifact_id="order"),
+             ToolUseAction(action_type="search_policy", query="damaged_items"),
+             ToolUseAction(
+                 action_type="draft_reply",
+                 message=(
+                     "We are sending a replacement within 48 hours. "
+                     "There is no need to return the broken mug."
+                 ),
+             ),
+             ToolUseAction(action_type="submit_resolution", resolution_code="send_replacement"),
+         ],
+         "duplicate-charge-refund": [
+             ToolUseAction(action_type="inspect_artifact", artifact_id="order"),
+             ToolUseAction(action_type="inspect_artifact", artifact_id="payment"),
+             ToolUseAction(action_type="search_policy", query="duplicate_charge"),
+             ToolUseAction(
+                 action_type="draft_reply",
+                 message=(
+                     "We confirmed the duplicate charge and issued a refund. "
+                     "You should see the refund in 3-5 business days."
+                 ),
+             ),
+             ToolUseAction(
+                 action_type="submit_resolution",
+                 resolution_code="refund_duplicate_charge",
+             ),
+         ],
+         "account-takeover-fraud": [
+             ToolUseAction(action_type="inspect_artifact", artifact_id="account"),
+             ToolUseAction(action_type="inspect_artifact", artifact_id="risk_log"),
+             ToolUseAction(action_type="search_policy", query="account_takeover"),
+             ToolUseAction(
+                 action_type="draft_reply",
+                 message=(
+                     "We locked your account immediately and escalated this to our fraud team. "
+                     "You will receive an update within 24 hours."
+                 ),
+             ),
+             ToolUseAction(
+                 action_type="submit_resolution",
+                 resolution_code="lock_account_and_escalate_fraud",
+             ),
+         ],
+     }
+
+     plan = task_plans[task_id]
+     for candidate in plan:
+         if candidate.action_type == "inspect_artifact":
+             if f"artifact:{candidate.artifact_id}" not in evidence:
+                 return candidate
+         elif candidate.action_type == "search_policy":
+             if f"policy:{candidate.query}" not in evidence:
+                 return candidate
+         # Guard against last_tool_result being None before any tool output exists.
+         elif candidate.action_type == "draft_reply" and not (observation.last_tool_result or "").startswith("Draft saved"):
+             return candidate
+         elif candidate.action_type == "submit_resolution":
+             return candidate
+
+     return ToolUseAction(action_type="submit_resolution", resolution_code=observation.available_resolution_codes[0])
+
+
+ def _prompt_for_observation(step: int, observation: Any) -> str:
+     return textwrap.dedent(
+         f"""
+         Step: {step}
+         Task ID: {observation.task_id}
+         Difficulty: {observation.difficulty}
+         Objective: {observation.objective}
+         Customer message: {observation.customer_message}
+         Workspace summary: {observation.workspace_summary}
+         Collected evidence: {observation.collected_evidence}
+         Available resolution codes: {observation.available_resolution_codes}
+         Last tool result: {observation.last_tool_result}
+         Last action error: {observation.last_action_error}
+         Remaining steps: {observation.remaining_steps}
+
+         Return the single best next action as JSON.
+         """
+     ).strip()
+
+
+ def _model_action(client: OpenAI, step: int, observation: Any) -> ToolUseAction:
+     fallback = _fallback_action(observation)
+
+     if not API_KEY:
+         return fallback
+
+     try:
+         completion = client.chat.completions.create(
+             model=MODEL_NAME,
+             messages=[
+                 {"role": "system", "content": SYSTEM_PROMPT},
+                 {"role": "user", "content": _prompt_for_observation(step, observation)},
+             ],
+             temperature=TEMPERATURE,
+             max_tokens=MAX_TOKENS,
+             response_format={"type": "json_object"},
+         )
+         raw = (completion.choices[0].message.content or "").strip()
+         data = json.loads(raw)
+         return ToolUseAction(
+             action_type=data.get("action_type", fallback.action_type),
+             artifact_id=data.get("artifact_id"),
+             query=data.get("query"),
+             message=data.get("message"),
+             resolution_code=data.get("resolution_code"),
+         )
+     except Exception:
+         return fallback
+
+
+ async def _connect_env() -> ToolUseEnv:
+     if LOCAL_IMAGE_NAME:
+         return await ToolUseEnv.from_docker_image(LOCAL_IMAGE_NAME)
+
+     env = ToolUseEnv(base_url=ENV_BASE_URL)
+     await env.connect()
+     return env
+
+
+ async def run_task(client: OpenAI, env: ToolUseEnv, task_id: str) -> float:
+     rewards: List[float] = []
+     steps_taken = 0
+     score = 0.0
+     success = False
+
+     log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)
+
+     try:
+         result = await env.reset(task_id=task_id, seed=7)
+         observation = result.observation
+
+         for step in range(1, MAX_STEPS + 1):
+             if result.done:
+                 break
+
+             action = _model_action(client, step, observation)
+             action_str = _serialize_action(action)
+             result = await env.step(action)
+             observation = result.observation
+
+             reward = float(result.reward or 0.0)
+             done = bool(result.done)
+             error = observation.last_action_error
+             rewards.append(reward)
+             steps_taken = step
+
+             log_step(step=step, action=action_str, reward=reward, done=done, error=error)
+
+             if done:
+                 break
+
+         state = await env.state()
+         score = float(state.final_score)
+         success = score >= 0.8
+     finally:
+         log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
+
+     return score
+
+
+ async def main() -> None:
+     client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY or "missing")
+     env = await _connect_env()
+     try:
+         scores = []
+         for task_id in TASK_SEQUENCE:
+             score = await run_task(client, env, task_id)
+             scores.append(score)
+     finally:
+         await env.close()
+
+
+ if __name__ == "__main__":
+     asyncio.run(main())
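
Given the fixed format strings in `log_start`, `log_step`, and `log_end`, a run of the scripted fallback on the first task would emit lines shaped roughly like the transcript below. The values are illustrative, not captured from a real run; only the field layout is guaranteed by the code above:

```text
[START] task=damaged-mug-replacement env=support_ops_env model=meta-llama/Llama-3.1-8B-Instruct
[STEP] step=1 action={"action_type":"review_ticket"} reward=0.10 done=false error=null
[STEP] step=5 action={"action_type":"submit_resolution","resolution_code":"send_replacement"} reward=1.00 done=true error=null
[END] success=true steps=5 score=1.000 rewards=0.10,0.15,0.15,0.10,1.00
```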
openenv.yaml ADDED
@@ -0,0 +1,84 @@
+ name: support_ops_env
+ description: Customer support operations environment for multi-step tool-using agents
+ version: 1.0.0
+
+ entrypoint: server.app:app
+
+ actions:
+   type: object
+   properties:
+     action_type:
+       type: string
+       enum:
+         - review_ticket
+         - inspect_artifact
+         - search_policy
+         - draft_reply
+         - submit_resolution
+     artifact_id:
+       type: string
+       nullable: true
+     query:
+       type: string
+       nullable: true
+     message:
+       type: string
+       nullable: true
+     resolution_code:
+       type: string
+       nullable: true
+   required:
+     - action_type
+
+ observations:
+   type: object
+   properties:
+     task_id:
+       type: string
+     difficulty:
+       type: string
+       enum: [easy, medium, hard]
+     objective:
+       type: string
+     customer_message:
+       type: string
+     workspace_summary:
+       type: string
+     available_actions:
+       type: array
+       items:
+         type: string
+     available_resolution_codes:
+       type: array
+       items:
+         type: string
+     collected_evidence:
+       type: array
+       items:
+         type: string
+     last_tool_result:
+       type: string
+       nullable: true
+     last_action_error:
+       type: string
+       nullable: true
+     remaining_steps:
+       type: integer
+     current_score:
+       type: number
+
+ reward_range: [0.0, 1.0]
+
+ metadata:
+   benchmark: support_ops_env
+   domain: customer_support
+   difficulty_levels:
+     - easy
+     - medium
+     - hard
+   features:
+     - multi_step_reasoning
+     - tool_selection
+     - policy_lookup
+     - customer_support_triage
+     - shaped_rewards
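
The `actions` block doubles as a JSON-Schema-style contract, so a candidate action can be sanity-checked before it is sent to the server. A minimal sketch, assuming `pyyaml` and `jsonschema` are installed (neither is declared as a dependency in this upload, and `nullable` is an OpenAPI convention that plain JSON Schema validators simply ignore):

```python
import yaml  # assumption: pyyaml is installed
from jsonschema import ValidationError, validate  # assumption: jsonschema is installed

with open("openenv.yaml") as fh:
    manifest = yaml.safe_load(fh)

# Passes: action_type is required and must be one of the five enum values.
validate(instance={"action_type": "review_ticket"}, schema=manifest["actions"])

# Fails the enum check: an action type the manifest does not declare.
try:
    validate(instance={"action_type": "issue_refund"}, schema=manifest["actions"])
except ValidationError as err:
    print(f"rejected: {err.message}")
```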
openenv_support_ops_env.egg-info/PKG-INFO ADDED
@@ -0,0 +1,11 @@
+ Metadata-Version: 2.4
+ Name: openenv-support-ops-env
+ Version: 0.1.0
+ Summary: OpenEnv customer support operations environment
+ Requires-Python: >=3.10
+ Requires-Dist: openenv-core[core]>=0.2.1
+ Requires-Dist: openai>=1.40.0
+ Requires-Dist: python-dotenv>=1.0.1
+ Requires-Dist: uvicorn>=0.30.0
+ Provides-Extra: dev
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
openenv_support_ops_env.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,18 @@
+ README.md
+ pyproject.toml
+ openenv_support_ops_env.egg-info/PKG-INFO
+ openenv_support_ops_env.egg-info/SOURCES.txt
+ openenv_support_ops_env.egg-info/dependency_links.txt
+ openenv_support_ops_env.egg-info/entry_points.txt
+ openenv_support_ops_env.egg-info/requires.txt
+ openenv_support_ops_env.egg-info/top_level.txt
+ server/__init__.py
+ server/app.py
+ tool_use_env/__init__.py
+ tool_use_env/client.py
+ tool_use_env/grader.py
+ tool_use_env/models.py
+ tool_use_env/tasks.py
+ tool_use_env/server/__init__.py
+ tool_use_env/server/app.py
+ tool_use_env/server/tool_use_env_environment.py
openenv_support_ops_env.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
+
openenv_support_ops_env.egg-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
+ [console_scripts]
+ server = server.app:main
openenv_support_ops_env.egg-info/requires.txt ADDED
@@ -0,0 +1,7 @@
+ openenv-core[core]>=0.2.1
+ openai>=1.40.0
+ python-dotenv>=1.0.1
+ uvicorn>=0.30.0
+
+ [dev]
+ pytest>=8.0.0
openenv_support_ops_env.egg-info/top_level.txt ADDED
@@ -0,0 +1,2 @@
+ server
+ tool_use_env
pyproject.toml ADDED
@@ -0,0 +1,28 @@
+ [build-system]
+ requires = ["setuptools>=68", "wheel"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "openenv-support-ops-env"
+ version = "0.1.0"
+ description = "OpenEnv customer support operations environment"
+ requires-python = ">=3.10"
+ dependencies = [
+     "openenv-core[core]>=0.2.1",
+     "openai>=1.40.0",
+     "python-dotenv>=1.0.1",
+     "uvicorn>=0.30.0",
+ ]
+
+ [project.optional-dependencies]
+ dev = [
+     "pytest>=8.0.0",
+ ]
+
+ [project.scripts]
+ server = "server.app:main"
+
+ [tool.setuptools]
+ include-package-data = true
+ packages = ["tool_use_env", "tool_use_env.server", "server"]
+ package-dir = { "tool_use_env" = "tool_use_env", "tool_use_env.server" = "tool_use_env/server", "server" = "server" }
server/__init__.py ADDED
@@ -0,0 +1 @@
+ """Root server shim for OpenEnv validation and uv run server."""
server/app.py ADDED
@@ -0,0 +1,10 @@
+ from tool_use_env.server.app import app as app
+ from tool_use_env.server.app import main as _package_main
+
+
+ def main(host: str = "0.0.0.0", port: int = 8000):
+     _package_main(host=host, port=port)
+
+
+ if __name__ == "__main__":
+     main()
tool_use_env/README.md ADDED
@@ -0,0 +1,256 @@
+ ---
+ title: Tool Use Env Environment Server
+ emoji: 📀
+ colorFrom: purple
+ colorTo: gray
+ sdk: docker
+ pinned: false
+ app_port: 8000
+ base_path: /web
+ tags:
+   - openenv
+ ---
+
+ # Tool Use Env Environment
+
+ A simple test environment that echoes back messages. Perfect for testing the env APIs as well as demonstrating environment usage patterns.
+
+ ## Quick Start
+
+ The simplest way to use the Tool Use Env environment is through the `ToolUseEnv` class:
+
+ ```python
+ from tool_use_env import ToolUseAction, ToolUseEnv
+
+ try:
+     # Create environment from Docker image
+     tool_use_envenv = ToolUseEnv.from_docker_image("tool_use_env-env:latest")
+
+     # Reset
+     result = tool_use_envenv.reset()
+     print(f"Reset: {result.observation.echoed_message}")
+
+     # Send multiple messages
+     messages = ["Hello, World!", "Testing echo", "Final message"]
+
+     for msg in messages:
+         result = tool_use_envenv.step(ToolUseAction(message=msg))
+         print(f"Sent: '{msg}'")
+         print(f"  → Echoed: '{result.observation.echoed_message}'")
+         print(f"  → Length: {result.observation.message_length}")
+         print(f"  → Reward: {result.reward}")
+
+ finally:
+     # Always clean up
+     tool_use_envenv.close()
+ ```
+
+ That's it! The `ToolUseEnv.from_docker_image()` method handles:
+ - Starting the Docker container
+ - Waiting for the server to be ready
+ - Connecting to the environment
+ - Container cleanup when you call `close()`
+
+ ## Building the Docker Image
+
+ Before using the environment, you need to build the Docker image:
+
+ ```bash
+ # From project root
+ docker build -t tool_use_env-env:latest -f server/Dockerfile .
+ ```
+
+ ## Deploying to Hugging Face Spaces
+
+ You can easily deploy your OpenEnv environment to Hugging Face Spaces using the `openenv push` command:
+
+ ```bash
+ # From the environment directory (where openenv.yaml is located)
+ openenv push
+
+ # Or specify options
+ openenv push --namespace my-org --private
+ ```
+
+ The `openenv push` command will:
+ 1. Validate that the directory is an OpenEnv environment (checks for `openenv.yaml`)
+ 2. Prepare a custom build for Hugging Face Docker space (enables web interface)
+ 3. Upload to Hugging Face (ensuring you're logged in)
+
+ ### Prerequisites
+
+ - Authenticate with Hugging Face: The command will prompt for login if not already authenticated
+
+ ### Options
+
+ - `--directory`, `-d`: Directory containing the OpenEnv environment (defaults to current directory)
+ - `--repo-id`, `-r`: Repository ID in format 'username/repo-name' (defaults to 'username/env-name' from openenv.yaml)
+ - `--base-image`, `-b`: Base Docker image to use (overrides Dockerfile FROM)
+ - `--private`: Deploy the space as private (default: public)
+
+ ### Examples
+
+ ```bash
+ # Push to your personal namespace (defaults to username/env-name from openenv.yaml)
+ openenv push
+
+ # Push to a specific repository
+ openenv push --repo-id my-org/my-env
+
+ # Push with a custom base image
+ openenv push --base-image ghcr.io/meta-pytorch/openenv-base:latest
+
+ # Push as a private space
+ openenv push --private
+
+ # Combine options
+ openenv push --repo-id my-org/my-env --base-image custom-base:latest --private
+ ```
+
+ After deployment, your space will be available at:
+ `https://huggingface.co/spaces/<repo-id>`
+
+ The deployed space includes:
+ - **Web Interface** at `/web` - Interactive UI for exploring the environment
+ - **API Documentation** at `/docs` - Full OpenAPI/Swagger interface
+ - **Health Check** at `/health` - Container health monitoring
+ - **WebSocket** at `/ws` - Persistent session endpoint for low-latency interactions
+
+ ## Environment Details
+
+ ### Action
+ **ToolUseAction**: Contains a single field
+ - `message` (str) - The message to echo back
+
+ ### Observation
+ **ToolUseObservation**: Contains the echo response and metadata
+ - `echoed_message` (str) - The message echoed back
+ - `message_length` (int) - Length of the message
+ - `reward` (float) - Reward based on message length (length × 0.1)
+ - `done` (bool) - Always False for echo environment
+ - `metadata` (dict) - Additional info like step count
+
+ ### Reward
+ The reward is calculated as: `message_length × 0.1`
+ - "Hi" → reward: 0.2
+ - "Hello, World!" → reward: 1.3
+ - Empty message → reward: 0.0
+
+ ## Advanced Usage
+
+ ### Connecting to an Existing Server
+
+ If you already have a Tool Use Env environment server running, you can connect directly:
+
+ ```python
+ from tool_use_env import ToolUseEnv
+
+ # Connect to existing server
+ tool_use_envenv = ToolUseEnv(base_url="<ENV_HTTP_URL_HERE>")
+
+ # Use as normal
+ result = tool_use_envenv.reset()
+ result = tool_use_envenv.step(ToolUseAction(message="Hello!"))
+ ```
+
+ Note: When connecting to an existing server, `tool_use_envenv.close()` will NOT stop the server.
+
+ ### Using the Context Manager
+
+ The client supports context manager usage for automatic connection management:
+
+ ```python
+ from tool_use_env import ToolUseAction, ToolUseEnv
+
+ # Connect with context manager (auto-connects and closes)
+ with ToolUseEnv(base_url="http://localhost:8000") as env:
+     result = env.reset()
+     print(f"Reset: {result.observation.echoed_message}")
+     # Multiple steps with low latency
+     for msg in ["Hello", "World", "!"]:
+         result = env.step(ToolUseAction(message=msg))
+         print(f"Echoed: {result.observation.echoed_message}")
+ ```
+
+ The client uses WebSocket connections for:
+ - **Lower latency**: No HTTP connection overhead per request
+ - **Persistent session**: Server maintains your environment state
+ - **Efficient for episodes**: Better for many sequential steps
+
+ ### Concurrent WebSocket Sessions
+
+ The server supports multiple concurrent WebSocket connections. To enable this,
+ modify `server/app.py` to use factory mode:
+
+ ```python
+ # In server/app.py - use factory mode for concurrent sessions
+ app = create_app(
+     ToolUseEnvironment,  # Pass class, not instance
+     ToolUseAction,
+     ToolUseObservation,
+     max_concurrent_envs=4,  # Allow 4 concurrent sessions
+ )
+ ```
+
+ Then multiple clients can connect simultaneously:
+
+ ```python
+ from tool_use_env import ToolUseAction, ToolUseEnv
+ from concurrent.futures import ThreadPoolExecutor
+
+ def run_episode(client_id: int):
+     with ToolUseEnv(base_url="http://localhost:8000") as env:
+         result = env.reset()
+         for i in range(10):
+             result = env.step(ToolUseAction(message=f"Client {client_id}, step {i}"))
+         return client_id, result.observation.message_length
+
+ # Run 4 episodes concurrently
+ with ThreadPoolExecutor(max_workers=4) as executor:
+     results = list(executor.map(run_episode, range(4)))
+ ```
+
+ ## Development & Testing
+
+ ### Direct Environment Testing
+
+ Test the environment logic directly without starting the HTTP server:
+
+ ```bash
+ # From the server directory
+ python3 server/tool_use_env_environment.py
+ ```
+
+ This verifies that:
+ - Environment resets correctly
+ - Step executes actions properly
+ - State tracking works
+ - Rewards are calculated correctly
+
+ ### Running Locally
+
+ Run the server locally for development:
+
+ ```bash
+ uvicorn server.app:app --reload
+ ```
+
+ ## Project Structure
+
+ ```
+ tool_use_env/
+ ├── .dockerignore                    # Docker build exclusions
+ ├── __init__.py                      # Module exports
+ ├── README.md                        # This file
+ ├── openenv.yaml                     # OpenEnv manifest
+ ├── pyproject.toml                   # Project metadata and dependencies
+ ├── uv.lock                          # Locked dependencies (generated)
+ ├── client.py                        # ToolUseEnv client
+ ├── models.py                        # Action and Observation models
+ └── server/
+     ├── __init__.py                  # Server module exports
+     ├── tool_use_env_environment.py  # Core environment logic
+     ├── app.py                       # FastAPI application (HTTP + WebSocket endpoints)
+     └── Dockerfile                   # Container image definition
+ ```
tool_use_env/__init__.py ADDED
@@ -0,0 +1,17 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """Tool Use Env Environment."""
+
+ from .client import ToolUseEnv
+ from .models import ToolUseAction, ToolUseObservation, ToolUseState
+
+ __all__ = [
+     "ToolUseAction",
+     "ToolUseObservation",
+     "ToolUseState",
+     "ToolUseEnv",
+ ]
tool_use_env/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (356 Bytes).

tool_use_env/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (364 Bytes).

tool_use_env/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (392 Bytes).

tool_use_env/__pycache__/client.cpython-312.pyc ADDED
Binary file (3.96 kB).

tool_use_env/__pycache__/client.cpython-313.pyc ADDED
Binary file (2.26 kB).

tool_use_env/__pycache__/client.cpython-314.pyc ADDED
Binary file (4.58 kB).

tool_use_env/__pycache__/grader.cpython-312.pyc ADDED
Binary file (2.58 kB).

tool_use_env/__pycache__/models.cpython-312.pyc ADDED
Binary file (4.25 kB).

tool_use_env/__pycache__/models.cpython-313.pyc ADDED
Binary file (1.41 kB).

tool_use_env/agents/__pycache__/baseline.cpython-313.pyc ADDED
Binary file (4.72 kB).
 
tool_use_env/agents/baseline.py ADDED
@@ -0,0 +1,267 @@
+ # from tool_use_env.client import ToolUseEnv
+ # from tool_use_env.models import ToolUseAction
+ # import random
+
+ # def rule_based_policy(query: str):
+ #     query = query.lower()
+
+ #     # --- Introduce slight imperfection ---
+ #     if random.random() < 0.1:
+ #         return "answer_directly"
+
+ #     if "what is" in query and any(op in query for op in ["+", "-", "*", "/"]):
+ #         return "use_calculator"
+
+ #     if "capital" in query or "who is" in query:
+ #         return "use_search"
+
+ #     return "answer_directly"
+
+
+ # def run_single_episode(env):
+ #     result = env.reset()
+ #     obs = result.observation
+
+ #     query = obs.query
+ #     action_type = rule_based_policy(query)
+
+ #     action = ToolUseAction(action_type=action_type)
+
+ #     result = env.step(action)
+ #     obs = result.observation
+
+ #     return {
+ #         "query": query,
+ #         "action": action_type,
+ #         "reward": result.reward,
+ #         "message": obs.message
+ #     }
+
+ # def run_evaluation(num_episodes=20):
+ #     results = []
+
+ #     difficulty_scores = {
+ #         "easy": [],
+ #         "medium": [],
+ #         "hard": []
+ #     }
+
+ #     total_score = 0
+
+ #     with ToolUseEnv(base_url="http://localhost:8000").sync() as env:
+ #         for _ in range(num_episodes):
+ #             result = env.reset()
+ #             obs = result.observation
+ #             query = obs.query
+ #             state = env.state()
+ #             difficulty = state.difficulty
+
+ #             action_type = rule_based_policy(query)
+ #             action = ToolUseAction(action_type=action_type)
+
+ #             result = env.step(action)
+
+ #             score = result.reward
+ #             total_score += score
+
+ #             difficulty_scores[difficulty].append(score)
+
+ #             results.append({
+ #                 "query": query,
+ #                 "difficulty": difficulty,
+ #                 "action": action_type,
+ #                 "score": score,
+ #                 "message": result.observation.message
+ #             })
+
+ #     avg_score = total_score / num_episodes
+
+ #     print("\n=== OVERALL PERFORMANCE ===")
+ #     print(f"Average Score: {avg_score:.2f}")
+
+ #     print("\n=== DIFFICULTY BREAKDOWN ===")
+ #     for level in difficulty_scores:
+ #         if difficulty_scores[level]:
+ #             avg = sum(difficulty_scores[level]) / len(difficulty_scores[level])
+ #             print(f"{level.capitalize()}: {avg:.2f}")
+
+ #     print("\n=== SAMPLE CASES ===")
+ #     for r in results[:5]:
+ #         print(f"\nQuery: {r['query']}")
+ #         print(f"Action: {r['action']}")
+ #         print(f"Score: {r['score']:.2f}")
+ #         print(f"Details: {r['message']}")
+
+ #     return results
+
+ # def analyze_failures(results):
+ #     wrong_decisions = 0
+ #     tool_failures = 0
+ #     total = len(results)
+
+ #     for r in results:
+ #         msg = r["message"]
+
+ #         if "Correct: False" in msg:
+ #             if "use_" in msg:
+ #                 tool_failures += 1
+ #             else:
+ #                 wrong_decisions += 1
+
+ #     print("\n=== FAILURE ANALYSIS ===")
+ #     print(f"Tool failures: {tool_failures}/{total} ({(tool_failures/total)*100:.1f}%)")
+ #     print(f"Wrong decisions: {wrong_decisions}/{total} ({(wrong_decisions/total)*100:.1f}%)")
+
+
+ # if __name__ == "__main__":
+ #     results = run_evaluation(50)
+ #     analyze_failures(results)
+
+ import os
+ import random
+ from collections import defaultdict
+
+ from dotenv import load_dotenv
+ from openai import OpenAI
+
+ from tool_use_env.client import ToolUseEnv
+ from tool_use_env.models import ToolUseAction
+
+ # --- Load environment variables ---
+ load_dotenv()
+
+ # --- Initialize OpenAI client ---
+ client = OpenAI()
+
+ # --- Reproducibility ---
+ random.seed(42)
+
+
+ # 🧠 LLM Policy (CORE)
+ def llm_policy(query: str):
+     prompt = f"""
+     You are an AI agent choosing the best tool.
+
+     Available actions:
+     - use_calculator (for math problems)
+     - use_search (for factual questions)
+     - answer_directly (if neither tool is needed)
+
+     Query: {query}
+
+     Respond with ONLY one of:
+     use_calculator
+     use_search
+     answer_directly
+     """
+
+     try:
+         response = client.chat.completions.create(
+             model="gpt-4o-mini",
+             messages=[{"role": "user", "content": prompt}],
+             temperature=0
+         )
+
+         action = response.choices[0].message.content.strip()
+
+         # --- Safety check ---
+         if action not in ["use_calculator", "use_search", "answer_directly"]:
+             return "answer_directly"
+
+         return action
+
+     except Exception as e:
+         print(f"[ERROR] LLM call failed: {e}")
+         return "answer_directly"
+
+
+ # 🧪 Evaluation Loop
+ def run_evaluation(num_episodes=50):
+     results = []
+     total_score = 0
+
+     difficulty_scores = defaultdict(list)
+
+     with ToolUseEnv(base_url="http://localhost:8000").sync() as env:
+         for _ in range(num_episodes):
+             # --- Reset ---
+             result = env.reset()
+             obs = result.observation
+
+             query = obs.query
+
+             # --- Get difficulty ---
+             state = env.state()
+             difficulty = state.difficulty
+
+             # --- LLM decides action ---
+             action_type = llm_policy(query)
+             action = ToolUseAction(action_type=action_type)
+
+             # --- Step ---
+             result = env.step(action)
+             obs = result.observation
+
+             score = result.reward
+             total_score += score
+
+             difficulty_scores[difficulty].append(score)
+
+             results.append({
+                 "query": query,
+                 "difficulty": difficulty,
+                 "action": action_type,
+                 "score": score,
+                 "message": obs.message
+             })
+
+             print(f"Score: {score:.2f}")
+
+     # --- Overall ---
+     avg_score = total_score / num_episodes
+
+     print("\n=== OVERALL PERFORMANCE ===")
+     print(f"Average Score: {avg_score:.2f}")
+
+     # --- Breakdown ---
+     print("\n=== DIFFICULTY BREAKDOWN ===")
+     for level in ["easy", "medium", "hard"]:
+         if difficulty_scores[level]:
+             avg = sum(difficulty_scores[level]) / len(difficulty_scores[level])
+             print(f"{level.capitalize()}: {avg:.2f}")
+
+     # --- Sample Cases ---
+     print("\n=== SAMPLE CASES ===")
+     for r in results[:5]:
+         print(f"\nQuery: {r['query']}")
+         print(f"Action: {r['action']}")
+         print(f"Score: {r['score']:.2f}")
+         print(f"Details: {r['message']}")
+
+     return results
+
+
+ # 📊 Failure Analysis
+ def analyze_failures(results):
+     total = len(results)
+     tool_failures = 0
+     wrong_decisions = 0
+
+     for r in results:
+         msg = r["message"]
+
+         if "Correct: False" in msg:
+             if "use_" in msg:
+                 tool_failures += 1
+             else:
+                 wrong_decisions += 1
+
+     print("\n=== FAILURE ANALYSIS ===")
+     print(f"Tool failures: {tool_failures}/{total} ({(tool_failures/total)*100:.1f}%)")
+     print(f"Wrong decisions: {wrong_decisions}/{total} ({(wrong_decisions/total)*100:.1f}%)")
+
+
+ # 🚀 Main
+ if __name__ == "__main__":
+     results = run_evaluation(50)
+     analyze_failures(results)
tool_use_env/client.py ADDED
@@ -0,0 +1,165 @@
+ # # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # # All rights reserved.
+ # #
+ # # This source code is licensed under the BSD-style license found in the
+ # # LICENSE file in the root directory of this source tree.
+
+ # """Tool Use Env Environment Client."""
+
+ # from typing import Dict
+
+ # from openenv.core import EnvClient
+ # from openenv.core.client_types import StepResult
+ # from openenv.core.env_server.types import State
+
+ # from .models import ToolUseAction, ToolUseObservation
+
+
+ # class ToolUseEnv(
+ #     EnvClient[ToolUseAction, ToolUseObservation, State]
+ # ):
+ #     """
+ #     Client for the Tool Use Env Environment.
+
+ #     This client maintains a persistent WebSocket connection to the environment server,
+ #     enabling efficient multi-step interactions with lower latency.
+ #     Each client instance has its own dedicated environment session on the server.
+
+ #     Example:
+ #         >>> # Connect to a running server
+ #         >>> with ToolUseEnv(base_url="http://localhost:8000") as client:
+ #         ...     result = client.reset()
+ #         ...     print(result.observation.echoed_message)
+ #         ...
+ #         ...     result = client.step(ToolUseAction(message="Hello!"))
+ #         ...     print(result.observation.echoed_message)
+
+ #     Example with Docker:
+ #         >>> # Automatically start container and connect
+ #         >>> client = ToolUseEnv.from_docker_image("tool_use_env-env:latest")
+ #         >>> try:
+ #         ...     result = client.reset()
+ #         ...     result = client.step(ToolUseAction(message="Test"))
+ #         ... finally:
+ #         ...     client.close()
+ #     """
+
+ #     def _step_payload(self, action: ToolUseAction) -> Dict:
+ #         """
+ #         Convert ToolUseAction to JSON payload for step message.
+
+ #         Args:
+ #             action: ToolUseAction instance
+
+ #         Returns:
+ #             Dictionary representation suitable for JSON encoding
+ #         """
+ #         return {
+ #             "message": action.message,
+ #         }
+
+ #     def _parse_result(self, payload: Dict) -> StepResult[ToolUseObservation]:
+ #         """
+ #         Parse server response into StepResult[ToolUseObservation].
+
+ #         Args:
+ #             payload: JSON response data from server
+
+ #         Returns:
+ #             StepResult with ToolUseObservation
+ #         """
+ #         obs_data = payload.get("observation", {})
+ #         observation = ToolUseObservation(
+ #             echoed_message=obs_data.get("echoed_message", ""),
+ #             message_length=obs_data.get("message_length", 0),
+ #             done=payload.get("done", False),
+ #             reward=payload.get("reward"),
+ #             metadata=obs_data.get("metadata", {}),
+ #         )
+
+ #         return StepResult(
+ #             observation=observation,
+ #             reward=payload.get("reward"),
+ #             done=payload.get("done", False),
+ #         )
+
+ #     def _parse_state(self, payload: Dict) -> State:
+ #         """
+ #         Parse server response into State object.
+
+ #         Args:
+ #             payload: JSON response from state request
+
+ #         Returns:
+ #             State object with episode_id and step_count
+ #         """
+ #         return State(
+ #             episode_id=payload.get("episode_id"),
+ #             step_count=payload.get("step_count", 0),
+ #         )
+
+ from openenv.core.env_client import EnvClient
+ from openenv.core.client_types import StepResult
+
+ from tool_use_env.models import ToolUseAction, ToolUseObservation, ToolUseState
+
+
+ class ToolUseEnv(EnvClient[ToolUseAction, ToolUseObservation, ToolUseState]):
+
+     def _step_payload(self, action: ToolUseAction) -> dict:
+         return {
+             "action_type": action.action_type,
+             "artifact_id": action.artifact_id,
+             "query": action.query,
+             "message": action.message,
+             "resolution_code": action.resolution_code,
+         }
+
+     def _parse_result(self, payload: dict) -> StepResult:
+         obs_data = payload.get("observation", {})
+
+         observation = ToolUseObservation(
+             done=payload.get("done", False),
+             reward=payload.get("reward"),
+             task_id=obs_data.get("task_id", ""),
+             difficulty=obs_data.get("difficulty", "easy"),
+             objective=obs_data.get("objective", ""),
+             customer_message=obs_data.get("customer_message", ""),
+             workspace_summary=obs_data.get("workspace_summary", ""),
+             available_actions=obs_data.get("available_actions", []),
+             available_resolution_codes=obs_data.get("available_resolution_codes", []),
+             collected_evidence=obs_data.get("collected_evidence", []),
+             last_tool_result=obs_data.get("last_tool_result"),
+             last_action_error=obs_data.get("last_action_error"),
+             remaining_steps=obs_data.get("remaining_steps", 0),
+             current_score=obs_data.get("current_score", 0.0),
+             metadata=obs_data.get("metadata", {}),
+         )
+
+         return StepResult(
+             observation=observation,
+             reward=payload.get("reward"),
+             done=payload.get("done", False),
+         )
+
+     def _parse_state(self, payload: dict) -> ToolUseState:
+         return ToolUseState(
+             episode_id=payload.get("episode_id"),
+             step_count=payload.get("step_count", 0),
+             task_id=payload.get("task_id", ""),
+             task_name=payload.get("task_name", ""),
+             difficulty=payload.get("difficulty", ""),
+             objective=payload.get("objective", ""),
+             cumulative_reward=payload.get("cumulative_reward", 0.0),
+             final_score=payload.get("final_score", 0.0),
+             drafted_reply=payload.get("drafted_reply"),
+             resolution_code=payload.get("resolution_code"),
+             expected_resolution_code=payload.get("expected_resolution_code", ""),
+             required_evidence=payload.get("required_evidence", []),
+             collected_evidence=payload.get("collected_evidence", []),
+             action_history=payload.get("action_history", []),
+             repeat_action_count=payload.get("repeat_action_count", 0),
+             last_action_error=payload.get("last_action_error"),
+             known_artifacts=payload.get("known_artifacts", {}),
+             known_policies=payload.get("known_policies", {}),
+         )
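
The wire format produced by `_step_payload` is a flat dict, so the translation from a typed action to the JSON the server receives is easy to verify by hand. A minimal sketch (the payload shape mirrors the client code above; the action values are hypothetical):

```python
from tool_use_env.models import ToolUseAction

# Hypothetical action: inspect the "payment" artifact.
action = ToolUseAction(action_type="inspect_artifact", artifact_id="payment")

# Mirrors ToolUseEnv._step_payload: unset optional fields are sent as None.
payload = {
    "action_type": action.action_type,
    "artifact_id": action.artifact_id,
    "query": action.query,
    "message": action.message,
    "resolution_code": action.resolution_code,
}
# {'action_type': 'inspect_artifact', 'artifact_id': 'payment',
#  'query': None, 'message': None, 'resolution_code': None}
```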
tool_use_env/grader.py ADDED
@@ -0,0 +1,48 @@
+ from __future__ import annotations
+
+ from typing import Any
+
+
+ def _keyword_score(reply: str | None, keywords: list[str]) -> float:
+     if not reply or not keywords:
+         return 0.0
+
+     lowered = reply.lower()
+     hits = sum(1 for keyword in keywords if keyword.lower() in lowered)
+     return hits / len(keywords)
+
+
+ def grade_task(
+     task: dict[str, Any],
+     collected_evidence: list[str],
+     drafted_reply: str | None,
+     resolution_code: str | None,
+     step_count: int,
+     repeat_action_count: int,
+ ) -> dict[str, float]:
+     required_evidence = task["required_evidence"]
+     evidence_hits = sum(1 for key in required_evidence if key in collected_evidence)
+     evidence_score = evidence_hits / len(required_evidence)
+
+     resolution_score = 1.0 if resolution_code == task["expected_resolution_code"] else 0.0
+     reply_score = _keyword_score(drafted_reply, task["reply_keywords"])
+
+     optimal_steps = task.get("optimal_steps", len(required_evidence) + 2)
+     extra_steps = max(0, step_count - optimal_steps)
+     efficiency_penalty = min(0.25, (extra_steps * 0.05) + (repeat_action_count * 0.04))
+     efficiency_score = max(0.0, 1.0 - efficiency_penalty)
+
+     final_score = (
+         0.5 * resolution_score
+         + 0.25 * evidence_score
+         + 0.2 * reply_score
+         + 0.05 * efficiency_score
+     )
+
+     return {
+         "resolution_score": round(resolution_score, 3),
+         "evidence_score": round(evidence_score, 3),
+         "reply_score": round(reply_score, 3),
+         "efficiency_score": round(efficiency_score, 3),
+         "final_score": round(min(max(final_score, 0.0), 3) if False else min(max(final_score, 0.0), 1.0), 3),
+     }
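
A quick usage sketch for `grade_task`. The task dict below is hypothetical (the real definitions live in `tool_use_env/tasks.py`, which is not shown in this view); it carries only the keys the function reads:

```python
from tool_use_env.grader import grade_task

# Hypothetical task definition with just the keys grade_task consumes.
task = {
    "required_evidence": ["ticket", "artifact:order", "policy:damaged_items"],
    "expected_resolution_code": "send_replacement",
    "reply_keywords": ["replacement", "48 hours"],
    # optimal_steps defaults to len(required_evidence) + 2 = 5 when omitted.
}

scores = grade_task(
    task,
    collected_evidence=["ticket", "artifact:order", "policy:damaged_items"],
    drafted_reply="We are sending a replacement within 48 hours.",
    resolution_code="send_replacement",
    step_count=5,          # matches optimal_steps, so no efficiency penalty
    repeat_action_count=0,
)
# Every component hits 1.0, so the weighted blend is 1.0:
# {'resolution_score': 1.0, 'evidence_score': 1.0, 'reply_score': 1.0,
#  'efficiency_score': 1.0, 'final_score': 1.0}
```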
tool_use_env/models.py ADDED
@@ -0,0 +1,86 @@
+ from typing import Dict, List, Literal, Optional
+
+ from openenv.core.env_server import Action, Observation, State
+ from pydantic import Field
+
+
+ class ToolUseAction(Action):
+     action_type: Literal[
+         "review_ticket",
+         "inspect_artifact",
+         "search_policy",
+         "draft_reply",
+         "submit_resolution",
+     ] = Field(..., description="The action the agent wants to execute.")
+     artifact_id: Optional[str] = Field(
+         default=None,
+         description="Artifact identifier for inspect_artifact, such as order or risk_log.",
+     )
+     query: Optional[str] = Field(
+         default=None,
+         description="Policy name or search query for search_policy.",
+     )
+     message: Optional[str] = Field(
+         default=None,
+         description="Customer-facing reply draft used with draft_reply.",
+     )
+     resolution_code: Optional[str] = Field(
+         default=None,
+         description="Final resolution code used with submit_resolution.",
+     )
+
+
+ class ToolUseObservation(Observation):
+     task_id: str = Field(..., description="Deterministic task identifier.")
+     difficulty: Literal["easy", "medium", "hard"] = Field(
+         ..., description="Difficulty tier for the active task."
+     )
+     objective: str = Field(..., description="Concrete task objective for the agent.")
+     customer_message: str = Field(..., description="The raw customer support ticket.")
+     workspace_summary: str = Field(
+         ..., description="Short summary of known evidence and remaining work."
+     )
+     available_actions: List[str] = Field(
+         default_factory=list, description="Available environment actions."
+     )
+     available_resolution_codes: List[str] = Field(
+         default_factory=list,
+         description="Resolution codes accepted by submit_resolution.",
+     )
+     collected_evidence: List[str] = Field(
+         default_factory=list,
+         description="Evidence keys collected so far, such as ticket or payment.",
+     )
+     last_tool_result: Optional[str] = Field(
+         default=None,
+         description="Most recent tool or grader output shown to the agent.",
+     )
+     last_action_error: Optional[str] = Field(
+         default=None, description="Validation error for the last action, if any."
+     )
+     remaining_steps: int = Field(
+         ..., description="How many steps are left before the episode ends."
+     )
+     current_score: float = Field(
+         default=0.0,
+         description="Current deterministic grader score in the [0, 1] range.",
+     )
+
+
+ class ToolUseState(State):
+     task_id: str = ""
+     task_name: str = ""
+     difficulty: str = ""
+     objective: str = ""
+     cumulative_reward: float = 0.0
+     final_score: float = 0.0
+     drafted_reply: Optional[str] = None
+     resolution_code: Optional[str] = None
+     expected_resolution_code: str = ""
+     required_evidence: List[str] = Field(default_factory=list)
+     collected_evidence: List[str] = Field(default_factory=list)
+     action_history: List[str] = Field(default_factory=list)
+     repeat_action_count: int = 0
+     last_action_error: Optional[str] = None
+     known_artifacts: Dict[str, str] = Field(default_factory=dict)
+     known_policies: Dict[str, str] = Field(default_factory=dict)
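
Because `action_type` is a `Literal`, malformed actions are rejected client-side before they ever reach the server. A small sketch of that behavior, assuming the `Action` base class is a pydantic model (which the `Field` usage above implies):

```python
from pydantic import ValidationError
from tool_use_env.models import ToolUseAction

# Valid: action_type is one of the five Literal values.
ok = ToolUseAction(action_type="search_policy", query="duplicate_charge")

# Invalid: an unknown action_type fails pydantic validation immediately.
try:
    ToolUseAction(action_type="issue_refund")
except ValidationError as err:
    print(err)  # reports that action_type must be one of the five allowed values
```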
tool_use_env/openenv_tool_use_env.egg-info/PKG-INFO ADDED
@@ -0,0 +1,9 @@
+ Metadata-Version: 2.4
+ Name: openenv-tool_use_env
+ Version: 0.1.0
+ Summary: Tool Use Env environment for OpenEnv
+ Requires-Python: >=3.10
+ Requires-Dist: openenv-core[core]>=0.2.1
+ Provides-Extra: dev
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
tool_use_env/openenv_tool_use_env.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,20 @@
+ README.md
+ __init__.py
+ client.py
+ grader.py
+ models.py
+ pyproject.toml
+ ./__init__.py
+ ./client.py
+ ./grader.py
+ ./models.py
+ openenv_tool_use_env.egg-info/PKG-INFO
+ openenv_tool_use_env.egg-info/SOURCES.txt
+ openenv_tool_use_env.egg-info/dependency_links.txt
+ openenv_tool_use_env.egg-info/entry_points.txt
+ openenv_tool_use_env.egg-info/requires.txt
+ openenv_tool_use_env.egg-info/top_level.txt
+ server/__init__.py
+ server/app.py
+ server/tool_use_env_environment.py
+ tests/test_tools.py
tool_use_env/openenv_tool_use_env.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
+
tool_use_env/openenv_tool_use_env.egg-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
+ [console_scripts]
+ server = tool_use_env.server.app:main
tool_use_env/openenv_tool_use_env.egg-info/requires.txt ADDED
@@ -0,0 +1,5 @@
+ openenv-core[core]>=0.2.1
+
+ [dev]
+ pytest>=8.0.0
+ pytest-cov>=4.0.0
tool_use_env/openenv_tool_use_env.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ tool_use_env
tool_use_env/pyproject.toml ADDED
@@ -0,0 +1,45 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ [build-system]
+ requires = ["setuptools>=45", "wheel"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "openenv-tool_use_env"
+ version = "0.1.0"
+ description = "Tool Use Env environment for OpenEnv"
+ requires-python = ">=3.10"
+ dependencies = [
+     # Core OpenEnv runtime (provides FastAPI server + HTTP client types)
+     # install from github
+     # "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
+     "openenv-core[core]>=0.2.1",
+     # Environment-specific dependencies
+     # Add all dependencies needed for your environment here
+     # Examples:
+     # "numpy>=1.19.0",
+     # "torch>=2.0.0",
+     # "gymnasium>=0.29.0",
+     # "openspiel>=1.0.0",
+     # "smolagents>=1.22.0,<2",
+ ]
+
+ [project.optional-dependencies]
+ dev = [
+     "pytest>=8.0.0",
+     "pytest-cov>=4.0.0",
+ ]
+
+ [project.scripts]
+ # Server entry point - enables running via: uv run --project . server
+ # or: python -m tool_use_env.server.app
+ server = "tool_use_env.server.app:main"
+
+ [tool.setuptools]
+ include-package-data = true
+ packages = ["tool_use_env", "tool_use_env.server"]
+ package-dir = { "tool_use_env" = ".", "tool_use_env.server" = "server" }
tool_use_env/server/Dockerfile ADDED
@@ -0,0 +1,80 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # Multi-stage build using openenv-base
+ # This Dockerfile is flexible and works for both:
+ # - In-repo environments (with local OpenEnv sources)
+ # - Standalone environments (with openenv from PyPI/Git)
+ # The build script (openenv build) handles context detection and sets appropriate build args.
+
+ ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
+ FROM ${BASE_IMAGE} AS builder
+
+ WORKDIR /app
+
+ # Ensure git is available (required for installing dependencies from VCS)
+ RUN apt-get update && \
+     apt-get install -y --no-install-recommends git && \
+     rm -rf /var/lib/apt/lists/*
+
+ # Build argument to control whether we're building standalone or in-repo
+ ARG BUILD_MODE=in-repo
+ ARG ENV_NAME=tool_use_env
+
+ # Copy environment code (always at root of build context)
+ COPY . /app/env
+
+ # For in-repo builds, openenv is already vendored in the build context
+ # For standalone builds, openenv will be installed via pyproject.toml
+ WORKDIR /app/env
+
+ # Ensure uv is available (for local builds where base image lacks it)
+ RUN if ! command -v uv >/dev/null 2>&1; then \
+         curl -LsSf https://astral.sh/uv/install.sh | sh && \
+         mv /root/.local/bin/uv /usr/local/bin/uv && \
+         mv /root/.local/bin/uvx /usr/local/bin/uvx; \
+     fi
+
+ # Install dependencies using uv sync
+ # If uv.lock exists, use it; otherwise resolve on the fly
+ RUN --mount=type=cache,target=/root/.cache/uv \
+     if [ -f uv.lock ]; then \
+         uv sync --frozen --no-install-project --no-editable; \
+     else \
+         uv sync --no-install-project --no-editable; \
+     fi
+
+ RUN --mount=type=cache,target=/root/.cache/uv \
+     if [ -f uv.lock ]; then \
+         uv sync --frozen --no-editable; \
+     else \
+         uv sync --no-editable; \
+     fi
+
+ # Final runtime stage
+ FROM ${BASE_IMAGE}
+
+ WORKDIR /app
+
+ # Copy the virtual environment from builder
+ COPY --from=builder /app/env/.venv /app/.venv
+
+ # Copy the environment code
+ COPY --from=builder /app/env /app/env
+
+ # Set PATH to use the virtual environment
+ ENV PATH="/app/.venv/bin:$PATH"
+
+ # Set PYTHONPATH so imports work correctly
+ ENV PYTHONPATH="/app/env:$PYTHONPATH"
+
+ # Health check
+ HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
+     CMD curl -f http://localhost:8000/health || exit 1
+
+ # Run the FastAPI server
+ # The module path is constructed to work with the /app/env structure
+ CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
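The HEALTHCHECK above relies on curl being present in the base image. For illustration, the same probe expressed in Python, exiting 0 on HTTP 200 and 1 otherwise (matching the `curl -f` semantics); a minimal sketch, assuming the server exposes /health on port 8000 as the Dockerfile expects:

import sys
import urllib.request

# Probe the health endpoint; any non-200 status or connection error fails the check.
try:
    with urllib.request.urlopen("http://localhost:8000/health", timeout=3) as resp:
        sys.exit(0 if resp.status == 200 else 1)
except Exception:
    sys.exit(1)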
tool_use_env/server/__init__.py ADDED
@@ -0,0 +1,11 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """Tool Use Env environment server components."""
+
+ from .tool_use_env_environment import ToolUseEnvironment
+
+ __all__ = ["ToolUseEnvironment"]
tool_use_env/server/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (328 Bytes).
tool_use_env/server/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (400 Bytes).
tool_use_env/server/__pycache__/app.cpython-312.pyc ADDED
Binary file (1.05 kB).
tool_use_env/server/__pycache__/app.cpython-313.pyc ADDED
Binary file (2.8 kB).
tool_use_env/server/__pycache__/tool_use_env_environment.cpython-312.pyc ADDED
Binary file (15.3 kB).
tool_use_env/server/__pycache__/tool_use_env_environment.cpython-313.pyc ADDED
Binary file (3.83 kB).
tool_use_env/server/app.py ADDED
@@ -0,0 +1,27 @@
+ import uvicorn
+ from openenv.core.env_server.http_server import create_app
+
+ from tool_use_env.models import ToolUseAction, ToolUseObservation
+ from tool_use_env.server.tool_use_env_environment import ToolUseEnvironment
+
+
+ app = create_app(
+     ToolUseEnvironment,
+     ToolUseAction,
+     ToolUseObservation,
+     env_name="support_ops_env",
+     max_concurrent_envs=4,
+ )
+
+
+ @app.get("/")
+ def root():
+     return {"status": "running"}
+
+
+ def main(host: str = "0.0.0.0", port: int = 8000):
+     uvicorn.run("tool_use_env.server.app:app", host=host, port=port)
+
+
+ if __name__ == "__main__":
+     main()
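Once the server is up, the root route doubles as a smoke test. A minimal sketch using only the standard library:

import json
import urllib.request

# Expected response: {"status": "running"}
with urllib.request.urlopen("http://localhost:8000/") as resp:
    print(json.load(resp))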
tool_use_env/server/requirements.txt ADDED
@@ -0,0 +1,6 @@
+ openenv
+ fastapi
+ uvicorn
+ pydantic
+ python-dotenv
+ openai
tool_use_env/server/tool_use_env_environment.py ADDED
@@ -0,0 +1,354 @@
+ from __future__ import annotations
+
+ import random
+ import uuid
+ from typing import Any
+
+ from openenv.core.env_server import Environment
+
+ from tool_use_env.grader import grade_task
+ from tool_use_env.models import ToolUseAction, ToolUseObservation, ToolUseState
+ from tool_use_env.tasks import TASKS, TASK_SEQUENCE
+
+
+ class ToolUseEnvironment(Environment):
+     SUPPORTS_CONCURRENT_SESSIONS = True
+     MAX_STEPS = 6
+
+     def __init__(self) -> None:
+         super().__init__()
+         self._state = ToolUseState()
+         self._active_task: dict[str, Any] | None = None
+         self._task_cursor = 0
+         self._episode_done = False
+
+     def _select_task(self, seed: int | None = None, task_id: str | None = None) -> dict[str, Any]:
+         if task_id:
+             if task_id not in TASKS:
+                 raise ValueError(f"Unknown task_id '{task_id}'")
+             return TASKS[task_id]
+
+         if seed is not None:
+             rng = random.Random(seed)
+             return TASKS[TASK_SEQUENCE[rng.randrange(len(TASK_SEQUENCE))]]
+
+         selected = TASKS[TASK_SEQUENCE[self._task_cursor % len(TASK_SEQUENCE)]]
+         self._task_cursor += 1
+         return selected
+
+     def reset(
+         self,
+         seed: int | None = None,
+         episode_id: str | None = None,
+         **kwargs: Any,
+     ) -> ToolUseObservation:
+         task = self._select_task(seed=seed, task_id=kwargs.get("task_id"))
+         self._active_task = task
+         self._episode_done = False
+
+         self._state = ToolUseState(
+             episode_id=episode_id or str(uuid.uuid4()),
+             step_count=0,
+             task_id=task["task_id"],
+             task_name=task["task_name"],
+             difficulty=task["difficulty"],
+             objective=task["objective"],
+             cumulative_reward=0.0,
+             final_score=0.0,
+             drafted_reply=None,
+             resolution_code=None,
+             expected_resolution_code=task["expected_resolution_code"],
+             required_evidence=list(task["required_evidence"]),
+             collected_evidence=["ticket"],
+             action_history=[],
+             repeat_action_count=0,
+             last_action_error=None,
+             known_artifacts={},
+             known_policies={},
+         )
+
+         return self._build_observation(
+             reward=0.0,
+             done=False,
+             last_tool_result=(
+                 "Ticket loaded. Start by reviewing the ticket, then inspect the most relevant "
+                 "artifacts and policy before submitting a resolution."
+             ),
+         )
+
+     def _normalize_artifact_id(self, artifact_id: str | None) -> str | None:
+         if not artifact_id:
+             return None
+         normalized = artifact_id.strip().lower().replace(" ", "_")
+         aliases = {
+             "payments": "payment",
+             "billing": "payment",
+             "risk": "risk_log",
+             "risklog": "risk_log",
+             "profile": "account",
+         }
+         return aliases.get(normalized, normalized)
+
+     def _resolve_policy_key(self, query: str | None) -> str | None:
+         if not query or not self._active_task:
+             return None
+
+         normalized = query.strip().lower().replace(" ", "_")
+         policies = self._active_task["policies"]
+
+         if normalized in policies:
+             return normalized
+
+         alias_map = {
+             "damaged": "damaged_items",
+             "damage": "damaged_items",
+             "replacement": "damaged_items",
+             "duplicate": "duplicate_charge",
+             "duplicate_charge": "duplicate_charge",
+             "billing": "duplicate_charge",
+             "fraud": "account_takeover",
+             "takeover": "account_takeover",
+             "account_takeover": "account_takeover",
+             "security": "account_takeover",
+         }
+         mapped = alias_map.get(normalized)
+         if mapped in policies:
+             return mapped
+
+         for key in policies:
+             if normalized in key:
+                 return key
+         return None
+
+     def _record_repeat_if_needed(self, evidence_key: str) -> bool:
+         if evidence_key in self._state.collected_evidence:
+             self._state.repeat_action_count += 1
+             return True
+         return False
+
+     def _partial_score(self) -> float:
+         if not self._active_task:
+             return 0.0
+         return grade_task(
+             self._active_task,
+             self._state.collected_evidence,
+             self._state.drafted_reply,
+             self._state.resolution_code,
+             self._state.step_count,
+             self._state.repeat_action_count,
+         )["final_score"]
+
+     def _append_history(self, action: ToolUseAction) -> None:
+         parts = [action.action_type]
+         if action.artifact_id:
+             parts.append(f"artifact={action.artifact_id}")
+         if action.query:
+             parts.append(f"query={action.query}")
+         if action.resolution_code:
+             parts.append(f"resolution={action.resolution_code}")
+         self._state.action_history.append(" | ".join(parts))
+
+     def _build_observation(
+         self,
+         reward: float,
+         done: bool,
+         last_tool_result: str | None,
+         last_action_error: str | None = None,
+     ) -> ToolUseObservation:
+         task = self._active_task
+         if not task:
+             raise RuntimeError("Environment has no active task.")
+
+         score = self._state.final_score if done else self._partial_score()
+         remaining_steps = max(0, self.MAX_STEPS - self._state.step_count)
+         known_items = self._state.collected_evidence or ["ticket"]
+         draft_status = "present" if self._state.drafted_reply else "missing"
+         resolution_status = self._state.resolution_code or "not submitted"
+
+         summary = (
+             f"Known evidence: {', '.join(known_items)}. "
+             f"Draft reply: {draft_status}. "
+             f"Resolution: {resolution_status}. "
+             "Submit the best supported resolution before steps run out."
+         )
+
+         return ToolUseObservation(
+             done=done,
+             reward=round(min(max(reward, 0.0), 1.0), 3),
+             task_id=task["task_id"],
+             difficulty=task["difficulty"],
+             objective=task["objective"],
+             customer_message=task["customer_message"],
+             workspace_summary=summary,
+             available_actions=[
+                 "review_ticket",
+                 "inspect_artifact",
+                 "search_policy",
+                 "draft_reply",
+                 "submit_resolution",
+             ],
+             available_resolution_codes=list(task["available_resolution_codes"]),
+             collected_evidence=list(self._state.collected_evidence),
+             last_tool_result=last_tool_result,
+             last_action_error=last_action_error,
+             remaining_steps=remaining_steps,
+             current_score=round(score, 3),
+             metadata={
+                 "task_name": task["task_name"],
+                 "action_history": list(self._state.action_history),
+             },
+         )
+
+     def _finish_episode(self, resolution_code: str | None, feedback: str) -> ToolUseObservation:
+         if not self._active_task:
+             raise RuntimeError("Environment has no active task.")
+
+         self._state.resolution_code = resolution_code
+         breakdown = grade_task(
+             self._active_task,
+             self._state.collected_evidence,
+             self._state.drafted_reply,
+             self._state.resolution_code,
+             self._state.step_count,
+             self._state.repeat_action_count,
+         )
+         self._state.final_score = breakdown["final_score"]
+         self._state.last_action_error = None
+         self._episode_done = True
+
+         result_text = (
+             f"{feedback} | final_score={breakdown['final_score']:.3f} | "
+             f"resolution_score={breakdown['resolution_score']:.3f} | "
+             f"evidence_score={breakdown['evidence_score']:.3f} | "
+             f"reply_score={breakdown['reply_score']:.3f} | "
+             f"efficiency_score={breakdown['efficiency_score']:.3f}"
+         )
+
+         return self._build_observation(
+             reward=breakdown["final_score"],
+             done=True,
+             last_tool_result=result_text,
+         )
+
+     def step(
+         self,
+         action: ToolUseAction,
+         timeout_s: float | None = None,
+         **kwargs: Any,
+     ) -> ToolUseObservation:
+         if not self._active_task:
+             raise RuntimeError("Call reset() before step().")
+
+         # Refuse further steps once the episode has been graded.
+         if self._episode_done:
+             return self._build_observation(
+                 reward=0.0,
+                 done=True,
+                 last_tool_result="Episode already finished.",
+                 last_action_error="episode_already_done",
+             )
+
+         self._state.step_count += 1
+         self._append_history(action)
+
+         reward = 0.0
+         last_tool_result = None
+         error = None
+
+         if action.action_type == "review_ticket":
+             repeated = self._record_repeat_if_needed("ticket")
+             reward = 0.02 if repeated else 0.10
+             last_tool_result = self._active_task["customer_message"]
+
+         elif action.action_type == "inspect_artifact":
+             artifact_id = self._normalize_artifact_id(action.artifact_id)
+             artifacts = self._active_task["artifacts"]
+             if not artifact_id or artifact_id not in artifacts:
+                 error = "invalid_artifact_id"
+                 last_tool_result = (
+                     "Unknown artifact. Valid artifacts: "
+                     + ", ".join(sorted(artifacts.keys()))
+                 )
+             else:
+                 evidence_key = f"artifact:{artifact_id}"
+                 repeated = self._record_repeat_if_needed(evidence_key)
+                 if not repeated:
+                     self._state.collected_evidence.append(evidence_key)
+                     self._state.known_artifacts[artifact_id] = artifacts[artifact_id]
+                     reward = 0.14 if evidence_key in self._state.required_evidence else 0.04
+                 else:
+                     reward = 0.01
+                 last_tool_result = artifacts[artifact_id]
+
+         elif action.action_type == "search_policy":
+             policy_key = self._resolve_policy_key(action.query)
+             policies = self._active_task["policies"]
+             if not policy_key:
+                 error = "policy_not_found"
+                 last_tool_result = (
+                     "No matching policy found. Available policies: "
+                     + ", ".join(sorted(policies.keys()))
+                 )
+             else:
+                 evidence_key = f"policy:{policy_key}"
+                 repeated = self._record_repeat_if_needed(evidence_key)
+                 if not repeated:
+                     self._state.collected_evidence.append(evidence_key)
+                     self._state.known_policies[policy_key] = policies[policy_key]
+                     reward = 0.14 if evidence_key in self._state.required_evidence else 0.04
+                 else:
+                     reward = 0.01
+                 last_tool_result = policies[policy_key]
+
+         elif action.action_type == "draft_reply":
+             if not action.message or not action.message.strip():
+                 error = "empty_reply"
+                 last_tool_result = "Draft reply cannot be empty."
+             else:
+                 self._state.drafted_reply = action.message.strip()
+                 keywords = self._active_task["reply_keywords"]
+                 hits = sum(
+                     1 for keyword in keywords if keyword.lower() in self._state.drafted_reply.lower()
+                 )
+                 reward = round(0.05 + (0.15 * (hits / len(keywords))), 3)
+                 last_tool_result = (
+                     f"Draft saved. Included {hits}/{len(keywords)} required reply cues."
+                 )
+
+         elif action.action_type == "submit_resolution":
+             if not action.resolution_code:
+                 error = "missing_resolution_code"
+                 last_tool_result = "submit_resolution requires a resolution_code."
+             elif action.resolution_code not in self._active_task["available_resolution_codes"]:
+                 error = "invalid_resolution_code"
+                 last_tool_result = (
+                     "Unsupported resolution code. Valid codes: "
+                     + ", ".join(self._active_task["available_resolution_codes"])
+                 )
+             else:
+                 return self._finish_episode(
+                     resolution_code=action.resolution_code,
+                     feedback=f"Resolution submitted: {action.resolution_code}",
+                 )
+
+         else:
+             error = "invalid_action_type"
+             last_tool_result = "Unsupported action_type."
+
+         self._state.last_action_error = error
+         if self._state.step_count >= self.MAX_STEPS:
+             return self._finish_episode(
+                 resolution_code=self._state.resolution_code,
+                 feedback="Episode ended because the step limit was reached.",
+             )
+
+         self._state.cumulative_reward = round(self._state.cumulative_reward + reward, 3)
+         return self._build_observation(
+             reward=reward,
+             done=False,
+             last_tool_result=last_tool_result,
+             last_action_error=error,
+         )
+
+     @property
+     def state(self) -> ToolUseState:
+         return self._state
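The environment can also be exercised in-process, without the HTTP server. A minimal sketch of one scripted episode against the damaged-mug task, assuming ToolUseAction (defined in tool_use_env/models.py, not shown here) accepts these fields as keyword arguments:

from tool_use_env.models import ToolUseAction
from tool_use_env.server.tool_use_env_environment import ToolUseEnvironment

env = ToolUseEnvironment()
obs = env.reset(task_id="damaged-mug-replacement")

# Gather the required evidence, draft a reply containing the scored cues, then resolve.
steps = [
    ToolUseAction(action_type="inspect_artifact", artifact_id="order"),
    ToolUseAction(action_type="search_policy", query="damaged"),
    ToolUseAction(
        action_type="draft_reply",
        message="A replacement ships within 48 hours; no need to return the broken mug.",
    ),
    ToolUseAction(action_type="submit_resolution", resolution_code="send_replacement"),
]
for act in steps:
    obs = env.step(act)
    print(obs.reward, obs.done, obs.last_action_error)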
tool_use_env/tasks.py ADDED
@@ -0,0 +1,141 @@
+ TASKS = {
+     "damaged-mug-replacement": {
+         "task_id": "damaged-mug-replacement",
+         "task_name": "Damaged Mug Replacement",
+         "difficulty": "easy",
+         "objective": (
+             "Resolve a damaged-item support ticket by gathering the right evidence, "
+             "drafting a concise customer reply, and submitting the correct resolution."
+         ),
+         "customer_message": (
+             "Hi support, my Northwind ceramic mug from order O-1001 arrived shattered. "
+             "I uploaded a photo. I still want the mug if you can replace it quickly."
+         ),
+         "artifacts": {
+             "order": (
+                 "Order O-1001 | Item: Northwind ceramic mug | Delivered: 2026-04-01 | "
+                 "Photo evidence attached: yes | Carrier note: box dented on arrival."
+             ),
+             "account": (
+                 "Customer since 2023 | No prior claims abuse | Shipping address verified."
+             ),
+         },
+         "policies": {
+             "damaged_items": (
+                 "Damaged items reported within 7 days with photo evidence qualify for a "
+                 "free replacement. Low-cost broken items do not need to be returned."
+             )
+         },
+         "required_evidence": ["ticket", "artifact:order", "policy:damaged_items"],
+         "expected_resolution_code": "send_replacement",
+         "available_resolution_codes": [
+             "send_replacement",
+             "issue_refund",
+             "request_more_info",
+             "deny_request",
+         ],
+         "reply_keywords": ["replacement", "48 hours", "no need to return"],
+         "optimal_steps": 4,
+     },
+     "duplicate-charge-refund": {
+         "task_id": "duplicate-charge-refund",
+         "task_name": "Duplicate Charge Refund",
+         "difficulty": "medium",
+         "objective": (
+             "Investigate a billing complaint, confirm whether a duplicate charge occurred, "
+             "and choose the correct refund resolution."
+         ),
+         "customer_message": (
+             "I was charged twice for the same blender order and only received one item. "
+             "Please fix this. Order number is O-2044."
+         ),
+         "artifacts": {
+             "order": (
+                 "Order O-2044 | Item: Pulse blender | Fulfilled once on 2026-03-28 | "
+                 "Only one shipment and one invoice should exist."
+             ),
+             "payment": (
+                 "Payments for O-2044 | Charge 1: CAPTURED $89.00 | Charge 2: CAPTURED $89.00 | "
+                 "Gateway note: retry after timeout caused duplicate capture."
+             ),
+             "account": "Customer in good standing | No prior charge disputes.",
+         },
+         "policies": {
+             "duplicate_charge": (
+                 "If a duplicate capture is confirmed, refund the extra charge to the original "
+                 "payment method within 3-5 business days."
+             )
+         },
+         "required_evidence": [
+             "ticket",
+             "artifact:order",
+             "artifact:payment",
+             "policy:duplicate_charge",
+         ],
+         "expected_resolution_code": "refund_duplicate_charge",
+         "available_resolution_codes": [
+             "refund_duplicate_charge",
+             "issue_refund",
+             "request_more_info",
+             "deny_request",
+         ],
+         "reply_keywords": ["duplicate charge", "refund", "3-5 business days"],
+         "optimal_steps": 5,
+     },
+     "account-takeover-fraud": {
+         "task_id": "account-takeover-fraud",
+         "task_name": "Account Takeover Fraud Escalation",
+         "difficulty": "hard",
+         "objective": (
+             "Handle a potential account takeover involving digital gift cards. Gather fraud "
+             "signals, apply the correct security-first resolution, and communicate next steps."
+         ),
+         "customer_message": (
+             "I did not place the gift card order on my account last night. Please stop any "
+             "more charges and secure my account immediately."
+         ),
+         "artifacts": {
+             "account": (
+                 "Account A-77 | Password reset disabled | New login from unfamiliar IP at 02:14 UTC | "
+                 "2FA not enabled."
+             ),
+             "order": (
+                 "Order O-7781 | Digital gift cards totaling $250 | Delivered instantly by email | "
+                 "Marked high-risk by payments."
+             ),
+             "risk_log": (
+                 "Risk engine score: 0.98 | Device mismatch: true | Velocity spike: true | "
+                 "Recommendation: lock account and escalate to fraud operations."
+             ),
+         },
+         "policies": {
+             "account_takeover": (
+                 "Suspected account takeover with digital goods requires immediate account lock, "
+                 "fraud team escalation, and customer follow-up within 24 hours. Do not promise "
+                 "an automatic refund before investigation."
+             )
+         },
+         "required_evidence": [
+             "ticket",
+             "artifact:account",
+             "artifact:risk_log",
+             "policy:account_takeover",
+         ],
+         "expected_resolution_code": "lock_account_and_escalate_fraud",
+         "available_resolution_codes": [
+             "lock_account_and_escalate_fraud",
+             "issue_refund",
+             "request_more_info",
+             "deny_request",
+         ],
+         "reply_keywords": ["account locked", "fraud team", "24 hours"],
+         "optimal_steps": 5,
+     },
+ }
+
+
+ TASK_SEQUENCE = [
+     "damaged-mug-replacement",
+     "duplicate-charge-refund",
+     "account-takeover-fraud",
+ ]
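When neither a seed nor a task_id is supplied, ToolUseEnvironment cycles through TASK_SEQUENCE in order. A minimal sketch of that round-robin selection, mirroring _select_task above:

from tool_use_env.tasks import TASKS, TASK_SEQUENCE

cursor = 0
for _ in range(4):  # wraps around after the three defined tasks
    task = TASKS[TASK_SEQUENCE[cursor % len(TASK_SEQUENCE)]]
    cursor += 1
    print(task["task_id"], task["difficulty"])
# damaged-mug-replacement easy
# duplicate-charge-refund medium
# account-takeover-fraud hard
# damaged-mug-replacement easy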