Spaces:

Clove25
/

tool-use-openenv

Sleeping

App Files Files Community

Clove25 committed on Mar 31

Commit

d9175ae

verified ·

1 Parent(s): 1f758fd

Upload 41 files

Browse files

Files changed (41) hide show

Dockerfile +21 -0
__pycache__/inference.cpython-313.pyc +0 -0
__pycache__/inference.cpython-314.pyc +0 -0
inference.py +187 -0
openenv.yaml +41 -0
tool_use_env/README.md +256 -0
tool_use_env/__init__.py +16 -0
tool_use_env/__pycache__/__init__.cpython-312.pyc +0 -0
tool_use_env/__pycache__/__init__.cpython-313.pyc +0 -0
tool_use_env/__pycache__/__init__.cpython-314.pyc +0 -0
tool_use_env/__pycache__/client.cpython-312.pyc +0 -0
tool_use_env/__pycache__/client.cpython-313.pyc +0 -0
tool_use_env/__pycache__/client.cpython-314.pyc +0 -0
tool_use_env/__pycache__/grader.cpython-312.pyc +0 -0
tool_use_env/__pycache__/models.cpython-312.pyc +0 -0
tool_use_env/__pycache__/models.cpython-313.pyc +0 -0
tool_use_env/agents/__pycache__/baseline.cpython-313.pyc +0 -0
tool_use_env/agents/baseline.py +267 -0
tool_use_env/client.py +139 -0
tool_use_env/grader.py +25 -0
tool_use_env/models.py +47 -0
tool_use_env/openenv_tool_use_env.egg-info/PKG-INFO +9 -0
tool_use_env/openenv_tool_use_env.egg-info/SOURCES.txt +20 -0
tool_use_env/openenv_tool_use_env.egg-info/dependency_links.txt +1 -0
tool_use_env/openenv_tool_use_env.egg-info/entry_points.txt +2 -0
tool_use_env/openenv_tool_use_env.egg-info/requires.txt +5 -0
tool_use_env/openenv_tool_use_env.egg-info/top_level.txt +1 -0
tool_use_env/pyproject.toml +45 -0
tool_use_env/server/Dockerfile +80 -0
tool_use_env/server/__init__.py +11 -0
tool_use_env/server/__pycache__/__init__.cpython-312.pyc +0 -0
tool_use_env/server/__pycache__/__init__.cpython-313.pyc +0 -0
tool_use_env/server/__pycache__/app.cpython-312.pyc +0 -0
tool_use_env/server/__pycache__/app.cpython-313.pyc +0 -0
tool_use_env/server/__pycache__/tool_use_env_environment.cpython-312.pyc +0 -0
tool_use_env/server/__pycache__/tool_use_env_environment.cpython-313.pyc +0 -0
tool_use_env/server/app.py +23 -0
tool_use_env/server/requirements.txt +7 -0
tool_use_env/server/tool_use_env_environment.py +222 -0
tool_use_env/tests/test_tools.py +23 -0
tool_use_env/uv.lock +0 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,21 @@

+# Root-level image for the Space: installs the tool_use_env package and serves
+# its FastAPI app with uvicorn on port 8000.
+FROM python:3.11-slim
+WORKDIR /app
+# Copy entire project
+# NOTE(review): this also copies the committed __pycache__/ and *.egg-info
+# directories into the image; consider adding a .dockerignore.
+COPY . .
+# Move into package directory
+WORKDIR /app/tool_use_env
+# Install uv (needed for pyproject-based install)
+RUN pip install --no-cache-dir uv
+# Install project + dependencies
+# (editable install into the system interpreter, driven by pyproject.toml)
+RUN uv pip install --system -e .
+# Expose port
+EXPOSE 8000
+# Run server
+# Listens on all interfaces so the container port mapping can reach it.
+CMD ["uvicorn", "tool_use_env.server.app:app", "--host", "0.0.0.0", "--port", "8000"]

__pycache__/inference.cpython-313.pyc ADDED Viewed

Binary file (6.03 kB). View file

__pycache__/inference.cpython-314.pyc ADDED Viewed

Binary file (6.74 kB). View file

inference.py ADDED Viewed

	@@ -0,0 +1,187 @@

+import os
+import random
+from collections import defaultdict
+from dotenv import load_dotenv
+from openai import OpenAI
+from tool_use_env.client import ToolUseEnv
+from tool_use_env.models import ToolUseAction
+# --- Load env ---
+load_dotenv()
+HF_TOKEN = os.getenv("HF_TOKEN")
+HF_MODEL = os.getenv("HF_MODEL", "meta-llama/Meta-Llama-3-8B-Instruct")
+# --- HF client ---
+# The OpenAI client raises at construction time when it has no API key, which
+# would crash this script before the rule-based fallback could ever run.
+# Pass a placeholder when HF_TOKEN is unset; HF_AVAILABLE below keeps the
+# client from being used in that case.
+hf_client = OpenAI(
+    base_url="https://router.huggingface.co/v1",
+    api_key=HF_TOKEN or "missing-hf-token",
+)
+# --- Reproducibility ---
+random.seed(42)
+# --- Global flag ---
+# True while the HF router should still be tried. Starts False when no token
+# is configured; llm_policy() clears it after the first failed call.
+HF_AVAILABLE = bool(HF_TOKEN)
+# 🧠 Rule-based (correct logic)
+def rule_based_policy(query: str):
+    """Pick an action for *query* with simple character heuristics.
+    Returns "use_calculator" when the lower-cased query contains any of the
+    characters + - * /, and "use_search" for every other query.
+    """
+    lowered = query.lower()
+    for symbol in "+-*/":
+        if symbol in lowered:
+            return "use_calculator"
+    # Factual cues ("capital", "who is", "ceo") and all remaining queries map
+    # to the same action, so no separate keyword check is needed.
+    return "use_search"
+# 🧠 Noisy fallback (simulate LLM mistakes)
+def noisy_rule_policy(query: str):
+    """Return the rule-based action for *query*, replaced by a random one 8% of the time.
+    The random replacement is drawn uniformly from the three valid actions and
+    is meant to simulate occasional LLM mistakes when the fallback is in use.
+    """
+    if random.random() < 0.08:   # 8% noise
+        # Bug fix: the noisy choice used to be assigned to a local that was
+        # never returned, so the fallback was always perfectly "correct".
+        return random.choice([
+            "use_calculator",
+            "use_search",
+            "answer_directly"
+        ])
+    return rule_based_policy(query)
+# 🧠 LLM + fallback policy
+def llm_policy(query: str):
+    """Choose an action for *query* via the hosted LLM, falling back to rules.
+    While HF_AVAILABLE is true the model named by HF_MODEL is asked for one of
+    the three actions. With probability 0.08 the model's answer is replaced by
+    a random action. A valid action is returned directly; an invalid answer
+    falls through to noisy_rule_policy(). Any exception from the API call
+    clears HF_AVAILABLE so later calls skip the API entirely.
+    """
+    global HF_AVAILABLE
+    prompt = f"""
+You are an AI agent.
+Choose EXACTLY one action:
+- use_calculator
+- use_search
+- answer_directly
+Query: {query}
+ONLY output one action.
+"""
+    # --- Try HF only if still available ---
+    if HF_AVAILABLE:
+        try:
+            response = hf_client.chat.completions.create(
+                model=HF_MODEL,
+                messages=[{"role": "user", "content": prompt}],
+                temperature=0
+            )
+            # content can be None (empty completion). Treat that as an invalid
+            # answer instead of letting .strip() raise and permanently
+            # disabling the API.
+            action = (response.choices[0].message.content or "").strip()
+            if random.random() < 0.08:
+                action = random.choice([
+                    "use_calculator",
+                    "use_search",
+                    "answer_directly"
+                ])
+            if action in ["use_calculator", "use_search", "answer_directly"]:
+                print("[HF] Used")
+                return action
+        except Exception as e:
+            print("[HF FAILED → switching to fallback permanently]")
+            # Surface the cause (bad token, unknown model, network error, ...)
+            # instead of discarding it.
+            print(f"[HF ERROR] {e}")
+            HF_AVAILABLE = False
+    # --- Fallback ---
+    return noisy_rule_policy(query)
+# 🧪 Evaluation
+def run_evaluation(num_episodes=50):
+    """Run single-step episodes against the local server and print a report.
+    Each episode resets the environment, reads the query and its difficulty,
+    asks llm_policy() for an action, takes one step and records the reward.
+    Args:
+        num_episodes: number of episodes to run.
+    Returns:
+        A list with one dict per episode containing the keys "query",
+        "difficulty", "action", "score" and "message".
+    """
+    results = []
+    total_score = 0
+    difficulty_scores = defaultdict(list)
+    with ToolUseEnv(base_url="http://localhost:8000").sync() as env:
+        for _ in range(num_episodes):
+            result = env.reset()
+            query = result.observation.query
+            difficulty = env.state().difficulty
+            action_type = llm_policy(query)
+            result = env.step(ToolUseAction(action_type=action_type))
+            obs = result.observation
+            # The client fills reward from payload.get("reward"), which is
+            # None when the server omits it; count that as 0 instead of
+            # failing on the arithmetic and formatting below.
+            score = result.reward if result.reward is not None else 0.0
+            total_score += score
+            difficulty_scores[difficulty].append(score)
+            results.append({
+                "query": query,
+                "difficulty": difficulty,
+                "action": action_type,
+                "score": score,
+                "message": obs.message
+            })
+            print(f"Score: {score:.2f}")
+    # Guard against num_episodes <= 0 (previously a ZeroDivisionError).
+    avg_score = total_score / len(results) if results else 0.0
+    print("\n=== OVERALL PERFORMANCE ===")
+    print(f"Average Score: {avg_score:.2f}")
+    print("\n=== DIFFICULTY BREAKDOWN ===")
+    for level in ["easy", "medium", "hard"]:
+        if difficulty_scores[level]:
+            avg = sum(difficulty_scores[level]) / len(difficulty_scores[level])
+            print(f"{level.capitalize()}: {avg:.2f}")
+    print("\n=== SAMPLE CASES ===")
+    for r in results[:5]:
+        print(f"\nQuery: {r['query']}")
+        print(f"Action: {r['action']}")
+        print(f"Score: {r['score']:.2f}")
+        print(f"Details: {r['message']}")
+    return results
+# 📊 Failure analysis (FIXED VERSION)
+def analyze_failures(results):
+    """Print how many low-scoring episodes used a tool versus answered directly.
+    An episode is a failure when its score is below 0.5. Failures whose action
+    contains "use_" are counted as tool failures; every other failure is
+    counted as a wrong decision. Percentages are relative to all episodes.
+    Args:
+        results: list of dicts with "score" and "action" keys, as returned by
+            run_evaluation(). May be empty.
+    """
+    total = len(results)
+    failed_actions = [r["action"] for r in results if r["score"] < 0.5]
+    tool_failures = sum(1 for action in failed_actions if "use_" in action)
+    wrong_decisions = len(failed_actions) - tool_failures
+    def pct(count):
+        # An empty result list used to raise ZeroDivisionError here.
+        return (count / total) * 100 if total else 0.0
+    print("\n=== FAILURE ANALYSIS ===")
+    print(f"Tool failures: {tool_failures}/{total} ({pct(tool_failures):.1f}%)")
+    print(f"Wrong decisions: {wrong_decisions}/{total} ({pct(wrong_decisions):.1f}%)")
+# 🚀 Run
+if __name__ == "__main__":
+    # Script entry point: evaluate 50 episodes, then summarise the
+    # low-scoring ones from the collected results.
+    results = run_evaluation(50)
+    analyze_failures(results)

openenv.yaml ADDED Viewed

	@@ -0,0 +1,41 @@

+name: tool_use_env
+description: Evaluate AI agents on reliable tool usage under uncertainty
+version: 1.0
+entrypoint: server.app:app
+actions:
+  type: object
+  properties:
+    action_type:
+      type: string
+      enum:
+        - use_calculator
+        - use_search
+        - answer_directly
+observations:
+  type: object
+  properties:
+    query:
+      type: string
+    tool_output:
+      type: string
+      nullable: true
+    message:
+      type: string
+reward_range: [0.0, 1.0]
+metadata:
+  difficulty_levels:
+    - easy
+    - medium
+    - hard
+  features:
+    - tool_selection
+    - partial_rewards
+    - decision_making
+    - efficiency_penalty

tool_use_env/README.md ADDED Viewed

	@@ -0,0 +1,256 @@

+---
+title: Tool Use Env Environment Server
+emoji: 📀
+colorFrom: purple
+colorTo: gray
+sdk: docker
+pinned: false
+app_port: 8000
+base_path: /web
+tags:
+  - openenv
+---
+# Tool Use Env Environment
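+An OpenEnv environment that evaluates AI agents on reliable tool usage under uncertainty: for each query the agent chooses exactly one action (`use_calculator`, `use_search`, or `answer_directly`) and receives a reward between 0.0 and 1.0. It is also useful for testing the env APIs and demonstrating environment usage patterns.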
+## Quick Start
+The simplest way to use the Tool Use Env environment is through the `ToolUseEnv` class:
+```python
+from tool_use_env import ToolUseAction, ToolUseEnv
+try:
+    # Create environment from Docker image
+    tool_use_envenv = ToolUseEnv.from_docker_image("tool_use_env-env:latest")
+    # Reset
+    result = tool_use_envenv.reset()
+    print(f"Reset: {result.observation.echoed_message}")
+    # Send multiple messages
+    messages = ["Hello, World!", "Testing echo", "Final message"]
+    for msg in messages:
+        result = tool_use_envenv.step(ToolUseAction(message=msg))
+        print(f"Sent: '{msg}'")
+        print(f"  → Echoed: '{result.observation.echoed_message}'")
+        print(f"  → Length: {result.observation.message_length}")
+        print(f"  → Reward: {result.reward}")
+finally:
+    # Always clean up
+    tool_use_envenv.close()
+```
+That's it! The `ToolUseEnv.from_docker_image()` method handles:
+- Starting the Docker container
+- Waiting for the server to be ready
+- Connecting to the environment
+- Container cleanup when you call `close()`
+## Building the Docker Image
+Before using the environment, you need to build the Docker image:
+```bash
+# From project root
+docker build -t tool_use_env-env:latest -f server/Dockerfile .
+```
+## Deploying to Hugging Face Spaces
+You can easily deploy your OpenEnv environment to Hugging Face Spaces using the `openenv push` command:
+```bash
+# From the environment directory (where openenv.yaml is located)
+openenv push
+# Or specify options
+openenv push --namespace my-org --private
+```
+The `openenv push` command will:
+1. Validate that the directory is an OpenEnv environment (checks for `openenv.yaml`)
+2. Prepare a custom build for Hugging Face Docker space (enables web interface)
+3. Upload to Hugging Face (ensuring you're logged in)
+### Prerequisites
+- Authenticate with Hugging Face: The command will prompt for login if not already authenticated
+### Options
+- `--directory`, `-d`: Directory containing the OpenEnv environment (defaults to current directory)
+- `--repo-id`, `-r`: Repository ID in format 'username/repo-name' (defaults to 'username/env-name' from openenv.yaml)
+- `--base-image`, `-b`: Base Docker image to use (overrides Dockerfile FROM)
+- `--private`: Deploy the space as private (default: public)
+### Examples
+```bash
+# Push to your personal namespace (defaults to username/env-name from openenv.yaml)
+openenv push
+# Push to a specific repository
+openenv push --repo-id my-org/my-env
+# Push with a custom base image
+openenv push --base-image ghcr.io/meta-pytorch/openenv-base:latest
+# Push as a private space
+openenv push --private
+# Combine options
+openenv push --repo-id my-org/my-env --base-image custom-base:latest --private
+```
+After deployment, your space will be available at:
+`https://huggingface.co/spaces/<repo-id>`
+The deployed space includes:
+- **Web Interface** at `/web` - Interactive UI for exploring the environment
+- **API Documentation** at `/docs` - Full OpenAPI/Swagger interface
+- **Health Check** at `/health` - Container health monitoring
+- **WebSocket** at `/ws` - Persistent session endpoint for low-latency interactions
+## Environment Details
+### Action
+**ToolUseAction**: Contains a single field
+- `action_type` (str) - The action to take: `use_calculator`, `use_search`, or `answer_directly`
+### Observation
+**ToolUseObservation**: Contains the query, the tool result, and a feedback message
+- `query` (str) - The query the agent has to handle
+- `tool_output` (str or null) - Output of the tool, if any
+- `message` (str) - Human-readable details about the step
+- `reward` (float) - Reward in the range 0.0 to 1.0
+- `done` (bool) - Whether the episode has finished
+- `metadata` (dict) - Additional info like step count
+### Reward
+The reward is calculated as: `message_length × 0.1`
+- "Hi" → reward: 0.2
+- "Hello, World!" → reward: 1.3
+- Empty message → reward: 0.0
+## Advanced Usage
+### Connecting to an Existing Server
+If you already have a Tool Use Env environment server running, you can connect directly:
+```python
+from tool_use_env import ToolUseEnv
+# Connect to existing server
+tool_use_envenv = ToolUseEnv(base_url="<ENV_HTTP_URL_HERE>")
+# Use as normal
+result = tool_use_envenv.reset()
+result = tool_use_envenv.step(ToolUseAction(message="Hello!"))
+```
+Note: When connecting to an existing server, `tool_use_envenv.close()` will NOT stop the server.
+### Using the Context Manager
+The client supports context manager usage for automatic connection management:
+```python
+from tool_use_env import ToolUseAction, ToolUseEnv
+# Connect with context manager (auto-connects and closes)
+with ToolUseEnv(base_url="http://localhost:8000") as env:
+    result = env.reset()
+    print(f"Reset: {result.observation.echoed_message}")
+    # Multiple steps with low latency
+    for msg in ["Hello", "World", "!"]:
+        result = env.step(ToolUseAction(message=msg))
+        print(f"Echoed: {result.observation.echoed_message}")
+```
+The client uses WebSocket connections for:
+- **Lower latency**: No HTTP connection overhead per request
+- **Persistent session**: Server maintains your environment state
+- **Efficient for episodes**: Better for many sequential steps
+### Concurrent WebSocket Sessions
+The server supports multiple concurrent WebSocket connections. To enable this,
+modify `server/app.py` to use factory mode:
+```python
+# In server/app.py - use factory mode for concurrent sessions
+app = create_app(
+    ToolUseEnvironment,  # Pass class, not instance
+    ToolUseAction,
+    ToolUseObservation,
+    max_concurrent_envs=4,  # Allow 4 concurrent sessions
+)
+```
+Then multiple clients can connect simultaneously:
+```python
+from tool_use_env import ToolUseAction, ToolUseEnv
+from concurrent.futures import ThreadPoolExecutor
+def run_episode(client_id: int):
+    with ToolUseEnv(base_url="http://localhost:8000") as env:
+        result = env.reset()
+        for i in range(10):
+            result = env.step(ToolUseAction(message=f"Client {client_id}, step {i}"))
+        return client_id, result.observation.message_length
+# Run 4 episodes concurrently
+with ThreadPoolExecutor(max_workers=4) as executor:
+    results = list(executor.map(run_episode, range(4)))
+```
+## Development & Testing
+### Direct Environment Testing
+Test the environment logic directly without starting the HTTP server:
+```bash
+# From the server directory
+python3 server/tool_use_env_environment.py
+```
+This verifies that:
+- Environment resets correctly
+- Step executes actions properly
+- State tracking works
+- Rewards are calculated correctly
+### Running Locally
+Run the server locally for development:
+```bash
+uvicorn server.app:app --reload
+```
+## Project Structure
+```
+tool_use_env/
+├── .dockerignore         # Docker build exclusions
+├── __init__.py            # Module exports
+├── README.md              # This file
+├── openenv.yaml           # OpenEnv manifest
+├── pyproject.toml         # Project metadata and dependencies
+├── uv.lock                # Locked dependencies (generated)
+├── client.py              # ToolUseEnv client
+├── models.py              # Action and Observation models
+└── server/
+    ├── __init__.py        # Server module exports
+    ├── tool_use_env_environment.py  # Core environment logic
+    ├── app.py             # FastAPI application (HTTP + WebSocket endpoints)
+    └── Dockerfile         # Container image definition
+```

tool_use_env/__init__.py ADDED Viewed

	@@ -0,0 +1,16 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""Tool Use Env Environment."""
+from .client import ToolUseEnv
+from .models import ToolUseAction, ToolUseObservation
+__all__ = [
+    "ToolUseAction",
+    "ToolUseObservation",
+    "ToolUseEnv",
+]

tool_use_env/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (364 Bytes). View file

tool_use_env/__pycache__/__init__.cpython-313.pyc ADDED Viewed

Binary file (364 Bytes). View file

tool_use_env/__pycache__/__init__.cpython-314.pyc ADDED Viewed

Binary file (361 Bytes). View file

tool_use_env/__pycache__/client.cpython-312.pyc ADDED Viewed

Binary file (2.21 kB). View file

tool_use_env/__pycache__/client.cpython-313.pyc ADDED Viewed

Binary file (2.26 kB). View file

tool_use_env/__pycache__/client.cpython-314.pyc ADDED Viewed

Binary file (2.78 kB). View file

tool_use_env/__pycache__/grader.cpython-312.pyc ADDED Viewed

Binary file (716 Bytes). View file

tool_use_env/__pycache__/models.cpython-312.pyc ADDED Viewed

Binary file (1.29 kB). View file

tool_use_env/__pycache__/models.cpython-313.pyc ADDED Viewed

Binary file (1.41 kB). View file

tool_use_env/agents/__pycache__/baseline.cpython-313.pyc ADDED Viewed

Binary file (4.72 kB). View file

tool_use_env/agents/baseline.py ADDED Viewed

	@@ -0,0 +1,267 @@

+# from tool_use_env.client import ToolUseEnv
+# from tool_use_env.models import ToolUseAction
+# import random
+# def rule_based_policy(query: str):
+#     query = query.lower()
+#     # --- Introduce slight imperfection ---
+#     if random.random() < 0.1:
+#         return "answer_directly"
+#     if "what is" in query and any(op in query for op in ["+", "-", "*", "/"]):
+#         return "use_calculator"
+#     if "capital" in query or "who is" in query:
+#         return "use_search"
+#     return "answer_directly"
+# def run_single_episode(env):
+#     result = env.reset()
+#     obs = result.observation
+#     query = obs.query
+#     action_type = rule_based_policy(query)
+#     action = ToolUseAction(action_type=action_type)
+#     result = env.step(action)
+#     obs = result.observation
+#     return {
+#         "query": query,
+#         "action": action_type,
+#         "reward": result.reward,
+#         "message": obs.message
+#     }
+# def run_evaluation(num_episodes=20):
+#     results = []
+#     difficulty_scores = {
+#         "easy": [],
+#         "medium": [],
+#         "hard": []
+#     }
+#     total_score = 0
+#     with ToolUseEnv(base_url="http://localhost:8000").sync() as env:
+#         for _ in range(num_episodes):
+#             result = env.reset()
+#             obs = result.observation
+#             query = obs.query
+#             state = env.state()
+#             difficulty = state.difficulty
+#             action_type = rule_based_policy(query)
+#             action = ToolUseAction(action_type=action_type)
+#             result = env.step(action)
+#             score = result.reward
+#             total_score += score
+#             difficulty_scores[difficulty].append(score)
+#             results.append({
+#                 "query": query,
+#                 "difficulty": difficulty,
+#                 "action": action_type,
+#                 "score": score,
+#                 "message": result.observation.message
+#             })
+#     avg_score = total_score / num_episodes
+#     print("\n=== OVERALL PERFORMANCE ===")
+#     print(f"Average Score: {avg_score:.2f}")
+#     print("\n=== DIFFICULTY BREAKDOWN ===")
+#     for level in difficulty_scores:
+#         if difficulty_scores[level]:
+#             avg = sum(difficulty_scores[level]) / len(difficulty_scores[level])
+#             print(f"{level.capitalize()}: {avg:.2f}")
+#     print("\n=== SAMPLE CASES ===")
+#     for r in results[:5]:
+#         print(f"\nQuery: {r['query']}")
+#         print(f"Action: {r['action']}")
+#         print(f"Score: {r['score']:.2f}")
+#         print(f"Details: {r['message']}")
+#     return results
+# def analyze_failures(results):
+#     wrong_decisions = 0
+#     tool_failures = 0
+#     total = len(results)
+#     for r in results:
+#         msg = r["message"]
+#         if "Correct: False" in msg:
+#             if "use_" in msg:
+#                 tool_failures += 1
+#             else:
+#                 wrong_decisions += 1
+#     print("\n=== FAILURE ANALYSIS ===")
+#     print(f"Tool failures: {tool_failures}/{total} ({(tool_failures/total)*100:.1f}%)")
+#     print(f"Wrong decisions: {wrong_decisions}/{total} ({(wrong_decisions/total)*100:.1f}%)")
+# if __name__ == "__main__":
+#     results = run_evaluation(50)
+#     analyze_failures(results)
+import os
+import random
+from collections import defaultdict
+from dotenv import load_dotenv
+from openai import OpenAI
+from tool_use_env.client import ToolUseEnv
+from tool_use_env.models import ToolUseAction
+# --- Load environment variables ---
+# Runs at import time, before the client below is created.
+load_dotenv()
+# --- Initialize OpenAI client ---
+# No key or URL is passed here, so the client relies on its own defaults
+# (presumably the OPENAI_API_KEY environment variable; confirm).
+client = OpenAI()
+# --- Reproducibility ---
+# Seeds the random module; the active code below makes no other use of it.
+random.seed(42)
+# 🧠 LLM Policy (CORE)
+def llm_policy(query: str):
+    """Ask gpt-4o-mini which action to take for *query*.
+    Returns the model's answer when it is exactly one of "use_calculator",
+    "use_search" or "answer_directly". Any other answer, and any exception
+    raised while calling the model, yields "answer_directly".
+    """
+    valid_actions = ("use_calculator", "use_search", "answer_directly")
+    prompt = f"""
+You are an AI agent choosing the best tool.
+Available actions:
+- use_calculator (for math problems)
+- use_search (for factual questions)
+- answer_directly (if neither tool is needed)
+Query: {query}
+Respond with ONLY one of:
+use_calculator
+use_search
+answer_directly
+"""
+    try:
+        completion = client.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=[{"role": "user", "content": prompt}],
+            temperature=0
+        )
+        choice = completion.choices[0].message.content.strip()
+    except Exception as e:
+        print(f"[ERROR] LLM call failed: {e}")
+        return "answer_directly"
+    # --- Safety check ---
+    return choice if choice in valid_actions else "answer_directly"
+# 🧪 Evaluation Loop
+def run_evaluation(num_episodes=50):
+    """Run single-step episodes against the local server and print a report.
+    Each episode resets the environment, reads the query and its difficulty,
+    asks llm_policy() for an action, takes one step and records the reward.
+    Args:
+        num_episodes: number of episodes to run.
+    Returns:
+        A list with one dict per episode containing the keys "query",
+        "difficulty", "action", "score" and "message".
+    """
+    results = []
+    total_score = 0
+    difficulty_scores = defaultdict(list)
+    with ToolUseEnv(base_url="http://localhost:8000").sync() as env:
+        for _ in range(num_episodes):
+            # --- Reset ---
+            result = env.reset()
+            query = result.observation.query
+            # --- Get difficulty ---
+            difficulty = env.state().difficulty
+            # --- LLM decides action ---
+            action_type = llm_policy(query)
+            # --- Step ---
+            result = env.step(ToolUseAction(action_type=action_type))
+            obs = result.observation
+            # The client fills reward from payload.get("reward"), which is
+            # None when the server omits it; count that as 0 instead of
+            # failing on the arithmetic and formatting below.
+            score = result.reward if result.reward is not None else 0.0
+            total_score += score
+            difficulty_scores[difficulty].append(score)
+            results.append({
+                "query": query,
+                "difficulty": difficulty,
+                "action": action_type,
+                "score": score,
+                "message": obs.message
+            })
+            print(f"Score: {score:.2f}")
+    # --- Overall ---
+    # Guard against num_episodes <= 0 (previously a ZeroDivisionError).
+    avg_score = total_score / len(results) if results else 0.0
+    print("\n=== OVERALL PERFORMANCE ===")
+    print(f"Average Score: {avg_score:.2f}")
+    # --- Breakdown ---
+    print("\n=== DIFFICULTY BREAKDOWN ===")
+    for level in ["easy", "medium", "hard"]:
+        if difficulty_scores[level]:
+            avg = sum(difficulty_scores[level]) / len(difficulty_scores[level])
+            print(f"{level.capitalize()}: {avg:.2f}")
+    # --- Sample Cases ---
+    print("\n=== SAMPLE CASES ===")
+    for r in results[:5]:
+        print(f"\nQuery: {r['query']}")
+        print(f"Action: {r['action']}")
+        print(f"Score: {r['score']:.2f}")
+        print(f"Details: {r['message']}")
+    return results
+# 📊 Failure Analysis
+def analyze_failures(results):
+    """Print how many failed episodes mention a tool versus those that do not.
+    An episode is a failure when its message contains "Correct: False".
+    Failures whose message also contains "use_" are counted as tool failures;
+    every other failure is counted as a wrong decision. Percentages are
+    relative to all episodes.
+    Args:
+        results: list of dicts with a "message" key, as returned by
+            run_evaluation(). May be empty.
+    """
+    total = len(results)
+    failed_messages = [r["message"] for r in results if "Correct: False" in r["message"]]
+    tool_failures = sum(1 for msg in failed_messages if "use_" in msg)
+    wrong_decisions = len(failed_messages) - tool_failures
+    def pct(count):
+        # An empty result list used to raise ZeroDivisionError here.
+        return (count / total) * 100 if total else 0.0
+    print("\n=== FAILURE ANALYSIS ===")
+    print(f"Tool failures: {tool_failures}/{total} ({pct(tool_failures):.1f}%)")
+    print(f"Wrong decisions: {wrong_decisions}/{total} ({pct(wrong_decisions):.1f}%)")
+# 🚀 Main
+if __name__ == "__main__":
+    # Script entry point: evaluate 50 episodes, then summarise the failed
+    # ones from the collected results.
+    results = run_evaluation(50)
+    analyze_failures(results)

tool_use_env/client.py ADDED Viewed

	@@ -0,0 +1,139 @@

+# # Copyright (c) Meta Platforms, Inc. and affiliates.
+# # All rights reserved.
+# #
+# # This source code is licensed under the BSD-style license found in the
+# # LICENSE file in the root directory of this source tree.
+# """Tool Use Env Environment Client."""
+# from typing import Dict
+# from openenv.core import EnvClient
+# from openenv.core.client_types import StepResult
+# from openenv.core.env_server.types import State
+# from .models import ToolUseAction, ToolUseObservation
+# class ToolUseEnv(
+#     EnvClient[ToolUseAction, ToolUseObservation, State]
+# ):
+#     """
+#     Client for the Tool Use Env Environment.
+#     This client maintains a persistent WebSocket connection to the environment server,
+#     enabling efficient multi-step interactions with lower latency.
+#     Each client instance has its own dedicated environment session on the server.
+#     Example:
+#         >>> # Connect to a running server
+#         >>> with ToolUseEnv(base_url="http://localhost:8000") as client:
+#         ...     result = client.reset()
+#         ...     print(result.observation.echoed_message)
+#         ...
+#         ...     result = client.step(ToolUseAction(message="Hello!"))
+#         ...     print(result.observation.echoed_message)
+#     Example with Docker:
+#         >>> # Automatically start container and connect
+#         >>> client = ToolUseEnv.from_docker_image("tool_use_env-env:latest")
+#         >>> try:
+#         ...     result = client.reset()
+#         ...     result = client.step(ToolUseAction(message="Test"))
+#         ... finally:
+#         ...     client.close()
+#     """
+#     def _step_payload(self, action: ToolUseAction) -> Dict:
+#         """
+#         Convert ToolUseAction to JSON payload for step message.
+#         Args:
+#             action: ToolUseAction instance
+#         Returns:
+#             Dictionary representation suitable for JSON encoding
+#         """
+#         return {
+#             "message": action.message,
+#         }
+#     def _parse_result(self, payload: Dict) -> StepResult[ToolUseObservation]:
+#         """
+#         Parse server response into StepResult[ToolUseObservation].
+#         Args:
+#             payload: JSON response data from server
+#         Returns:
+#             StepResult with ToolUseObservation
+#         """
+#         obs_data = payload.get("observation", {})
+#         observation = ToolUseObservation(
+#             echoed_message=obs_data.get("echoed_message", ""),
+#             message_length=obs_data.get("message_length", 0),
+#             done=payload.get("done", False),
+#             reward=payload.get("reward"),
+#             metadata=obs_data.get("metadata", {}),
+#         )
+#         return StepResult(
+#             observation=observation,
+#             reward=payload.get("reward"),
+#             done=payload.get("done", False),
+#         )
+#     def _parse_state(self, payload: Dict) -> State:
+#         """
+#         Parse server response into State object.
+#         Args:
+#             payload: JSON response from state request
+#         Returns:
+#             State object with episode_id and step_count
+#         """
+#         return State(
+#             episode_id=payload.get("episode_id"),
+#             step_count=payload.get("step_count", 0),
+#         )
+from openenv.core.env_client import EnvClient
+from openenv.core.client_types import StepResult
+from tool_use_env.models import ToolUseAction, ToolUseObservation, ToolUseState
+class ToolUseEnv(EnvClient[ToolUseAction, ToolUseObservation, ToolUseState]):
+    """Client for the tool-use environment, typed with its action, observation and state models."""
+    def _step_payload(self, action: ToolUseAction) -> dict:
+        """Build the step payload: a dict carrying only the chosen action_type."""
+        return dict(action_type=action.action_type)
+    def _parse_result(self, payload: dict) -> StepResult:
+        """Turn a server response dict into a StepResult.
+        reward and done are read from the top level of the payload (done
+        defaults to False); query, tool_output and message are read from its
+        "observation" entry, with "" as the default for the two strings.
+        """
+        reward = payload.get("reward")
+        done = payload.get("done", False)
+        fields = payload.get("observation", {})
+        parsed = ToolUseObservation(
+            done=done,
+            reward=reward,
+            query=fields.get("query", ""),
+            tool_output=fields.get("tool_output"),
+            message=fields.get("message", ""),
+        )
+        return StepResult(observation=parsed, reward=reward, done=done)
+    def _parse_state(self, payload: dict) -> ToolUseState:
+        """Turn a state response dict into a ToolUseState.
+        Missing string fields default to "" and step_count to 0; episode_id
+        is None when absent.
+        """
+        text_fields = ("current_query", "correct_action", "correct_answer", "difficulty")
+        return ToolUseState(
+            episode_id=payload.get("episode_id"),
+            step_count=payload.get("step_count", 0),
+            **{name: payload.get(name, "") for name in text_fields},
+        )

tool_use_env/grader.py ADDED Viewed

	@@ -0,0 +1,25 @@

+def compute_grade(action_taken, correct_action, output, correct_answer):
+    """Score one step as a weighted sum, rounded to two decimals.
+    The score is 0.4 for choosing the correct action, plus 0.5 for producing
+    the correct answer, plus 0.1 times an efficiency factor. The efficiency
+    factor is 0.5 when a tool ("use_calculator" or "use_search") was used and
+    1.0 otherwise. The result lies between 0.0 and 1.0.
+    """
+    tool_actions = ("use_calculator", "use_search")
+    action_correct = 1.0 if action_taken == correct_action else 0.0
+    answer_correct = 1.0 if output == correct_answer else 0.0
+    # Using a tool has a cost; a direct answer is the efficient path.
+    efficiency = 0.5 if action_taken in tool_actions else 1.0
+    weighted_parts = (
+        (0.4, action_correct),
+        (0.5, answer_correct),
+        (0.1, efficiency),
+    )
+    return round(sum(weight * value for weight, value in weighted_parts), 2)

tool_use_env/models.py ADDED Viewed

	@@ -0,0 +1,47 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Data models for the Tool Use Env Environment.
+The tool_use_env environment presents a query and lets the agent choose between using a calculator, using search, or answering directly.
+"""
+# from openenv.core.env_server.types import Action, Observation
+# from pydantic import Field
+# class ToolUseAction(Action):
+#     """Action for the Tool Use Env environment - just a message to echo."""
+#     message: str = Field(..., description="Message to echo back")
+# class ToolUseObservation(Observation):
+#     """Observation from the Tool Use Env environment - the echoed message."""
+#     echoed_message: str = Field(default="", description="The echoed message")
+#     message_length: int = Field(default=0, description="Length of the echoed message")
+from openenv.core.env_server import Action, Observation, State
+from typing import Optional
+class ToolUseAction(Action):
+    """Action sent to the environment: the name of the chosen action."""
+    # The rest of the project uses "use_calculator", "use_search" and
+    # "answer_directly"; the type itself accepts any string.
+    action_type: str
+class ToolUseObservation(Observation):
+    """Observation returned by the environment for a reset or a step."""
+    # The query the agent has to handle.
+    query: str
+    # Tool result, if any. Defaults to None so the field is truly optional:
+    # without a default, an Optional annotation is still a required field
+    # under Pydantic v2, while openenv.yaml declares tool_output as nullable.
+    tool_output: Optional[str] = None
+    # Free-text details about the step.
+    message: str
+class ToolUseState(State):
+    """Environment state for one episode; every extra field defaults to an empty string."""
+    # Query of the current episode.
+    current_query: str = ""
+    # Presumably the expected action and answer for the current query;
+    # confirm against the server implementation.
+    correct_action: str = ""
+    correct_answer: str = ""
+    # Difficulty label; the evaluation scripts report "easy", "medium", "hard".
+    difficulty: str = ""

tool_use_env/openenv_tool_use_env.egg-info/PKG-INFO ADDED Viewed

	@@ -0,0 +1,9 @@

+Metadata-Version: 2.4
+Name: openenv-tool_use_env
+Version: 0.1.0
+Summary: Tool Use Env environment for OpenEnv
+Requires-Python: >=3.10
+Requires-Dist: openenv-core[core]>=0.2.1
+Provides-Extra: dev
+Requires-Dist: pytest>=8.0.0; extra == "dev"
+Requires-Dist: pytest-cov>=4.0.0; extra == "dev"

tool_use_env/openenv_tool_use_env.egg-info/SOURCES.txt ADDED Viewed

	@@ -0,0 +1,20 @@

+README.md
+__init__.py
+client.py
+grader.py
+models.py
+pyproject.toml
+./__init__.py
+./client.py
+./grader.py
+./models.py
+openenv_tool_use_env.egg-info/PKG-INFO
+openenv_tool_use_env.egg-info/SOURCES.txt
+openenv_tool_use_env.egg-info/dependency_links.txt
+openenv_tool_use_env.egg-info/entry_points.txt
+openenv_tool_use_env.egg-info/requires.txt
+openenv_tool_use_env.egg-info/top_level.txt
+server/__init__.py
+server/app.py
+server/tool_use_env_environment.py
+tests/test_tools.py

tool_use_env/openenv_tool_use_env.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+

tool_use_env/openenv_tool_use_env.egg-info/entry_points.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ [console_scripts]
2	+ server = tool_use_env.server.app:main

tool_use_env/openenv_tool_use_env.egg-info/requires.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+openenv-core[core]>=0.2.1
+[dev]
+pytest>=8.0.0
+pytest-cov>=4.0.0

tool_use_env/openenv_tool_use_env.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ tool_use_env

tool_use_env/pyproject.toml ADDED Viewed

	@@ -0,0 +1,45 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+[build-system]
+requires = ["setuptools>=45", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "openenv-tool_use_env"
+version = "0.1.0"
+description = "Tool Use Env environment for OpenEnv"
+requires-python = ">=3.10"
+dependencies = [
+    # Core OpenEnv runtime (provides FastAPI server + HTTP client types)
+    # install from github
+    # "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
+    "openenv-core[core]>=0.2.1",
+    # Environment-specific dependencies
+    # Add all dependencies needed for your environment here
+    # Examples:
+    # "numpy>=1.19.0",
+    # "torch>=2.0.0",
+    # "gymnasium>=0.29.0",
+    # "openspiel>=1.0.0",
+    # "smolagents>=1.22.0,<2",
+]
+[project.optional-dependencies]
+dev = [
+    "pytest>=8.0.0",
+    "pytest-cov>=4.0.0",
+]
+[project.scripts]
+# Server entry point - enables running via: uv run --project . server
+# or: python -m tool_use_env.server.app
+server = "tool_use_env.server.app:main"
+[tool.setuptools]
+include-package-data = true
+packages = ["tool_use_env", "tool_use_env.server"]
+package-dir = { "tool_use_env" = ".", "tool_use_env.server" = "server" }

tool_use_env/server/Dockerfile ADDED Viewed

	@@ -0,0 +1,80 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# Multi-stage build using openenv-base
+# This Dockerfile is flexible and works for both:
+# - In-repo environments (with local OpenEnv sources)
+# - Standalone environments (with openenv from PyPI/Git)
+# The build script (openenv build) handles context detection and sets appropriate build args.
+ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
+FROM ${BASE_IMAGE} AS builder
+WORKDIR /app
+# Ensure git is available (required for installing dependencies from VCS)
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends git && \
+    rm -rf /var/lib/apt/lists/*
+# Build argument to control whether we're building standalone or in-repo
+ARG BUILD_MODE=in-repo
+ARG ENV_NAME=tool_use_env
+# Copy environment code (always at root of build context)
+COPY . /app/env
+# For in-repo builds, openenv is already vendored in the build context
+# For standalone builds, openenv will be installed via pyproject.toml
+WORKDIR /app/env
+# Ensure uv is available (for local builds where base image lacks it)
+RUN if ! command -v uv >/dev/null 2>&1; then \
+        curl -LsSf https://astral.sh/uv/install.sh | sh && \
+        mv /root/.local/bin/uv /usr/local/bin/uv && \
+        mv /root/.local/bin/uvx /usr/local/bin/uvx; \
+    fi
+# Install dependencies using uv sync
+# If uv.lock exists, use it; otherwise resolve on the fly
+RUN --mount=type=cache,target=/root/.cache/uv \
+    if [ -f uv.lock ]; then \
+        uv sync --frozen --no-install-project --no-editable; \
+    else \
+        uv sync --no-install-project --no-editable; \
+    fi
+RUN --mount=type=cache,target=/root/.cache/uv \
+    if [ -f uv.lock ]; then \
+        uv sync --frozen --no-editable; \
+    else \
+        uv sync --no-editable; \
+    fi
+# Final runtime stage
+FROM ${BASE_IMAGE}
+WORKDIR /app
+# Copy the virtual environment from builder
+COPY --from=builder /app/env/.venv /app/.venv
+# Copy the environment code
+COPY --from=builder /app/env /app/env
+# Set PATH to use the virtual environment
+ENV PATH="/app/.venv/bin:$PATH"
+# Set PYTHONPATH so imports work correctly
+ENV PYTHONPATH="/app/env:$PYTHONPATH"
+# Health check
+HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
+    CMD curl -f http://localhost:8000/health || exit 1
+# Run the FastAPI server
+# The module path is constructed to work with the /app/env structure
+CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]

tool_use_env/server/__init__.py ADDED Viewed

	@@ -0,0 +1,11 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""Tool Use Env environment server components."""
+from .tool_use_env_environment import ToolUseEnvironment
+__all__ = ["ToolUseEnvironment"]

tool_use_env/server/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (328 Bytes). View file

tool_use_env/server/__pycache__/__init__.cpython-313.pyc ADDED Viewed

Binary file (400 Bytes). View file

tool_use_env/server/__pycache__/app.cpython-312.pyc ADDED Viewed

Binary file (886 Bytes). View file

tool_use_env/server/__pycache__/app.cpython-313.pyc ADDED Viewed

Binary file (2.8 kB). View file

tool_use_env/server/__pycache__/tool_use_env_environment.cpython-312.pyc ADDED Viewed

Binary file (6.22 kB). View file

tool_use_env/server/__pycache__/tool_use_env_environment.cpython-313.pyc ADDED Viewed

Binary file (3.83 kB). View file

tool_use_env/server/app.py ADDED Viewed

	@@ -0,0 +1,23 @@

+from openenv.core.env_server.http_server import create_app
+from tool_use_env.models import ToolUseAction, ToolUseObservation
+from tool_use_env.server.tool_use_env_environment import ToolUseEnvironment
+# Application object that uvicorn loads through the "tool_use_env.server.app:app"
+# import string. create_app receives the environment class together with its
+# action and observation models.
+app = create_app(
+    ToolUseEnvironment,
+    ToolUseAction,
+    ToolUseObservation,
+    env_name="tool_use_env",
+    # NOTE(review): presumably caps simultaneous sessions at one even though the
+    # environment class sets SUPPORTS_CONCURRENT_SESSIONS = True - confirm intent.
+    max_concurrent_envs=1,
+)
+import uvicorn
+def main(host: str = "0.0.0.0", port: int = 8000):
+    uvicorn.run("tool_use_env.server.app:app", host=host, port=port)
+# Start the server with the default host and port when this file is executed directly.
+if __name__ == "__main__":
+    main()

tool_use_env/server/requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+openenv
+fastapi
+dotenv
+uvicorn
+pydantic
+python-dotenv
+openai

tool_use_env/server/tool_use_env_environment.py ADDED Viewed

	@@ -0,0 +1,222 @@

+import random
+import uuid
+from openenv.core.env_server import Environment
+from tool_use_env.models import ToolUseAction, ToolUseObservation, ToolUseState
+from tool_use_env.grader import compute_grade
+class ToolUseEnvironment(Environment):
+    SUPPORTS_CONCURRENT_SESSIONS = True
+    def __init__(self):
+        self._state = ToolUseState()
+        self._tasks = self._load_tasks()
+    def _load_tasks(self):
+        return [
+            {
+                "query": "What is 5 + 7?",
+                "answer": "12",
+                "correct_action": "use_calculator",
+                "difficulty": "easy"
+            },
+            {
+                "query": "Capital of France?",
+                "answer": "Paris",
+                "correct_action": "use_search",
+                "difficulty": "easy"
+            },
+            {
+                "query": "What is 123 * 456?",
+                "answer": "56088",
+                "correct_action": "use_calculator",
+                "difficulty": "hard"
+            },
+            {
+                "query": "What is 25 * 4?",
+                "answer": "100",
+                "correct_action": "use_calculator",
+                "difficulty": "medium"
+            },
+            {
+                "query": "Who is the CEO of Tesla?",
+                "answer": "Elon Musk",
+                "correct_action": "use_search",
+                "difficulty": "medium"
+            }
+        ]
+    def reset(self, seed=None, episode_id=None, **kwargs) -> ToolUseObservation:
+        task = random.choice(self._tasks)
+        self._state = ToolUseState(
+            episode_id=episode_id or str(uuid.uuid4()),
+            step_count=0,
+            current_query=task["query"],
+            correct_action=task["correct_action"],
+            correct_answer=task["answer"],
+            difficulty=task["difficulty"]
+        )
+        return ToolUseObservation(
+            done=False,
+            reward=None,
+            query=task["query"],
+            tool_output=None,
+            message="Choose an action"
+        )
+    # 🔢 Calculator tool (controlled noise)
+    def _calculator(self, query):
+        try:
+            expr = query.lower()
+            expr = expr.replace("what is", "").replace("?", "").strip()
+            correct = eval(expr)
+            difficulty = self._state.difficulty
+            if difficulty == "easy":
+                fail_prob = 0.06
+            elif difficulty == "medium":
+                fail_prob = 0.12
+            else:
+                fail_prob = 0.18
+            # complexity-based failure
+            if len(query) > 20:
+                fail_prob += 0.05
+            # 🔥 cap failure (IMPORTANT)
+            fail_prob = min(fail_prob, 0.25)
+            if random.random() < fail_prob:
+                # 🔥 scale noise based on magnitude
+                if abs(correct) < 50:
+                    noise = random.randint(-2, 2)
+                else:
+                    noise = int(correct * random.uniform(-0.05, 0.05))
+                return str(correct + noise)
+            return str(correct)
+        except Exception:
+            return "error"
+    # 🔍 Search tool (controlled noise)
+    def _search(self, query):
+        kb = {
+            "Capital of France": "Paris",
+            "CEO of Tesla": "Elon Musk"
+        }
+        difficulty = self._state.difficulty
+        for key in kb:
+            if key.lower() in query.lower():
+                if difficulty == "easy":
+                    fail_prob = 0.07
+                elif difficulty == "medium":
+                    fail_prob = 0.15
+                else:
+                    fail_prob = 0.22
+                # complexity-based failure
+                if len(query) > 20:
+                    fail_prob += 0.05
+                # 🔥 cap failure
+                fail_prob = min(fail_prob, 0.30)
+                if random.random() < fail_prob:
+                    return random.choice([
+                        "Unknown",
+                        "Not sure",
+                        "No results found"
+                    ])
+                return kb[key]
+        return "not found"
+    def step(self, action: ToolUseAction, timeout_s=None, **kwargs) -> ToolUseObservation:
+        self._state.step_count += 1
+        query = self._state.current_query
+        correct_action = self._state.correct_action
+        correct_answer = self._state.correct_answer
+        difficulty = self._state.difficulty
+        action_type = action.action_type
+        # --- Execute tool ---
+        if action_type == "use_calculator":
+            output = self._calculator(query)
+        elif action_type == "use_search":
+            output = self._search(query)
+        elif action_type == "answer_directly":
+            output = "unknown"
+        else:
+            output = "invalid action"
+        # --- Check correctness ---
+        answer_correct = (output == correct_answer)
+        # 🧠 REWARD SYSTEM (FINAL)
+        # 1. Action correctness
+        action_score = 0.4 if action_type == correct_action else 0.1
+        # 2. Answer correctness
+        answer_score = 0.5 if answer_correct else 0.0
+        # 3. Tool cost (small penalty)
+        if action_type == "use_calculator":
+            tool_penalty = 0.05
+        elif action_type == "use_search":
+            tool_penalty = 0.08
+        else:
+            tool_penalty = 0.0
+        # 4. Failure bonus (good reasoning but tool failed)
+        failure_bonus = 0.1 if (not answer_correct and action_type == correct_action) else 0.0
+        # 5. Combine
+        reward = action_score + answer_score + failure_bonus - tool_penalty
+        # 6. Difficulty scaling (light)
+        if difficulty == "medium":
+            reward *= 1.02
+        elif difficulty == "hard":
+            reward *= 0.9
+        # 7. Clamp (VERY IMPORTANT)
+        reward = max(0.0, min(1.0, reward))
+        # --- Grade (for reporting only) ---
+        grade = compute_grade(
+            action_taken=action_type,
+            correct_action=correct_action,
+            output=output,
+            correct_answer=correct_answer
+        )
+        return ToolUseObservation(
+            done=True,
+            reward=reward,
+            query=query,
+            tool_output=output,
+            message=(
+                f"Action: {action_type}, "
+                f"Output: {output}, "
+                f"Correct: {answer_correct}, "
+                f"Reward: {reward:.2f}, "
+                f"Grade: {grade:.2f}"
+            )
+        )
+    @property
+    def state(self) -> ToolUseState:
+        return self._state

tool_use_env/tests/test_tools.py ADDED Viewed

	@@ -0,0 +1,23 @@

+from server.tool_use_env_environment import ToolUseEnvironment
+# Shared environment instance for the tool tests. reset() is never called on it,
+# so its state keeps the default (empty) difficulty.
+env = ToolUseEnvironment()
+def test_calculator_correct():
+    result = env._calculator("What is 2 + 2?")
+    assert result in ["4", "3", "5"]  # allow noise
+def test_search():
+    result = env._search("Capital of France?")
+    assert result in ["Paris", "Unknown"]
+def test_step_output():
+    env = ToolUseEnvironment()
+    action = {"action_type": "use_calculator"}
+    result = env.step(action)
+    obs = result.observation
+    print(obs.query)
+    assert -1 <= result.reward <= 1
+    assert result.query is not None

tool_use_env/uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff