Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- Dockerfile +65 -21
- inference.py +97 -63
- openenv.yaml +1 -1
Dockerfile
CHANGED
|
@@ -1,39 +1,83 @@
|
|
| 1 |
-
#
|
| 2 |
-
#
|
| 3 |
-
#
|
|
|
|
|
|
|
| 4 |
|
| 5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
WORKDIR /app
|
| 8 |
|
| 9 |
-
#
|
| 10 |
RUN apt-get update && \
|
| 11 |
-
apt-get install -y --no-install-recommends git
|
| 12 |
rm -rf /var/lib/apt/lists/*
|
| 13 |
|
| 14 |
-
#
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
mv /root/.local/bin/uvx /usr/local/bin/uvx 2>/dev/null || true
|
| 18 |
|
| 19 |
-
# Copy
|
| 20 |
COPY . /app/env
|
| 21 |
|
|
|
|
|
|
|
| 22 |
WORKDIR /app/env
|
| 23 |
|
| 24 |
-
#
|
| 25 |
-
RUN
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
-
#
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
ENV PYTHONPATH="/app/env:$PYTHONPATH"
|
| 30 |
-
ENV HARVGYM_NO_EMBED=1
|
| 31 |
|
| 32 |
-
#
|
| 33 |
-
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
|
|
|
| 37 |
|
|
|
|
|
|
|
| 38 |
ENV ENABLE_WEB_INTERFACE=true
|
| 39 |
-
CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
|
| 7 |
+
# Multi-stage build using openenv-base
|
| 8 |
+
# This Dockerfile is flexible and works for both:
|
| 9 |
+
# - In-repo environments (with local OpenEnv sources)
|
| 10 |
+
# - Standalone environments (with openenv from PyPI/Git)
|
| 11 |
+
# The build script (openenv build) handles context detection and sets appropriate build args.
|
| 12 |
+
|
| 13 |
+
ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
|
| 14 |
+
FROM ${BASE_IMAGE} AS builder
|
| 15 |
|
| 16 |
WORKDIR /app
|
| 17 |
|
| 18 |
+
# Ensure git is available (required for installing dependencies from VCS)
|
| 19 |
RUN apt-get update && \
|
| 20 |
+
apt-get install -y --no-install-recommends git && \
|
| 21 |
rm -rf /var/lib/apt/lists/*
|
| 22 |
|
| 23 |
+
# Build argument to control whether we're building standalone or in-repo
|
| 24 |
+
ARG BUILD_MODE=in-repo
|
| 25 |
+
ARG ENV_NAME=HARvestGym
|
|
|
|
| 26 |
|
| 27 |
+
# Copy environment code (always at root of build context)
|
| 28 |
COPY . /app/env
|
| 29 |
|
| 30 |
+
# For in-repo builds, openenv is already vendored in the build context
|
| 31 |
+
# For standalone builds, openenv will be installed via pyproject.toml
|
| 32 |
WORKDIR /app/env
|
| 33 |
|
| 34 |
+
# Ensure uv is available (for local builds where base image lacks it)
|
| 35 |
+
RUN if ! command -v uv >/dev/null 2>&1; then \
|
| 36 |
+
curl -LsSf https://astral.sh/uv/install.sh | sh && \
|
| 37 |
+
mv /root/.local/bin/uv /usr/local/bin/uv && \
|
| 38 |
+
mv /root/.local/bin/uvx /usr/local/bin/uvx; \
|
| 39 |
+
fi
|
| 40 |
+
|
| 41 |
+
# Install dependencies using uv sync (no sentence-transformers/torch — keyword search only)
|
| 42 |
+
RUN --mount=type=cache,target=/root/.cache/uv \
|
| 43 |
+
if [ -f uv.lock ]; then \
|
| 44 |
+
uv sync --frozen --no-install-project --no-editable --no-extra embeddings; \
|
| 45 |
+
else \
|
| 46 |
+
uv sync --no-install-project --no-editable --no-extra embeddings; \
|
| 47 |
+
fi
|
| 48 |
+
|
| 49 |
+
RUN --mount=type=cache,target=/root/.cache/uv \
|
| 50 |
+
if [ -f uv.lock ]; then \
|
| 51 |
+
uv sync --frozen --no-editable --no-extra embeddings; \
|
| 52 |
+
else \
|
| 53 |
+
uv sync --no-editable --no-extra embeddings; \
|
| 54 |
+
fi
|
| 55 |
+
|
| 56 |
+
# Final runtime stage
|
| 57 |
+
FROM ${BASE_IMAGE}
|
| 58 |
+
|
| 59 |
+
WORKDIR /app
|
| 60 |
|
| 61 |
+
# Copy the virtual environment from builder
|
| 62 |
+
COPY --from=builder /app/env/.venv /app/.venv
|
| 63 |
+
|
| 64 |
+
# Copy the environment code
|
| 65 |
+
COPY --from=builder /app/env /app/env
|
| 66 |
+
|
| 67 |
+
# Set PATH to use the virtual environment
|
| 68 |
+
ENV PATH="/app/.venv/bin:$PATH"
|
| 69 |
+
|
| 70 |
+
# Set PYTHONPATH so imports work correctly
|
| 71 |
ENV PYTHONPATH="/app/env:$PYTHONPATH"
|
|
|
|
| 72 |
|
| 73 |
+
# Disable embedding model download (keyword search only, fits cpu-basic RAM)
|
| 74 |
+
ENV HARVGYM_NO_EMBED=1
|
| 75 |
|
| 76 |
+
# Health check
|
| 77 |
+
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
|
| 78 |
+
CMD curl -f http://localhost:8000/health || exit 1
|
| 79 |
|
| 80 |
+
# Run the FastAPI server
|
| 81 |
+
# The module path is constructed to work with the /app/env structure
|
| 82 |
ENV ENABLE_WEB_INTERFACE=true
|
| 83 |
+
CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
|
inference.py
CHANGED
|
@@ -42,11 +42,12 @@ if not HF_TOKEN:
|
|
| 42 |
|
| 43 |
BENCHMARK = "harvgym"
|
| 44 |
MAX_STEPS = 20
|
| 45 |
-
TEMPERATURE = 0.
|
| 46 |
-
MAX_TOKENS =
|
| 47 |
SUCCESS_SCORE_THRESHOLD = 0.5
|
| 48 |
|
| 49 |
-
# Task definitions
|
|
|
|
| 50 |
TASKS = [
|
| 51 |
{
|
| 52 |
"task_name": "har_classify_easy",
|
|
@@ -58,14 +59,14 @@ TASKS = [
|
|
| 58 |
{
|
| 59 |
"task_name": "har_classify_medium",
|
| 60 |
"template_id": 3,
|
| 61 |
-
"description": "Add 'Radiant Tee' to a guest cart",
|
| 62 |
"app_base_url": "http://ec2-16-59-2-56.us-east-2.compute.amazonaws.com:7770/",
|
| 63 |
"difficulty": "medium",
|
| 64 |
},
|
| 65 |
{
|
| 66 |
"task_name": "har_pipeline_hard",
|
| 67 |
"template_id": 6,
|
| 68 |
-
"description": "Complete a guest checkout for 'Radiant Tee'",
|
| 69 |
"app_base_url": "http://ec2-16-59-2-56.us-east-2.compute.amazonaws.com:7770/",
|
| 70 |
"difficulty": "hard",
|
| 71 |
},
|
|
@@ -82,7 +83,6 @@ def log_start(task: str, env: str, model: str) -> None:
|
|
| 82 |
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
|
| 83 |
error_val = error if error else "null"
|
| 84 |
done_val = str(done).lower()
|
| 85 |
-
# Sanitize action: no newlines
|
| 86 |
action_clean = action.replace("\n", " ").replace("\r", "")[:200]
|
| 87 |
print(
|
| 88 |
f"[STEP] step={step} action={action_clean} reward={reward:.2f} done={done_val} error={error_val}",
|
|
@@ -104,40 +104,50 @@ def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> No
|
|
| 104 |
|
| 105 |
SYSTEM_PROMPT = textwrap.dedent("""
|
| 106 |
You are an API agent. Your goal is to complete a real-world task by calling the correct
|
| 107 |
-
sequence of HTTP API endpoints on a live
|
| 108 |
|
| 109 |
-
You have exactly these tools available
|
| 110 |
|
| 111 |
1. browser_agent(task, url)
|
| 112 |
-
→ Discovers
|
| 113 |
-
→
|
|
|
|
| 114 |
|
| 115 |
2. search_endpoints(query)
|
| 116 |
-
→ Semantic search
|
| 117 |
-
→
|
|
|
|
|
|
|
| 118 |
|
| 119 |
3. curl_exec(command)
|
| 120 |
-
→ Execute an HTTP
|
| 121 |
-
→
|
| 122 |
-
→
|
| 123 |
|
| 124 |
4. search_episode_data(query)
|
| 125 |
-
→ Search all prior API responses in this episode for a
|
| 126 |
-
→ Use when a response
|
| 127 |
|
| 128 |
5. done(result?)
|
| 129 |
-
→ Call when the task is complete.
|
| 130 |
|
| 131 |
RULES:
|
| 132 |
-
- Output ONLY a
|
| 133 |
-
-
|
| 134 |
-
-
|
| 135 |
-
-
|
| 136 |
-
-
|
| 137 |
-
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
""").strip()
|
| 142 |
|
| 143 |
|
|
@@ -145,36 +155,63 @@ EXAMPLE output format:
|
|
| 145 |
# LLM agent loop
|
| 146 |
# ---------------------------------------------------------------------------
|
| 147 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
def build_user_prompt(task_desc: str, app_base_url: str, step: int,
|
| 149 |
-
|
| 150 |
-
|
| 151 |
"""Build the user prompt for each step."""
|
| 152 |
-
|
| 153 |
if history:
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
|
| 163 |
-
|
|
|
|
| 164 |
|
| 165 |
return textwrap.dedent(f"""
|
| 166 |
TASK: {task_desc}
|
| 167 |
APP URL: {app_base_url}
|
| 168 |
STEP: {step}/{MAX_STEPS}
|
| 169 |
|
| 170 |
-
SESSION STATE (
|
| 171 |
{session_str}
|
| 172 |
|
| 173 |
LAST TOOL RESULT:
|
| 174 |
{last_result_str}
|
| 175 |
|
| 176 |
-
|
| 177 |
-
{
|
| 178 |
|
| 179 |
What is your next tool call? Output ONLY the JSON object.
|
| 180 |
""").strip()
|
|
@@ -199,14 +236,13 @@ def get_model_action(client: OpenAI, task_desc: str, app_base_url: str,
|
|
| 199 |
)
|
| 200 |
text = (completion.choices[0].message.content or "").strip()
|
| 201 |
|
| 202 |
-
#
|
| 203 |
-
# Handle markdown code blocks
|
| 204 |
if "```json" in text:
|
| 205 |
text = text.split("```json")[1].split("```")[0].strip()
|
| 206 |
elif "```" in text:
|
| 207 |
text = text.split("```")[1].split("```")[0].strip()
|
| 208 |
|
| 209 |
-
#
|
| 210 |
start = text.find("{")
|
| 211 |
end = text.rfind("}") + 1
|
| 212 |
if start >= 0 and end > start:
|
|
@@ -216,20 +252,19 @@ def get_model_action(client: OpenAI, task_desc: str, app_base_url: str,
|
|
| 216 |
if "tool" in parsed:
|
| 217 |
return parsed
|
| 218 |
|
| 219 |
-
# LLM returned something else — default to done
|
| 220 |
return {"tool": "done", "args": {"result": "Model returned non-tool response"}}
|
| 221 |
|
| 222 |
except json.JSONDecodeError:
|
| 223 |
-
#
|
|
|
|
|
|
|
| 224 |
if "browser_agent" in text:
|
| 225 |
return {"tool": "browser_agent", "args": {"task": task_desc, "url": app_base_url}}
|
| 226 |
-
|
| 227 |
return {"tool": "done", "args": {}}
|
| 228 |
-
|
| 229 |
-
return {"tool": "done", "args": {"result": f"Parse error: {text[:100]}"}}
|
| 230 |
except Exception as exc:
|
| 231 |
print(f"[DEBUG] LLM call failed: {exc}", flush=True)
|
| 232 |
-
# Default to browser_agent on first step, done otherwise
|
| 233 |
if step == 1:
|
| 234 |
return {"tool": "browser_agent", "args": {"task": task_desc, "url": app_base_url}}
|
| 235 |
return {"tool": "done", "args": {"result": f"LLM error: {exc}"}}
|
|
@@ -242,7 +277,6 @@ def get_model_action(client: OpenAI, task_desc: str, app_base_url: str,
|
|
| 242 |
async def run_episode(task_config: dict, client: OpenAI) -> dict:
|
| 243 |
"""
|
| 244 |
Run a single episode for one task.
|
| 245 |
-
|
| 246 |
Returns: {"task_name", "success", "steps", "score", "rewards"}
|
| 247 |
"""
|
| 248 |
from server.models import HARvestGymEnvironment, HarvestGymAction
|
|
@@ -252,8 +286,8 @@ async def run_episode(task_config: dict, client: OpenAI) -> dict:
|
|
| 252 |
task_description = task_config["description"]
|
| 253 |
app_base_url = task_config["app_base_url"]
|
| 254 |
|
| 255 |
-
#
|
| 256 |
-
os.environ["HARVGYM_TASK"] =
|
| 257 |
|
| 258 |
env = HARvestGymEnvironment()
|
| 259 |
|
|
@@ -268,8 +302,9 @@ async def run_episode(task_config: dict, client: OpenAI) -> dict:
|
|
| 268 |
log_start(task=task_name, env=BENCHMARK, model=MODEL_NAME)
|
| 269 |
|
| 270 |
try:
|
| 271 |
-
# Reset
|
| 272 |
obs = env.reset()
|
|
|
|
|
|
|
| 273 |
task_desc = obs.task or task_description
|
| 274 |
base_url = obs.app_base_url or app_base_url
|
| 275 |
|
|
@@ -277,7 +312,6 @@ async def run_episode(task_config: dict, client: OpenAI) -> dict:
|
|
| 277 |
if getattr(obs, "done", False):
|
| 278 |
break
|
| 279 |
|
| 280 |
-
# Get action from LLM
|
| 281 |
action_dict = get_model_action(
|
| 282 |
client=client,
|
| 283 |
task_desc=task_desc,
|
|
@@ -303,7 +337,6 @@ async def run_episode(task_config: dict, client: OpenAI) -> dict:
|
|
| 303 |
last_result = obs.last_tool_result
|
| 304 |
session_state = dict(obs.session_state or {})
|
| 305 |
|
| 306 |
-
# Update history
|
| 307 |
history.append({
|
| 308 |
"step": step,
|
| 309 |
"tool": tool,
|
|
@@ -323,13 +356,14 @@ async def run_episode(task_config: dict, client: OpenAI) -> dict:
|
|
| 323 |
if done:
|
| 324 |
break
|
| 325 |
|
| 326 |
-
#
|
| 327 |
-
#
|
|
|
|
| 328 |
total_reward = sum(rewards)
|
| 329 |
-
#
|
| 330 |
-
|
| 331 |
-
score = max(0.0, min(1.0,
|
| 332 |
-
success =
|
| 333 |
|
| 334 |
except Exception as exc:
|
| 335 |
error_str = str(exc)[:200]
|
|
|
|
| 42 |
|
| 43 |
BENCHMARK = "harvgym"
|
| 44 |
MAX_STEPS = 20
|
| 45 |
+
TEMPERATURE = 0.2 # Lower temp → more deterministic tool calls
|
| 46 |
+
MAX_TOKENS = 1024 # More room for reasoning + JSON
|
| 47 |
SUCCESS_SCORE_THRESHOLD = 0.5
|
| 48 |
|
| 49 |
+
# Task definitions: these descriptions are fallbacks only; when env.reset()
|
| 50 |
+
# returns its own task description, run_episode uses that one instead
|
| 51 |
TASKS = [
|
| 52 |
{
|
| 53 |
"task_name": "har_classify_easy",
|
|
|
|
| 59 |
{
|
| 60 |
"task_name": "har_classify_medium",
|
| 61 |
"template_id": 3,
|
| 62 |
+
"description": "Add 'Radiant Tee' (SKU: MH01-XS-Black) to a guest cart",
|
| 63 |
"app_base_url": "http://ec2-16-59-2-56.us-east-2.compute.amazonaws.com:7770/",
|
| 64 |
"difficulty": "medium",
|
| 65 |
},
|
| 66 |
{
|
| 67 |
"task_name": "har_pipeline_hard",
|
| 68 |
"template_id": 6,
|
| 69 |
+
"description": "Complete a full guest checkout for 'Radiant Tee' (SKU: MH01-XS-Black)",
|
| 70 |
"app_base_url": "http://ec2-16-59-2-56.us-east-2.compute.amazonaws.com:7770/",
|
| 71 |
"difficulty": "hard",
|
| 72 |
},
|
|
|
|
| 83 |
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
|
| 84 |
error_val = error if error else "null"
|
| 85 |
done_val = str(done).lower()
|
|
|
|
| 86 |
action_clean = action.replace("\n", " ").replace("\r", "")[:200]
|
| 87 |
print(
|
| 88 |
f"[STEP] step={step} action={action_clean} reward={reward:.2f} done={done_val} error={error_val}",
|
|
|
|
| 104 |
|
| 105 |
SYSTEM_PROMPT = textwrap.dedent("""
|
| 106 |
You are an API agent. Your goal is to complete a real-world task by calling the correct
|
| 107 |
+
sequence of HTTP API endpoints on a live Magento shopping application.
|
| 108 |
|
| 109 |
+
You have exactly these tools available. Output ONE tool call per turn as a JSON object:
|
| 110 |
|
| 111 |
1. browser_agent(task, url)
|
| 112 |
+
→ Discovers API endpoints for this app from recorded traffic + catalog.
|
| 113 |
+
→ Call this FIRST and ONLY ONCE (at step 1).
|
| 114 |
+
→ Returns: {"app": "shopping", "total_endpoints": N, "endpoints": [...]}
|
| 115 |
|
| 116 |
2. search_endpoints(query)
|
| 117 |
+
→ Semantic/keyword search over all discovered endpoint schemas.
|
| 118 |
+
→ Returns full endpoint details: method, path, required parameters, auth.
|
| 119 |
+
→ Use this to find the exact path and params before making curl calls.
|
| 120 |
+
→ Example: search_endpoints("guest cart create") → POST /rest/V1/guest-carts
|
| 121 |
|
| 122 |
3. curl_exec(command)
|
| 123 |
+
→ Execute an HTTP request. Returns {"status_code": N, "body": {...}}.
|
| 124 |
+
→ Session cookies are auto-injected. Do NOT manually set Cookie headers.
|
| 125 |
+
→ Always use: curl -s -X METHOD 'URL' -H 'Content-Type: application/json' -d '{...}'
|
| 126 |
|
| 127 |
4. search_episode_data(query)
|
| 128 |
+
→ Search all prior API responses in this episode for a value.
|
| 129 |
+
→ Use when a prior response was long and you need to find a specific item/ID.
|
| 130 |
|
| 131 |
5. done(result?)
|
| 132 |
+
→ Call ONLY when the task is fully complete. Do not call early.
|
| 133 |
|
| 134 |
RULES:
|
| 135 |
+
- Output ONLY a JSON object: {"tool": "...", "args": {...}}. No explanation, no markdown.
|
| 136 |
+
- Step 1: ALWAYS call browser_agent to discover endpoints.
|
| 137 |
+
- Step 2+: Use search_endpoints to find the right endpoint before calling curl_exec.
|
| 138 |
+
- Read IDs and values (cart_id, sku, item_id) from LAST TOOL RESULT in the context.
|
| 139 |
+
- Magento REST API base: http://host/rest/V1/
|
| 140 |
+
- To add an item to guest cart: POST /rest/V1/guest-carts/{cartId}/items
|
| 141 |
+
Body: {"cartItem": {"sku": "SKU", "qty": 1, "quote_id": "{cartId}"}}
|
| 142 |
+
- Do NOT call done() until the task is actually accomplished.
|
| 143 |
+
|
| 144 |
+
EXAMPLE SEQUENCE for "Add product to guest cart":
|
| 145 |
+
Step 1: {"tool": "browser_agent", "args": {"task": "Add product to guest cart", "url": "http://..."}}
|
| 146 |
+
Step 2: {"tool": "search_endpoints", "args": {"query": "create guest cart"}}
|
| 147 |
+
Step 3: {"tool": "curl_exec", "args": {"command": "curl -s -X POST 'http://.../rest/V1/guest-carts' -H 'Content-Type: application/json'"}}
|
| 148 |
+
Step 4: {"tool": "search_endpoints", "args": {"query": "add item to guest cart"}}
|
| 149 |
+
Step 5: {"tool": "curl_exec", "args": {"command": "curl -s -X POST 'http://.../rest/V1/guest-carts/CART_ID/items' -H 'Content-Type: application/json' -d '{\"cartItem\":{\"sku\":\"MH01-XS-Black\",\"qty\":1,\"quote_id\":\"CART_ID\"}}'"}}
|
| 150 |
+
Step 6: {"tool": "done", "args": {}}
|
| 151 |
""").strip()
|
| 152 |
|
| 153 |
|
|
|
|
| 155 |
# LLM agent loop
|
| 156 |
# ---------------------------------------------------------------------------
|
| 157 |
|
| 158 |
+
def _format_result_for_context(result: Any, max_chars: int = 3000) -> str:
|
| 159 |
+
"""Format tool result for the LLM context — more generous truncation."""
|
| 160 |
+
if result is None:
|
| 161 |
+
return "null"
|
| 162 |
+
try:
|
| 163 |
+
text = json.dumps(result, indent=2)
|
| 164 |
+
except Exception:
|
| 165 |
+
text = str(result)
|
| 166 |
+
|
| 167 |
+
if len(text) <= max_chars:
|
| 168 |
+
return text
|
| 169 |
+
|
| 170 |
+
# Smart truncation: keep beginning (has structure/IDs) and hint at truncation
|
| 171 |
+
kept = text[:max_chars]
|
| 172 |
+
# Cut back to the last complete line, but only if that line break falls within the final 20% of the limit
|
| 173 |
+
last_newline = kept.rfind("\n")
|
| 174 |
+
if last_newline > max_chars * 0.8:
|
| 175 |
+
kept = kept[:last_newline]
|
| 176 |
+
return kept + f"\n... [truncated, {len(text) - max_chars} chars omitted — use search_episode_data to find specific values]"
|
| 177 |
+
|
| 178 |
+
|
| 179 |
def build_user_prompt(task_desc: str, app_base_url: str, step: int,
|
| 180 |
+
last_result: Any, history: List[dict],
|
| 181 |
+
session_state: dict) -> str:
|
| 182 |
"""Build the user prompt for each step."""
|
| 183 |
+
history_lines = []
|
| 184 |
if history:
|
| 185 |
+
# Show last 8 steps with meaningful result summaries
|
| 186 |
+
for h in history[-8:]:
|
| 187 |
+
result = h.get("result", {})
|
| 188 |
+
# For curl results: show status_code + first 300 chars of body
|
| 189 |
+
if isinstance(result, dict) and "status_code" in result:
|
| 190 |
+
body_preview = str(result.get("body", ""))[:300]
|
| 191 |
+
result_summary = f'status={result["status_code"]} body={body_preview}'
|
| 192 |
+
else:
|
| 193 |
+
result_summary = str(result)[:300]
|
| 194 |
+
history_lines.append(
|
| 195 |
+
f" Step {h['step']}: {h['tool']}({json.dumps(h.get('args', {}))[:100]}) "
|
| 196 |
+
f"→ {result_summary}"
|
| 197 |
+
)
|
| 198 |
|
| 199 |
+
session_str = json.dumps(session_state, indent=2)[:500] if session_state else "{}"
|
| 200 |
+
last_result_str = _format_result_for_context(last_result)
|
| 201 |
|
| 202 |
return textwrap.dedent(f"""
|
| 203 |
TASK: {task_desc}
|
| 204 |
APP URL: {app_base_url}
|
| 205 |
STEP: {step}/{MAX_STEPS}
|
| 206 |
|
| 207 |
+
SESSION STATE (cookies/tokens auto-managed):
|
| 208 |
{session_str}
|
| 209 |
|
| 210 |
LAST TOOL RESULT:
|
| 211 |
{last_result_str}
|
| 212 |
|
| 213 |
+
HISTORY (last {len(history_lines)} steps):
|
| 214 |
+
{chr(10).join(history_lines) if history_lines else " (none yet)"}
|
| 215 |
|
| 216 |
What is your next tool call? Output ONLY the JSON object.
|
| 217 |
""").strip()
|
|
|
|
| 236 |
)
|
| 237 |
text = (completion.choices[0].message.content or "").strip()
|
| 238 |
|
| 239 |
+
# Strip markdown code fences
|
|
|
|
| 240 |
if "```json" in text:
|
| 241 |
text = text.split("```json")[1].split("```")[0].strip()
|
| 242 |
elif "```" in text:
|
| 243 |
text = text.split("```")[1].split("```")[0].strip()
|
| 244 |
|
| 245 |
+
# Extract the span from the first "{" to the last "}"
|
| 246 |
start = text.find("{")
|
| 247 |
end = text.rfind("}") + 1
|
| 248 |
if start >= 0 and end > start:
|
|
|
|
| 252 |
if "tool" in parsed:
|
| 253 |
return parsed
|
| 254 |
|
|
|
|
| 255 |
return {"tool": "done", "args": {"result": "Model returned non-tool response"}}
|
| 256 |
|
| 257 |
except json.JSONDecodeError:
|
| 258 |
+
# Fallback heuristics
|
| 259 |
+
if step == 1:
|
| 260 |
+
return {"tool": "browser_agent", "args": {"task": task_desc, "url": app_base_url}}
|
| 261 |
if "browser_agent" in text:
|
| 262 |
return {"tool": "browser_agent", "args": {"task": task_desc, "url": app_base_url}}
|
| 263 |
+
if "done" in text.lower():
|
| 264 |
return {"tool": "done", "args": {}}
|
| 265 |
+
return {"tool": "done", "args": {"result": f"Parse error: {text[:100]}"}}
|
|
|
|
| 266 |
except Exception as exc:
|
| 267 |
print(f"[DEBUG] LLM call failed: {exc}", flush=True)
|
|
|
|
| 268 |
if step == 1:
|
| 269 |
return {"tool": "browser_agent", "args": {"task": task_desc, "url": app_base_url}}
|
| 270 |
return {"tool": "done", "args": {"result": f"LLM error: {exc}"}}
|
|
|
|
| 277 |
async def run_episode(task_config: dict, client: OpenAI) -> dict:
|
| 278 |
"""
|
| 279 |
Run a single episode for one task.
|
|
|
|
| 280 |
Returns: {"task_name", "success", "steps", "score", "rewards"}
|
| 281 |
"""
|
| 282 |
from server.models import HARvestGymEnvironment, HarvestGymAction
|
|
|
|
| 286 |
task_description = task_config["description"]
|
| 287 |
app_base_url = task_config["app_base_url"]
|
| 288 |
|
| 289 |
+
# Pin the template via env var so reset() samples from the right pool
|
| 290 |
+
os.environ["HARVGYM_TASK"] = task_name # use name, not int
|
| 291 |
|
| 292 |
env = HARvestGymEnvironment()
|
| 293 |
|
|
|
|
| 302 |
log_start(task=task_name, env=BENCHMARK, model=MODEL_NAME)
|
| 303 |
|
| 304 |
try:
|
|
|
|
| 305 |
obs = env.reset()
|
| 306 |
+
# CRITICAL: use the env-sampled task description — the judge grades exactly
|
| 307 |
+
# what env.reset() returned (random category/product), not our hardcoded string.
|
| 308 |
task_desc = obs.task or task_description
|
| 309 |
base_url = obs.app_base_url or app_base_url
|
| 310 |
|
|
|
|
| 312 |
if getattr(obs, "done", False):
|
| 313 |
break
|
| 314 |
|
|
|
|
| 315 |
action_dict = get_model_action(
|
| 316 |
client=client,
|
| 317 |
task_desc=task_desc,
|
|
|
|
| 337 |
last_result = obs.last_tool_result
|
| 338 |
session_state = dict(obs.session_state or {})
|
| 339 |
|
|
|
|
| 340 |
history.append({
|
| 341 |
"step": step,
|
| 342 |
"tool": tool,
|
|
|
|
| 356 |
if done:
|
| 357 |
break
|
| 358 |
|
| 359 |
+
# Score: terminal reward from judge dominates.
|
| 360 |
+
# Reward range by design: terminal success = +2 to +5, terminal fail = -1.5
|
| 361 |
+
# Use a generous baseline so partial credit shows up.
|
| 362 |
total_reward = sum(rewards)
|
| 363 |
+
# Normalise to [0,1]: shift by +1.5 (the minimum), then divide by the full range (ceiling + 1.5)
|
| 364 |
+
# Template 1 max=2, Template 3 max=3.5, Template 6 max=5 → use 5.0 as ceiling
|
| 365 |
+
score = max(0.0, min(1.0, (total_reward + 1.5) / (5.0 + 1.5)))
|
| 366 |
+
success = total_reward >= 0.5 # success requires a summed episode reward of at least 0.5
|
| 367 |
|
| 368 |
except Exception as exc:
|
| 369 |
error_str = str(exc)[:200]
|
openenv.yaml
CHANGED
|
@@ -3,4 +3,4 @@ name: HARvestGym
|
|
| 3 |
type: space
|
| 4 |
runtime: fastapi
|
| 5 |
app: server.app:app
|
| 6 |
-
port:
|
|
|
|
| 3 |
type: space
|
| 4 |
runtime: fastapi
|
| 5 |
app: server.app:app
|
| 6 |
+
port: 8000
|