Spaces:

kdcyberdude
/

HARvestGym

Sleeping

App Files Files Community

kdcyberdude committed 14 days ago

Commit

b7eaaaa

verified ·

1 Parent(s): cfe8207

Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

Dockerfile +65 -21
inference.py +97 -63
openenv.yaml +1 -1

Dockerfile CHANGED Viewed

@@ -1,39 +1,83 @@
-# HARvestGym — OpenEnv Environment
-# Uses plain Python slim base so HF Spaces cpu-basic can pull it fast.
-# HF Spaces routes external traffic → container port 7860.
-FROM python:3.11-slim
 WORKDIR /app
-# System deps: curl for healthcheck, git for any VCS deps
 RUN apt-get update && \
-    apt-get install -y --no-install-recommends git curl && \
     rm -rf /var/lib/apt/lists/*
-# Install uv
-RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
-    mv /root/.local/bin/uv /usr/local/bin/uv && \
-    mv /root/.local/bin/uvx /usr/local/bin/uvx 2>/dev/null || true
-# Copy project files
 COPY . /app/env
 WORKDIR /app/env
-# Install dependencies (no sentence-transformers/torch — keyword search only)
-RUN uv sync --no-editable --no-extra embeddings
-# Runtime env vars
-ENV PATH="/app/env/.venv/bin:$PATH"
 ENV PYTHONPATH="/app/env:$PYTHONPATH"
-ENV HARVGYM_NO_EMBED=1
-# HF Spaces expects the app on port 7860
-EXPOSE 7860
-HEALTHCHECK --interval=30s --timeout=5s --start-period=30s --retries=5 \
-    CMD curl -f http://localhost:7860/health || exit 1
 ENV ENABLE_WEB_INTERFACE=true
-CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 7860"]

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# Multi-stage build using openenv-base
+# This Dockerfile is flexible and works for both:
+# - In-repo environments (with local OpenEnv sources)
+# - Standalone environments (with openenv from PyPI/Git)
+# The build script (openenv build) handles context detection and sets appropriate build args.
+ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
+FROM ${BASE_IMAGE} AS builder
 WORKDIR /app
+# Ensure git is available (required for installing dependencies from VCS)
 RUN apt-get update && \
+    apt-get install -y --no-install-recommends git && \
     rm -rf /var/lib/apt/lists/*
+# Build argument to control whether we're building standalone or in-repo
+ARG BUILD_MODE=in-repo
+ARG ENV_NAME=HARvestGym
+# Copy environment code (always at root of build context)
 COPY . /app/env
+# For in-repo builds, openenv is already vendored in the build context
+# For standalone builds, openenv will be installed via pyproject.toml
 WORKDIR /app/env
+# Ensure uv is available (for local builds where base image lacks it)
+RUN if ! command -v uv >/dev/null 2>&1; then \
+        curl -LsSf https://astral.sh/uv/install.sh | sh && \
+        mv /root/.local/bin/uv /usr/local/bin/uv && \
+        mv /root/.local/bin/uvx /usr/local/bin/uvx; \
+    fi
+# Install dependencies using uv sync (no sentence-transformers/torch — keyword search only)
+RUN --mount=type=cache,target=/root/.cache/uv \
+    if [ -f uv.lock ]; then \
+        uv sync --frozen --no-install-project --no-editable --no-extra embeddings; \
+    else \
+        uv sync --no-install-project --no-editable --no-extra embeddings; \
+    fi
+RUN --mount=type=cache,target=/root/.cache/uv \
+    if [ -f uv.lock ]; then \
+        uv sync --frozen --no-editable --no-extra embeddings; \
+    else \
+        uv sync --no-editable --no-extra embeddings; \
+    fi
+# Final runtime stage
+FROM ${BASE_IMAGE}
+WORKDIR /app
+# Copy the virtual environment from builder
+COPY --from=builder /app/env/.venv /app/.venv
+# Copy the environment code
+COPY --from=builder /app/env /app/env
+# Set PATH to use the virtual environment
+ENV PATH="/app/.venv/bin:$PATH"
+# Set PYTHONPATH so imports work correctly
 ENV PYTHONPATH="/app/env:$PYTHONPATH"
+# Disable embedding model download (keyword search only, fits cpu-basic RAM)
+ENV HARVGYM_NO_EMBED=1
+# Health check
+HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
+    CMD curl -f http://localhost:8000/health || exit 1
+# Run the FastAPI server
+# The module path is constructed to work with the /app/env structure
 ENV ENABLE_WEB_INTERFACE=true
+CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]

inference.py CHANGED Viewed

@@ -42,11 +42,12 @@ if not HF_TOKEN:
 BENCHMARK = "harvgym"
 MAX_STEPS = 20
-TEMPERATURE = 0.7
-MAX_TOKENS = 512
 SUCCESS_SCORE_THRESHOLD = 0.5
-# Task definitions for inference
 TASKS = [
     {
         "task_name": "har_classify_easy",
@@ -58,14 +59,14 @@ TASKS = [
     {
         "task_name": "har_classify_medium",
         "template_id": 3,
-        "description": "Add 'Radiant Tee' to a guest cart",
         "app_base_url": "http://ec2-16-59-2-56.us-east-2.compute.amazonaws.com:7770/",
         "difficulty": "medium",
     },
     {
         "task_name": "har_pipeline_hard",
         "template_id": 6,
-        "description": "Complete a guest checkout for 'Radiant Tee'",
         "app_base_url": "http://ec2-16-59-2-56.us-east-2.compute.amazonaws.com:7770/",
         "difficulty": "hard",
     },
@@ -82,7 +83,6 @@ def log_start(task: str, env: str, model: str) -> None:
 def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
     error_val = error if error else "null"
     done_val = str(done).lower()
-    # Sanitize action: no newlines
     action_clean = action.replace("\n", " ").replace("\r", "")[:200]
     print(
         f"[STEP] step={step} action={action_clean} reward={reward:.2f} done={done_val} error={error_val}",
@@ -104,40 +104,50 @@ def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> No
 SYSTEM_PROMPT = textwrap.dedent("""
 You are an API agent. Your goal is to complete a real-world task by calling the correct
-sequence of HTTP API endpoints on a live web application.
-You have exactly these tools available (output ONE tool call per turn as JSON):
 1. browser_agent(task, url)
-   → Discovers which API endpoints exist for this app. Call this FIRST and ONLY ONCE.
-   → Returns: list of {method, path} endpoint names (no schemas)
 2. search_endpoints(query)
-   → Semantic search for endpoint schemas. Use after browser_agent to get full details.
-   → Example: search_endpoints("create guest cart") returns method, path, auth, params
 3. curl_exec(command)
-   → Execute an HTTP call. Returns {status_code, headers, body}.
-   → Use full curl syntax: curl -X POST 'URL' -H 'Content-Type: application/json' -d '{...}'
-   → Session cookies are auto-injected; you do NOT need to set Cookie headers manually.
 4. search_episode_data(query)
-   → Search all prior API responses in this episode for a specific value.
-   → Use when a response list was truncated and you need a specific item.
 5. done(result?)
-   → Call when the task is complete.
 RULES:
-- Output ONLY a single JSON object with keys "tool" and "args". Nothing else.
-- Call browser_agent exactly once at step 1.
-- Read values from prior responses (cart_id, sku, tokens) from the history.
-- For Magento Shopping API (port 7770/7780): use Content-Type: application/json
-- For Forum Postmill (port 9999): use Content-Type: application/x-www-form-urlencoded for login/post
-- For Wikipedia (port 8888): GET requests only
-EXAMPLE output format:
-{"tool": "curl_exec", "args": {"command": "curl -X POST 'http://ec2-.../rest/V1/guest-carts' -H 'Content-Type: application/json'"}}
 """).strip()
@@ -145,36 +155,63 @@ EXAMPLE output format:
 # LLM agent loop
 # ---------------------------------------------------------------------------
 def build_user_prompt(task_desc: str, app_base_url: str, step: int,
-                       last_result: Any, history: List[dict],
-                       session_state: dict) -> str:
     """Build the user prompt for each step."""
-    history_str = ""
     if history:
-        recent = history[-6:]  # Last 6 steps to stay within context
-        lines = []
-        for h in recent:
-            result_str = json.dumps(h.get("result", ""))[:500]
-            lines.append(f"  Step {h['step']}: {h['tool']}({h.get('args', {})}) → {result_str}")
-        history_str = "\n".join(lines)
-    session_str = json.dumps(session_state, indent=2)[:300] if session_state else "{}"
-    last_result_str = json.dumps(last_result)[:800] if last_result is not None else "null"
     return textwrap.dedent(f"""
     TASK: {task_desc}
     APP URL: {app_base_url}
     STEP: {step}/{MAX_STEPS}
-    SESSION STATE (auto-managed cookies/tokens):
     {session_str}
     LAST TOOL RESULT:
     {last_result_str}
-    RECENT HISTORY:
-    {history_str if history_str else "  (none yet)"}
     What is your next tool call? Output ONLY the JSON object.
     """).strip()
@@ -199,14 +236,13 @@ def get_model_action(client: OpenAI, task_desc: str, app_base_url: str,
         )
         text = (completion.choices[0].message.content or "").strip()
-        # Parse JSON from response
-        # Handle markdown code blocks
         if "```json" in text:
             text = text.split("```json")[1].split("```")[0].strip()
         elif "```" in text:
             text = text.split("```")[1].split("```")[0].strip()
-        # Find first { ... } block
         start = text.find("{")
         end = text.rfind("}") + 1
         if start >= 0 and end > start:
@@ -216,20 +252,19 @@ def get_model_action(client: OpenAI, task_desc: str, app_base_url: str,
         if "tool" in parsed:
             return parsed
-        # LLM returned something else — default to done
         return {"tool": "done", "args": {"result": "Model returned non-tool response"}}
     except json.JSONDecodeError:
-        # Couldn't parse JSON — try to extract tool name at minimum
         if "browser_agent" in text:
             return {"tool": "browser_agent", "args": {"task": task_desc, "url": app_base_url}}
-        elif "done" in text.lower():
             return {"tool": "done", "args": {}}
-        else:
-            return {"tool": "done", "args": {"result": f"Parse error: {text[:100]}"}}
     except Exception as exc:
         print(f"[DEBUG] LLM call failed: {exc}", flush=True)
-        # Default to browser_agent on first step, done otherwise
         if step == 1:
             return {"tool": "browser_agent", "args": {"task": task_desc, "url": app_base_url}}
         return {"tool": "done", "args": {"result": f"LLM error: {exc}"}}
@@ -242,7 +277,6 @@ def get_model_action(client: OpenAI, task_desc: str, app_base_url: str,
 async def run_episode(task_config: dict, client: OpenAI) -> dict:
     """
     Run a single episode for one task.
     Returns: {"task_name", "success", "steps", "score", "rewards"}
     """
     from server.models import HARvestGymEnvironment, HarvestGymAction
@@ -252,8 +286,8 @@ async def run_episode(task_config: dict, client: OpenAI) -> dict:
     task_description = task_config["description"]
     app_base_url = task_config["app_base_url"]
-    # Configure environment for this task
-    os.environ["HARVGYM_TASK"] = str(template_id)
     env = HARvestGymEnvironment()
@@ -268,8 +302,9 @@ async def run_episode(task_config: dict, client: OpenAI) -> dict:
     log_start(task=task_name, env=BENCHMARK, model=MODEL_NAME)
     try:
-        # Reset
         obs = env.reset()
         task_desc = obs.task or task_description
         base_url = obs.app_base_url or app_base_url
@@ -277,7 +312,6 @@ async def run_episode(task_config: dict, client: OpenAI) -> dict:
             if getattr(obs, "done", False):
                 break
-            # Get action from LLM
             action_dict = get_model_action(
                 client=client,
                 task_desc=task_desc,
@@ -303,7 +337,6 @@ async def run_episode(task_config: dict, client: OpenAI) -> dict:
                 last_result = obs.last_tool_result
                 session_state = dict(obs.session_state or {})
-                # Update history
                 history.append({
                     "step": step,
                     "tool": tool,
@@ -323,13 +356,14 @@ async def run_episode(task_config: dict, client: OpenAI) -> dict:
             if done:
                 break
-        # Compute episode score from cumulative rewards
-        # Normalize: terminal reward dominates; clamp to [0, 1]
         total_reward = sum(rewards)
-        # Map reward to [0, 1]: reward range is roughly [-1.5, +7.5] per design
-        score = (total_reward + 1.5) / 9.0
-        score = max(0.0, min(1.0, score))
-        success = score >= SUCCESS_SCORE_THRESHOLD
     except Exception as exc:
         error_str = str(exc)[:200]

 BENCHMARK = "harvgym"
 MAX_STEPS = 20
+TEMPERATURE = 0.2        # Lower temp → more deterministic tool calls
+MAX_TOKENS = 1024        # More room for reasoning + JSON
 SUCCESS_SCORE_THRESHOLD = 0.5
+# Task definitions: the descriptions below are fallbacks. run_episode() uses the
+# task text returned by env.reset() when present and falls back to these otherwise.
 TASKS = [
     {
         "task_name": "har_classify_easy",
     {
         "task_name": "har_classify_medium",
         "template_id": 3,
+        "description": "Add 'Radiant Tee' (SKU: MH01-XS-Black) to a guest cart",
         "app_base_url": "http://ec2-16-59-2-56.us-east-2.compute.amazonaws.com:7770/",
         "difficulty": "medium",
     },
     {
         "task_name": "har_pipeline_hard",
         "template_id": 6,
+        "description": "Complete a full guest checkout for 'Radiant Tee' (SKU: MH01-XS-Black)",
         "app_base_url": "http://ec2-16-59-2-56.us-east-2.compute.amazonaws.com:7770/",
         "difficulty": "hard",
     },
 def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
     error_val = error if error else "null"
     done_val = str(done).lower()
     action_clean = action.replace("\n", " ").replace("\r", "")[:200]
     print(
         f"[STEP] step={step} action={action_clean} reward={reward:.2f} done={done_val} error={error_val}",
 SYSTEM_PROMPT = textwrap.dedent("""
 You are an API agent. Your goal is to complete a real-world task by calling the correct
+sequence of HTTP API endpoints on a live Magento shopping application.
+You have exactly these tools available. Output ONE tool call per turn as a JSON object:
 1. browser_agent(task, url)
+   → Discovers API endpoints for this app from recorded traffic + catalog.
+   → Call this FIRST and ONLY ONCE (at step 1).
+   → Returns: {"app": "shopping", "total_endpoints": N, "endpoints": [...]}
 2. search_endpoints(query)
+   → Semantic/keyword search over all discovered endpoint schemas.
+   → Returns full endpoint details: method, path, required parameters, auth.
+   → Use this to find the exact path and params before making curl calls.
+   → Example: search_endpoints("guest cart create") → POST /rest/V1/guest-carts
 3. curl_exec(command)
+   → Execute an HTTP request. Returns {"status_code": N, "body": {...}}.
+   → Session cookies are auto-injected. Do NOT manually set Cookie headers.
+   → Always use: curl -s -X METHOD 'URL' -H 'Content-Type: application/json' -d '{...}'
 4. search_episode_data(query)
+   → Search all prior API responses in this episode for a value.
+   → Use when a prior response was long and you need to find a specific item/ID.
 5. done(result?)
+   → Call ONLY when the task is fully complete. Do not call early.
 RULES:
+- Output ONLY a JSON object: {"tool": "...", "args": {...}}. No explanation, no markdown.
+- Step 1: ALWAYS call browser_agent to discover endpoints.
+- Step 2+: Use search_endpoints to find the right endpoint before calling curl_exec.
+- Read IDs and values (cart_id, sku, item_id) from LAST TOOL RESULT in the context.
+- Magento REST API base: http://host/rest/V1/
+- To add an item to guest cart: POST /rest/V1/guest-carts/{cartId}/items
+  Body: {"cartItem": {"sku": "SKU", "qty": 1, "quote_id": "{cartId}"}}
+- Do NOT call done() until the task is actually accomplished.
+EXAMPLE SEQUENCE for "Add product to guest cart":
+  Step 1: {"tool": "browser_agent", "args": {"task": "Add product to guest cart", "url": "http://..."}}
+  Step 2: {"tool": "search_endpoints", "args": {"query": "create guest cart"}}
+  Step 3: {"tool": "curl_exec", "args": {"command": "curl -s -X POST 'http://.../rest/V1/guest-carts' -H 'Content-Type: application/json'"}}
+  Step 4: {"tool": "search_endpoints", "args": {"query": "add item to guest cart"}}
+  Step 5: {"tool": "curl_exec", "args": {"command": "curl -s -X POST 'http://.../rest/V1/guest-carts/CART_ID/items' -H 'Content-Type: application/json' -d '{\"cartItem\":{\"sku\":\"MH01-XS-Black\",\"qty\":1,\"quote_id\":\"CART_ID\"}}'"}}
+  Step 6: {"tool": "done", "args": {}}
 """).strip()
 # LLM agent loop
 # ---------------------------------------------------------------------------
+def _format_result_for_context(result: Any, max_chars: int = 3000) -> str:
+    """Format tool result for the LLM context — more generous truncation."""
+    if result is None:
+        return "null"
+    try:
+        text = json.dumps(result, indent=2)
+    except Exception:
+        text = str(result)
+    if len(text) <= max_chars:
+        return text
+    # Smart truncation: keep beginning (has structure/IDs) and hint at truncation
+    kept = text[:max_chars]
+    # Prefer to cut at the last complete line so no line is left half-written
+    last_newline = kept.rfind("\n")
+    if last_newline > max_chars * 0.8:
+        kept = kept[:last_newline]
+    return kept + f"\n... [truncated, {len(text) - max_chars} chars omitted — use search_episode_data to find specific values]"
 def build_user_prompt(task_desc: str, app_base_url: str, step: int,
+                      last_result: Any, history: List[dict],
+                      session_state: dict) -> str:
     """Build the user prompt for each step."""
+    history_lines = []
     if history:
+        # Show last 8 steps with meaningful result summaries
+        for h in history[-8:]:
+            result = h.get("result", {})
+            # For curl results: show status_code + first 300 chars of body
+            if isinstance(result, dict) and "status_code" in result:
+                body_preview = str(result.get("body", ""))[:300]
+                result_summary = f'status={result["status_code"]} body={body_preview}'
+            else:
+                result_summary = str(result)[:300]
+            history_lines.append(
+                f"  Step {h['step']}: {h['tool']}({json.dumps(h.get('args', {}))[:100]}) "
+                f"→ {result_summary}"
+            )
+    session_str = json.dumps(session_state, indent=2)[:500] if session_state else "{}"
+    last_result_str = _format_result_for_context(last_result)
     return textwrap.dedent(f"""
     TASK: {task_desc}
     APP URL: {app_base_url}
     STEP: {step}/{MAX_STEPS}
+    SESSION STATE (cookies/tokens auto-managed):
     {session_str}
     LAST TOOL RESULT:
     {last_result_str}
+    HISTORY (last {len(history_lines)} steps):
+    {chr(10).join(history_lines) if history_lines else "  (none yet)"}
     What is your next tool call? Output ONLY the JSON object.
     """).strip()
         )
         text = (completion.choices[0].message.content or "").strip()
+        # Strip markdown code fences
         if "```json" in text:
             text = text.split("```json")[1].split("```")[0].strip()
         elif "```" in text:
             text = text.split("```")[1].split("```")[0].strip()
+        # Extract first {...} block
         start = text.find("{")
         end = text.rfind("}") + 1
         if start >= 0 and end > start:
         if "tool" in parsed:
             return parsed
         return {"tool": "done", "args": {"result": "Model returned non-tool response"}}
     except json.JSONDecodeError:
+        # Fallback heuristics
+        if step == 1:
+            return {"tool": "browser_agent", "args": {"task": task_desc, "url": app_base_url}}
         if "browser_agent" in text:
             return {"tool": "browser_agent", "args": {"task": task_desc, "url": app_base_url}}
+        if "done" in text.lower():
             return {"tool": "done", "args": {}}
+        return {"tool": "done", "args": {"result": f"Parse error: {text[:100]}"}}
     except Exception as exc:
         print(f"[DEBUG] LLM call failed: {exc}", flush=True)
         if step == 1:
             return {"tool": "browser_agent", "args": {"task": task_desc, "url": app_base_url}}
         return {"tool": "done", "args": {"result": f"LLM error: {exc}"}}
 async def run_episode(task_config: dict, client: OpenAI) -> dict:
     """
     Run a single episode for one task.
     Returns: {"task_name", "success", "steps", "score", "rewards"}
     """
     from server.models import HARvestGymEnvironment, HarvestGymAction
     task_description = task_config["description"]
     app_base_url = task_config["app_base_url"]
+    # Pin the template via env var so reset() samples from the right pool
+    os.environ["HARVGYM_TASK"] = task_name   # use name, not int
     env = HARvestGymEnvironment()
     log_start(task=task_name, env=BENCHMARK, model=MODEL_NAME)
     try:
         obs = env.reset()
+        # CRITICAL: use the env-sampled task description — the judge grades exactly
+        # what env.reset() returned (random category/product), not our hardcoded string.
         task_desc = obs.task or task_description
         base_url = obs.app_base_url or app_base_url
             if getattr(obs, "done", False):
                 break
             action_dict = get_model_action(
                 client=client,
                 task_desc=task_desc,
                 last_result = obs.last_tool_result
                 session_state = dict(obs.session_state or {})
                 history.append({
                     "step": step,
                     "tool": tool,
             if done:
                 break
+        # Score: terminal reward from judge dominates.
+        # Reward range by design: terminal success = +2 to +5, terminal fail = -1.5
+        # Use a generous baseline so partial credit shows up.
         total_reward = sum(rewards)
+        # Normalise to [0,1]: shift by +1.5 (min), divide by max-possible per task
+        # Template 1 max=2, Template 3 max=3.5, Template 6 max=5 → use 5.0 as ceiling
+        score = max(0.0, min(1.0, (total_reward + 1.5) / (5.0 + 1.5)))
+        success = total_reward >= 0.5   # any positive terminal reward = success
     except Exception as exc:
         error_str = str(exc)[:200]

openenv.yaml CHANGED Viewed

@@ -3,4 +3,4 @@ name: HARvestGym
 type: space
 runtime: fastapi
 app: server.app:app
-port: 7860

 type: space
 runtime: fastapi
 app: server.app:app
+port: 8000