kdcyberdude committed on
Commit
b7eaaaa
·
verified ·
1 Parent(s): cfe8207

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. Dockerfile +65 -21
  2. inference.py +97 -63
  3. openenv.yaml +1 -1
Dockerfile CHANGED
@@ -1,39 +1,83 @@
1
- # HARvestGym OpenEnv Environment
2
- # Uses plain Python slim base so HF Spaces cpu-basic can pull it fast.
3
- # HF Spaces routes external traffic → container port 7860.
 
 
4
 
5
- FROM python:3.11-slim
 
 
 
 
 
 
 
6
 
7
  WORKDIR /app
8
 
9
- # System deps: curl for healthcheck, git for any VCS deps
10
  RUN apt-get update && \
11
- apt-get install -y --no-install-recommends git curl && \
12
  rm -rf /var/lib/apt/lists/*
13
 
14
- # Install uv
15
- RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
16
- mv /root/.local/bin/uv /usr/local/bin/uv && \
17
- mv /root/.local/bin/uvx /usr/local/bin/uvx 2>/dev/null || true
18
 
19
- # Copy project files
20
  COPY . /app/env
21
 
 
 
22
  WORKDIR /app/env
23
 
24
- # Install dependencies (no sentence-transformers/torch keyword search only)
25
- RUN uv sync --no-editable --no-extra embeddings
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
- # Runtime env vars
28
- ENV PATH="/app/env/.venv/bin:$PATH"
 
 
 
 
 
 
 
 
29
  ENV PYTHONPATH="/app/env:$PYTHONPATH"
30
- ENV HARVGYM_NO_EMBED=1
31
 
32
- # HF Spaces expects the app on port 7860
33
- EXPOSE 7860
34
 
35
- HEALTHCHECK --interval=30s --timeout=5s --start-period=30s --retries=5 \
36
- CMD curl -f http://localhost:7860/health || exit 1
 
37
 
 
 
38
  ENV ENABLE_WEB_INTERFACE=true
39
- CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 7860"]
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
 
7
+ # Multi-stage build using openenv-base
8
+ # This Dockerfile is flexible and works for both:
9
+ # - In-repo environments (with local OpenEnv sources)
10
+ # - Standalone environments (with openenv from PyPI/Git)
11
+ # The build script (openenv build) handles context detection and sets appropriate build args.
12
+
13
+ ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
14
+ FROM ${BASE_IMAGE} AS builder
15
 
16
  WORKDIR /app
17
 
18
+ # Ensure git is available (required for installing dependencies from VCS)
19
  RUN apt-get update && \
20
+ apt-get install -y --no-install-recommends git && \
21
  rm -rf /var/lib/apt/lists/*
22
 
23
+ # Build argument to control whether we're building standalone or in-repo
24
+ ARG BUILD_MODE=in-repo
25
+ ARG ENV_NAME=HARvestGym
 
26
 
27
+ # Copy environment code (always at root of build context)
28
  COPY . /app/env
29
 
30
+ # For in-repo builds, openenv is already vendored in the build context
31
+ # For standalone builds, openenv will be installed via pyproject.toml
32
  WORKDIR /app/env
33
 
34
+ # Ensure uv is available (for local builds where base image lacks it)
35
+ RUN if ! command -v uv >/dev/null 2>&1; then \
36
+ curl -LsSf https://astral.sh/uv/install.sh | sh && \
37
+ mv /root/.local/bin/uv /usr/local/bin/uv && \
38
+ mv /root/.local/bin/uvx /usr/local/bin/uvx; \
39
+ fi
40
+
41
+ # Install dependencies using uv sync (no sentence-transformers/torch — keyword search only)
42
+ RUN --mount=type=cache,target=/root/.cache/uv \
43
+ if [ -f uv.lock ]; then \
44
+ uv sync --frozen --no-install-project --no-editable --no-extra embeddings; \
45
+ else \
46
+ uv sync --no-install-project --no-editable --no-extra embeddings; \
47
+ fi
48
+
49
+ RUN --mount=type=cache,target=/root/.cache/uv \
50
+ if [ -f uv.lock ]; then \
51
+ uv sync --frozen --no-editable --no-extra embeddings; \
52
+ else \
53
+ uv sync --no-editable --no-extra embeddings; \
54
+ fi
55
+
56
+ # Final runtime stage
57
+ FROM ${BASE_IMAGE}
58
+
59
+ WORKDIR /app
60
 
61
+ # Copy the virtual environment from builder
62
+ COPY --from=builder /app/env/.venv /app/.venv
63
+
64
+ # Copy the environment code
65
+ COPY --from=builder /app/env /app/env
66
+
67
+ # Set PATH to use the virtual environment
68
+ ENV PATH="/app/.venv/bin:$PATH"
69
+
70
+ # Set PYTHONPATH so imports work correctly
71
  ENV PYTHONPATH="/app/env:$PYTHONPATH"
 
72
 
73
+ # Disable embedding model download (keyword search only, fits cpu-basic RAM)
74
+ ENV HARVGYM_NO_EMBED=1
75
 
76
+ # Health check
77
+ HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
78
+ CMD curl -f http://localhost:8000/health || exit 1
79
 
80
+ # Run the FastAPI server
81
+ # The module path is constructed to work with the /app/env structure
82
  ENV ENABLE_WEB_INTERFACE=true
83
+ CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
inference.py CHANGED
@@ -42,11 +42,12 @@ if not HF_TOKEN:
42
 
43
  BENCHMARK = "harvgym"
44
  MAX_STEPS = 20
45
- TEMPERATURE = 0.7
46
- MAX_TOKENS = 512
47
  SUCCESS_SCORE_THRESHOLD = 0.5
48
 
49
- # Task definitions for inference
 
50
  TASKS = [
51
  {
52
  "task_name": "har_classify_easy",
@@ -58,14 +59,14 @@ TASKS = [
58
  {
59
  "task_name": "har_classify_medium",
60
  "template_id": 3,
61
- "description": "Add 'Radiant Tee' to a guest cart",
62
  "app_base_url": "http://ec2-16-59-2-56.us-east-2.compute.amazonaws.com:7770/",
63
  "difficulty": "medium",
64
  },
65
  {
66
  "task_name": "har_pipeline_hard",
67
  "template_id": 6,
68
- "description": "Complete a guest checkout for 'Radiant Tee'",
69
  "app_base_url": "http://ec2-16-59-2-56.us-east-2.compute.amazonaws.com:7770/",
70
  "difficulty": "hard",
71
  },
@@ -82,7 +83,6 @@ def log_start(task: str, env: str, model: str) -> None:
82
  def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
83
  error_val = error if error else "null"
84
  done_val = str(done).lower()
85
- # Sanitize action: no newlines
86
  action_clean = action.replace("\n", " ").replace("\r", "")[:200]
87
  print(
88
  f"[STEP] step={step} action={action_clean} reward={reward:.2f} done={done_val} error={error_val}",
@@ -104,40 +104,50 @@ def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> No
104
 
105
  SYSTEM_PROMPT = textwrap.dedent("""
106
  You are an API agent. Your goal is to complete a real-world task by calling the correct
107
- sequence of HTTP API endpoints on a live web application.
108
 
109
- You have exactly these tools available (output ONE tool call per turn as JSON):
110
 
111
  1. browser_agent(task, url)
112
- → Discovers which API endpoints exist for this app. Call this FIRST and ONLY ONCE.
113
- Returns: list of {method, path} endpoint names (no schemas)
 
114
 
115
  2. search_endpoints(query)
116
- → Semantic search for endpoint schemas. Use after browser_agent to get full details.
117
- Example: search_endpoints("create guest cart") returns method, path, auth, params
 
 
118
 
119
  3. curl_exec(command)
120
- → Execute an HTTP call. Returns {status_code, headers, body}.
121
- Use full curl syntax: curl -X POST 'URL' -H 'Content-Type: application/json' -d '{...}'
122
- Session cookies are auto-injected; you do NOT need to set Cookie headers manually.
123
 
124
  4. search_episode_data(query)
125
- → Search all prior API responses in this episode for a specific value.
126
- → Use when a response list was truncated and you need a specific item.
127
 
128
  5. done(result?)
129
- → Call when the task is complete.
130
 
131
  RULES:
132
- - Output ONLY a single JSON object with keys "tool" and "args". Nothing else.
133
- - Call browser_agent exactly once at step 1.
134
- - Read values from prior responses (cart_id, sku, tokens) from the history.
135
- - For Magento Shopping API (port 7770/7780): use Content-Type: application/json
136
- - For Forum Postmill (port 9999): use Content-Type: application/x-www-form-urlencoded for login/post
137
- - For Wikipedia (port 8888): GET requests only
138
-
139
- EXAMPLE output format:
140
- {"tool": "curl_exec", "args": {"command": "curl -X POST 'http://ec2-.../rest/V1/guest-carts' -H 'Content-Type: application/json'"}}
 
 
 
 
 
 
 
141
  """).strip()
142
 
143
 
@@ -145,36 +155,63 @@ EXAMPLE output format:
145
  # LLM agent loop
146
  # ---------------------------------------------------------------------------
147
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  def build_user_prompt(task_desc: str, app_base_url: str, step: int,
149
- last_result: Any, history: List[dict],
150
- session_state: dict) -> str:
151
  """Build the user prompt for each step."""
152
- history_str = ""
153
  if history:
154
- recent = history[-6:] # Last 6 steps to stay within context
155
- lines = []
156
- for h in recent:
157
- result_str = json.dumps(h.get("result", ""))[:500]
158
- lines.append(f" Step {h['step']}: {h['tool']}({h.get('args', {})}) {result_str}")
159
- history_str = "\n".join(lines)
160
-
161
- session_str = json.dumps(session_state, indent=2)[:300] if session_state else "{}"
 
 
 
 
 
162
 
163
- last_result_str = json.dumps(last_result)[:800] if last_result is not None else "null"
 
164
 
165
  return textwrap.dedent(f"""
166
  TASK: {task_desc}
167
  APP URL: {app_base_url}
168
  STEP: {step}/{MAX_STEPS}
169
 
170
- SESSION STATE (auto-managed cookies/tokens):
171
  {session_str}
172
 
173
  LAST TOOL RESULT:
174
  {last_result_str}
175
 
176
- RECENT HISTORY:
177
- {history_str if history_str else " (none yet)"}
178
 
179
  What is your next tool call? Output ONLY the JSON object.
180
  """).strip()
@@ -199,14 +236,13 @@ def get_model_action(client: OpenAI, task_desc: str, app_base_url: str,
199
  )
200
  text = (completion.choices[0].message.content or "").strip()
201
 
202
- # Parse JSON from response
203
- # Handle markdown code blocks
204
  if "```json" in text:
205
  text = text.split("```json")[1].split("```")[0].strip()
206
  elif "```" in text:
207
  text = text.split("```")[1].split("```")[0].strip()
208
 
209
- # Find first { ... } block
210
  start = text.find("{")
211
  end = text.rfind("}") + 1
212
  if start >= 0 and end > start:
@@ -216,20 +252,19 @@ def get_model_action(client: OpenAI, task_desc: str, app_base_url: str,
216
  if "tool" in parsed:
217
  return parsed
218
 
219
- # LLM returned something else — default to done
220
  return {"tool": "done", "args": {"result": "Model returned non-tool response"}}
221
 
222
  except json.JSONDecodeError:
223
- # Couldn't parse JSON — try to extract tool name at minimum
 
 
224
  if "browser_agent" in text:
225
  return {"tool": "browser_agent", "args": {"task": task_desc, "url": app_base_url}}
226
- elif "done" in text.lower():
227
  return {"tool": "done", "args": {}}
228
- else:
229
- return {"tool": "done", "args": {"result": f"Parse error: {text[:100]}"}}
230
  except Exception as exc:
231
  print(f"[DEBUG] LLM call failed: {exc}", flush=True)
232
- # Default to browser_agent on first step, done otherwise
233
  if step == 1:
234
  return {"tool": "browser_agent", "args": {"task": task_desc, "url": app_base_url}}
235
  return {"tool": "done", "args": {"result": f"LLM error: {exc}"}}
@@ -242,7 +277,6 @@ def get_model_action(client: OpenAI, task_desc: str, app_base_url: str,
242
  async def run_episode(task_config: dict, client: OpenAI) -> dict:
243
  """
244
  Run a single episode for one task.
245
-
246
  Returns: {"task_name", "success", "steps", "score", "rewards"}
247
  """
248
  from server.models import HARvestGymEnvironment, HarvestGymAction
@@ -252,8 +286,8 @@ async def run_episode(task_config: dict, client: OpenAI) -> dict:
252
  task_description = task_config["description"]
253
  app_base_url = task_config["app_base_url"]
254
 
255
- # Configure environment for this task
256
- os.environ["HARVGYM_TASK"] = str(template_id)
257
 
258
  env = HARvestGymEnvironment()
259
 
@@ -268,8 +302,9 @@ async def run_episode(task_config: dict, client: OpenAI) -> dict:
268
  log_start(task=task_name, env=BENCHMARK, model=MODEL_NAME)
269
 
270
  try:
271
- # Reset
272
  obs = env.reset()
 
 
273
  task_desc = obs.task or task_description
274
  base_url = obs.app_base_url or app_base_url
275
 
@@ -277,7 +312,6 @@ async def run_episode(task_config: dict, client: OpenAI) -> dict:
277
  if getattr(obs, "done", False):
278
  break
279
 
280
- # Get action from LLM
281
  action_dict = get_model_action(
282
  client=client,
283
  task_desc=task_desc,
@@ -303,7 +337,6 @@ async def run_episode(task_config: dict, client: OpenAI) -> dict:
303
  last_result = obs.last_tool_result
304
  session_state = dict(obs.session_state or {})
305
 
306
- # Update history
307
  history.append({
308
  "step": step,
309
  "tool": tool,
@@ -323,13 +356,14 @@ async def run_episode(task_config: dict, client: OpenAI) -> dict:
323
  if done:
324
  break
325
 
326
- # Compute episode score from cumulative rewards
327
- # Normalize: terminal reward dominates; clamp to [0, 1]
 
328
  total_reward = sum(rewards)
329
- # Map reward to [0, 1]: reward range is roughly [-1.5, +7.5] per design
330
- score = (total_reward + 1.5) / 9.0
331
- score = max(0.0, min(1.0, score))
332
- success = score >= SUCCESS_SCORE_THRESHOLD
333
 
334
  except Exception as exc:
335
  error_str = str(exc)[:200]
 
42
 
43
  BENCHMARK = "harvgym"
44
  MAX_STEPS = 20
45
+ TEMPERATURE = 0.2 # Lower temp → more deterministic tool calls
46
+ MAX_TOKENS = 1024 # More room for reasoning + JSON
47
  SUCCESS_SCORE_THRESHOLD = 0.5
48
 
49
+ # Task definitions: each description is a fallback used only when env.reset()
50
+ # returns no task text; the env-sampled description takes precedence.
51
  TASKS = [
52
  {
53
  "task_name": "har_classify_easy",
 
59
  {
60
  "task_name": "har_classify_medium",
61
  "template_id": 3,
62
+ "description": "Add 'Radiant Tee' (SKU: MH01-XS-Black) to a guest cart",
63
  "app_base_url": "http://ec2-16-59-2-56.us-east-2.compute.amazonaws.com:7770/",
64
  "difficulty": "medium",
65
  },
66
  {
67
  "task_name": "har_pipeline_hard",
68
  "template_id": 6,
69
+ "description": "Complete a full guest checkout for 'Radiant Tee' (SKU: MH01-XS-Black)",
70
  "app_base_url": "http://ec2-16-59-2-56.us-east-2.compute.amazonaws.com:7770/",
71
  "difficulty": "hard",
72
  },
 
83
  def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
84
  error_val = error if error else "null"
85
  done_val = str(done).lower()
 
86
  action_clean = action.replace("\n", " ").replace("\r", "")[:200]
87
  print(
88
  f"[STEP] step={step} action={action_clean} reward={reward:.2f} done={done_val} error={error_val}",
 
104
 
105
  SYSTEM_PROMPT = textwrap.dedent("""
106
  You are an API agent. Your goal is to complete a real-world task by calling the correct
107
+ sequence of HTTP API endpoints on a live Magento shopping application.
108
 
109
+ You have exactly these tools available. Output ONE tool call per turn as a JSON object:
110
 
111
  1. browser_agent(task, url)
112
+ → Discovers API endpoints for this app from recorded traffic + catalog.
113
+ Call this FIRST and ONLY ONCE (at step 1).
114
+ → Returns: {"app": "shopping", "total_endpoints": N, "endpoints": [...]}
115
 
116
  2. search_endpoints(query)
117
+ → Semantic/keyword search over all discovered endpoint schemas.
118
+ Returns full endpoint details: method, path, required parameters, auth.
119
+ → Use this to find the exact path and params before making curl calls.
120
+ → Example: search_endpoints("guest cart create") → POST /rest/V1/guest-carts
121
 
122
  3. curl_exec(command)
123
+ → Execute an HTTP request. Returns {"status_code": N, "body": {...}}.
124
+ Session cookies are auto-injected. Do NOT manually set Cookie headers.
125
+ Always use: curl -s -X METHOD 'URL' -H 'Content-Type: application/json' -d '{...}'
126
 
127
  4. search_episode_data(query)
128
+ → Search all prior API responses in this episode for a value.
129
+ → Use when a prior response was long and you need to find a specific item/ID.
130
 
131
  5. done(result?)
132
+ → Call ONLY when the task is fully complete. Do not call early.
133
 
134
  RULES:
135
+ - Output ONLY a JSON object: {"tool": "...", "args": {...}}. No explanation, no markdown.
136
+ - Step 1: ALWAYS call browser_agent to discover endpoints.
137
+ - Step 2+: Use search_endpoints to find the right endpoint before calling curl_exec.
138
+ - Read IDs and values (cart_id, sku, item_id) from LAST TOOL RESULT in the context.
139
+ - Magento REST API base: http://host/rest/V1/
140
+ - To add an item to guest cart: POST /rest/V1/guest-carts/{cartId}/items
141
+ Body: {"cartItem": {"sku": "SKU", "qty": 1, "quote_id": "{cartId}"}}
142
+ - Do NOT call done() until the task is actually accomplished.
143
+
144
+ EXAMPLE SEQUENCE for "Add product to guest cart":
145
+ Step 1: {"tool": "browser_agent", "args": {"task": "Add product to guest cart", "url": "http://..."}}
146
+ Step 2: {"tool": "search_endpoints", "args": {"query": "create guest cart"}}
147
+ Step 3: {"tool": "curl_exec", "args": {"command": "curl -s -X POST 'http://.../rest/V1/guest-carts' -H 'Content-Type: application/json'"}}
148
+ Step 4: {"tool": "search_endpoints", "args": {"query": "add item to guest cart"}}
149
+ Step 5: {"tool": "curl_exec", "args": {"command": "curl -s -X POST 'http://.../rest/V1/guest-carts/CART_ID/items' -H 'Content-Type: application/json' -d '{\"cartItem\":{\"sku\":\"MH01-XS-Black\",\"qty\":1,\"quote_id\":\"CART_ID\"}}'"}}
150
+ Step 6: {"tool": "done", "args": {}}
151
  """).strip()
152
 
153
 
 
155
  # LLM agent loop
156
  # ---------------------------------------------------------------------------
157
 
158
+ def _format_result_for_context(result: Any, max_chars: int = 3000) -> str:
159
+ """Format tool result for the LLM context — more generous truncation."""
160
+ if result is None:
161
+ return "null"
162
+ try:
163
+ text = json.dumps(result, indent=2)
164
+ except Exception:
165
+ text = str(result)
166
+
167
+ if len(text) <= max_chars:
168
+ return text
169
+
170
+ # Smart truncation: keep beginning (has structure/IDs) and hint at truncation
171
+ kept = text[:max_chars]
172
+ # Cut at the last complete line when it falls past 80% of max_chars
173
+ last_newline = kept.rfind("\n")
174
+ if last_newline > max_chars * 0.8:
175
+ kept = kept[:last_newline]
176
+ return kept + f"\n... [truncated, {len(text) - max_chars} chars omitted — use search_episode_data to find specific values]"
177
+
178
+
179
  def build_user_prompt(task_desc: str, app_base_url: str, step: int,
180
+ last_result: Any, history: List[dict],
181
+ session_state: dict) -> str:
182
  """Build the user prompt for each step."""
183
+ history_lines = []
184
  if history:
185
+ # Show last 8 steps with meaningful result summaries
186
+ for h in history[-8:]:
187
+ result = h.get("result", {})
188
+ # For curl results: show status_code + first 300 chars of body
189
+ if isinstance(result, dict) and "status_code" in result:
190
+ body_preview = str(result.get("body", ""))[:300]
191
+ result_summary = f'status={result["status_code"]} body={body_preview}'
192
+ else:
193
+ result_summary = str(result)[:300]
194
+ history_lines.append(
195
+ f" Step {h['step']}: {h['tool']}({json.dumps(h.get('args', {}))[:100]}) "
196
+ f"→ {result_summary}"
197
+ )
198
 
199
+ session_str = json.dumps(session_state, indent=2)[:500] if session_state else "{}"
200
+ last_result_str = _format_result_for_context(last_result)
201
 
202
  return textwrap.dedent(f"""
203
  TASK: {task_desc}
204
  APP URL: {app_base_url}
205
  STEP: {step}/{MAX_STEPS}
206
 
207
+ SESSION STATE (cookies/tokens auto-managed):
208
  {session_str}
209
 
210
  LAST TOOL RESULT:
211
  {last_result_str}
212
 
213
+ HISTORY (last {len(history_lines)} steps):
214
+ {chr(10).join(history_lines) if history_lines else " (none yet)"}
215
 
216
  What is your next tool call? Output ONLY the JSON object.
217
  """).strip()
 
236
  )
237
  text = (completion.choices[0].message.content or "").strip()
238
 
239
+ # Strip markdown code fences
 
240
  if "```json" in text:
241
  text = text.split("```json")[1].split("```")[0].strip()
242
  elif "```" in text:
243
  text = text.split("```")[1].split("```")[0].strip()
244
 
245
+ # Extract first {...} block
246
  start = text.find("{")
247
  end = text.rfind("}") + 1
248
  if start >= 0 and end > start:
 
252
  if "tool" in parsed:
253
  return parsed
254
 
 
255
  return {"tool": "done", "args": {"result": "Model returned non-tool response"}}
256
 
257
  except json.JSONDecodeError:
258
+ # Fallback heuristics
259
+ if step == 1:
260
+ return {"tool": "browser_agent", "args": {"task": task_desc, "url": app_base_url}}
261
  if "browser_agent" in text:
262
  return {"tool": "browser_agent", "args": {"task": task_desc, "url": app_base_url}}
263
+ if "done" in text.lower():
264
  return {"tool": "done", "args": {}}
265
+ return {"tool": "done", "args": {"result": f"Parse error: {text[:100]}"}}
 
266
  except Exception as exc:
267
  print(f"[DEBUG] LLM call failed: {exc}", flush=True)
 
268
  if step == 1:
269
  return {"tool": "browser_agent", "args": {"task": task_desc, "url": app_base_url}}
270
  return {"tool": "done", "args": {"result": f"LLM error: {exc}"}}
 
277
  async def run_episode(task_config: dict, client: OpenAI) -> dict:
278
  """
279
  Run a single episode for one task.
 
280
  Returns: {"task_name", "success", "steps", "score", "rewards"}
281
  """
282
  from server.models import HARvestGymEnvironment, HarvestGymAction
 
286
  task_description = task_config["description"]
287
  app_base_url = task_config["app_base_url"]
288
 
289
+ # Pin the template via env var so reset() samples from the right pool
290
+ os.environ["HARVGYM_TASK"] = task_name # use name, not int
291
 
292
  env = HARvestGymEnvironment()
293
 
 
302
  log_start(task=task_name, env=BENCHMARK, model=MODEL_NAME)
303
 
304
  try:
 
305
  obs = env.reset()
306
+ # CRITICAL: use the env-sampled task description — the judge grades exactly
307
+ # what env.reset() returned (random category/product), not our hardcoded string.
308
  task_desc = obs.task or task_description
309
  base_url = obs.app_base_url or app_base_url
310
 
 
312
  if getattr(obs, "done", False):
313
  break
314
 
 
315
  action_dict = get_model_action(
316
  client=client,
317
  task_desc=task_desc,
 
337
  last_result = obs.last_tool_result
338
  session_state = dict(obs.session_state or {})
339
 
 
340
  history.append({
341
  "step": step,
342
  "tool": tool,
 
356
  if done:
357
  break
358
 
359
+ # Score: terminal reward from judge dominates.
360
+ # Reward range by design: terminal success = +2 to +5, terminal fail = -1.5
361
+ # Use a generous baseline so partial credit shows up.
362
  total_reward = sum(rewards)
363
+ # Normalise to [0,1]: shift by +1.5 (the minimum), then divide by the full range (ceiling + 1.5)
364
+ # Template 1 max=2, Template 3 max=3.5, Template 6 max=5 → use 5.0 as ceiling
365
+ score = max(0.0, min(1.0, (total_reward + 1.5) / (5.0 + 1.5)))
366
+ success = total_reward >= 0.5 # summed reward of at least 0.5 counts as success
367
 
368
  except Exception as exc:
369
  error_str = str(exc)[:200]
openenv.yaml CHANGED
@@ -3,4 +3,4 @@ name: HARvestGym
3
  type: space
4
  runtime: fastapi
5
  app: server.app:app
6
- port: 7860
 
3
  type: space
4
  runtime: fastapi
5
  app: server.app:app
6
+ port: 8000