Spaces:
Running
Running
Issue resolved regarding PHASE 2
Browse files - inference.py +9 -11
inference.py
CHANGED
|
@@ -195,8 +195,6 @@ def _get_fallback_action(obs: dict, step: int, actions_taken: list) -> dict:
|
|
| 195 |
|
| 196 |
def run_task(client: OpenAI, task_id: str, seed: int = 42) -> dict:
|
| 197 |
"""Run one complete episode for a task. Returns score + breakdown."""
|
| 198 |
-
print(f"\n Running task: {task_id}...")
|
| 199 |
-
|
| 200 |
# Reset
|
| 201 |
try:
|
| 202 |
resp = requests.post(
|
|
@@ -207,9 +205,11 @@ def run_task(client: OpenAI, task_id: str, seed: int = 42) -> dict:
|
|
| 207 |
resp.raise_for_status()
|
| 208 |
obs = resp.json()
|
| 209 |
except Exception as e:
|
| 210 |
-
print(f"
|
| 211 |
return {"score": 0.0, "error": str(e), "task_id": task_id}
|
| 212 |
|
|
|
|
|
|
|
| 213 |
max_steps = MAX_STEPS_PER_TASK.get(task_id, 10)
|
| 214 |
conversation_history = []
|
| 215 |
actions_taken = []
|
|
@@ -238,10 +238,9 @@ def run_task(client: OpenAI, task_id: str, seed: int = 42) -> dict:
|
|
| 238 |
conversation_history.append({"role": "assistant", "content": response_text})
|
| 239 |
action = _parse_action(response_text)
|
| 240 |
if action is None:
|
| 241 |
-
print(f" Step {steps_taken}: parse failed, using fallback")
|
| 242 |
action = _get_fallback_action(obs, steps_taken, actions_taken)
|
| 243 |
except Exception as e:
|
| 244 |
-
print(f"
|
| 245 |
action = _get_fallback_action(obs, steps_taken, actions_taken)
|
| 246 |
|
| 247 |
# Step environment
|
|
@@ -255,12 +254,10 @@ def run_task(client: OpenAI, task_id: str, seed: int = 42) -> dict:
|
|
| 255 |
obs = step_resp.json()
|
| 256 |
done = obs.get("done", False)
|
| 257 |
reward = obs.get("reward", 0.0)
|
| 258 |
-
feedback = obs.get("last_action_feedback", "")
|
| 259 |
actions_taken.append(action)
|
| 260 |
-
print(f"
|
| 261 |
-
f"-> reward={reward:+.2f} | {feedback[:50]}")
|
| 262 |
except Exception as e:
|
| 263 |
-
print(f"
|
| 264 |
break
|
| 265 |
|
| 266 |
steps_taken += 1
|
|
@@ -274,11 +271,12 @@ def run_task(client: OpenAI, task_id: str, seed: int = 42) -> dict:
|
|
| 274 |
score = grader_result.get("score", 0.0)
|
| 275 |
breakdown = grader_result.get("breakdown", {})
|
| 276 |
except Exception as e:
|
| 277 |
-
print(f"
|
| 278 |
score = obs.get("cumulative_score", 0.0)
|
| 279 |
breakdown = {}
|
| 280 |
|
| 281 |
-
print(f"
|
|
|
|
| 282 |
return {
|
| 283 |
"task_id": task_id,
|
| 284 |
"score": score,
|
|
|
|
| 195 |
|
| 196 |
def run_task(client: OpenAI, task_id: str, seed: int = 42) -> dict:
|
| 197 |
"""Run one complete episode for a task. Returns score + breakdown."""
|
|
|
|
|
|
|
| 198 |
# Reset
|
| 199 |
try:
|
| 200 |
resp = requests.post(
|
|
|
|
| 205 |
resp.raise_for_status()
|
| 206 |
obs = resp.json()
|
| 207 |
except Exception as e:
|
| 208 |
+
print(f"[ERROR] reset task={task_id} error={e}", flush=True)
|
| 209 |
return {"score": 0.0, "error": str(e), "task_id": task_id}
|
| 210 |
|
| 211 |
+
print(f"[START] task={task_id}", flush=True)
|
| 212 |
+
|
| 213 |
max_steps = MAX_STEPS_PER_TASK.get(task_id, 10)
|
| 214 |
conversation_history = []
|
| 215 |
actions_taken = []
|
|
|
|
| 238 |
conversation_history.append({"role": "assistant", "content": response_text})
|
| 239 |
action = _parse_action(response_text)
|
| 240 |
if action is None:
|
|
|
|
| 241 |
action = _get_fallback_action(obs, steps_taken, actions_taken)
|
| 242 |
except Exception as e:
|
| 243 |
+
print(f"[ERROR] step={steps_taken + 1} llm_error={e}", flush=True)
|
| 244 |
action = _get_fallback_action(obs, steps_taken, actions_taken)
|
| 245 |
|
| 246 |
# Step environment
|
|
|
|
| 254 |
obs = step_resp.json()
|
| 255 |
done = obs.get("done", False)
|
| 256 |
reward = obs.get("reward", 0.0)
|
|
|
|
| 257 |
actions_taken.append(action)
|
| 258 |
+
print(f"[STEP] step={steps_taken + 1} reward={reward:.4f}", flush=True)
|
|
|
|
| 259 |
except Exception as e:
|
| 260 |
+
print(f"[ERROR] step={steps_taken + 1} env_error={e}", flush=True)
|
| 261 |
break
|
| 262 |
|
| 263 |
steps_taken += 1
|
|
|
|
| 271 |
score = grader_result.get("score", 0.0)
|
| 272 |
breakdown = grader_result.get("breakdown", {})
|
| 273 |
except Exception as e:
|
| 274 |
+
print(f"[ERROR] grader task={task_id} error={e}", flush=True)
|
| 275 |
score = obs.get("cumulative_score", 0.0)
|
| 276 |
breakdown = {}
|
| 277 |
|
| 278 |
+
print(f"[INFO] Score: {score:.4f} ({steps_taken} steps)", flush=True)
|
| 279 |
+
print(f"[END] task={task_id} score={score:.4f} steps={steps_taken}", flush=True)
|
| 280 |
return {
|
| 281 |
"task_id": task_id,
|
| 282 |
"score": score,
|