Spaces:
Sleeping
Sleeping
Commit ·
f776f88
1
Parent(s): f69544d
fix: harden strict score bounds with finite-value guard
Browse files
- inference.py +16 -5
- models.py +6 -1
inference.py
CHANGED
|
@@ -19,6 +19,7 @@ Usage:
|
|
| 19 |
import argparse
|
| 20 |
import asyncio
|
| 21 |
import json
|
|
|
|
| 22 |
import os
|
| 23 |
import sys
|
| 24 |
from typing import List, Optional
|
|
@@ -60,6 +61,17 @@ MAX_STEPS = 200
|
|
| 60 |
SUCCESS_SCORE_THRESHOLD = 0.5
|
| 61 |
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
def log_start(task: str, env: str, model: str) -> None:
|
| 64 |
print(f'[START] task={task} env={env} model={model}', flush=True)
|
| 65 |
|
|
@@ -214,9 +226,9 @@ async def run_episode(url: str, difficulty: str = 'medium', use_llm: bool = Fals
|
|
| 214 |
resp = json.loads(await ws.recv())
|
| 215 |
payload = resp.get('data', {})
|
| 216 |
obs = payload.get('observation', payload)
|
| 217 |
-
raw_reward =
|
| 218 |
-
# Normalize step reward to strictly (0, 1) as required by the grader
|
| 219 |
-
reward =
|
| 220 |
done = payload.get('done', obs.get('done', False))
|
| 221 |
error = payload.get('error', None)
|
| 222 |
|
|
@@ -230,8 +242,7 @@ async def run_episode(url: str, difficulty: str = 'medium', use_llm: bool = Fals
|
|
| 230 |
await ws.send(json.dumps({'type': 'state'}))
|
| 231 |
state_resp = json.loads(await ws.recv())
|
| 232 |
state = state_resp.get('data', {})
|
| 233 |
-
score =
|
| 234 |
-
score = min(max(score, 0.01), 0.99)
|
| 235 |
|
| 236 |
success = score >= SUCCESS_SCORE_THRESHOLD
|
| 237 |
|
|
|
|
| 19 |
import argparse
|
| 20 |
import asyncio
|
| 21 |
import json
|
| 22 |
+
import math
|
| 23 |
import os
|
| 24 |
import sys
|
| 25 |
from typing import List, Optional
|
|
|
|
| 61 |
SUCCESS_SCORE_THRESHOLD = 0.5
|
| 62 |
|
| 63 |
|
| 64 |
+
def _strict_unit_interval(value: object, fallback: float = 0.5) -> float:
|
| 65 |
+
"""Clamp to a strict (0, 1) range and guard non-finite values."""
|
| 66 |
+
try:
|
| 67 |
+
v = float(value)
|
| 68 |
+
except (TypeError, ValueError):
|
| 69 |
+
v = fallback
|
| 70 |
+
if not math.isfinite(v):
|
| 71 |
+
v = fallback
|
| 72 |
+
return min(max(v, 0.01), 0.99)
|
| 73 |
+
|
| 74 |
+
|
| 75 |
def log_start(task: str, env: str, model: str) -> None:
|
| 76 |
print(f'[START] task={task} env={env} model={model}', flush=True)
|
| 77 |
|
|
|
|
| 226 |
resp = json.loads(await ws.recv())
|
| 227 |
payload = resp.get('data', {})
|
| 228 |
obs = payload.get('observation', payload)
|
| 229 |
+
raw_reward = payload.get('reward', obs.get('last_reward', 0.0))
|
| 230 |
+
# Normalize step reward to strictly (0, 1) as required by the grader.
|
| 231 |
+
reward = _strict_unit_interval(raw_reward, fallback=0.5)
|
| 232 |
done = payload.get('done', obs.get('done', False))
|
| 233 |
error = payload.get('error', None)
|
| 234 |
|
|
|
|
| 242 |
await ws.send(json.dumps({'type': 'state'}))
|
| 243 |
state_resp = json.loads(await ws.recv())
|
| 244 |
state = state_resp.get('data', {})
|
| 245 |
+
score = _strict_unit_interval(state.get('score', obs.get('score', 0.5)), fallback=0.5)
|
|
|
|
| 246 |
|
| 247 |
success = score >= SUCCESS_SCORE_THRESHOLD
|
| 248 |
|
models.py
CHANGED
|
@@ -36,5 +36,10 @@ class ContainerObservation(Observation):
|
|
| 36 |
max_height: int = Field(0)
|
| 37 |
difficulty: str = Field("medium")
|
| 38 |
last_reward: float = Field(0.0)
|
| 39 |
-
score: float = Field(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
done: bool = Field(False)
|
|
|
|
| 36 |
max_height: int = Field(0)
|
| 37 |
difficulty: str = Field("medium")
|
| 38 |
last_reward: float = Field(0.0)
|
| 39 |
+
score: float = Field(
|
| 40 |
+
0.5,
|
| 41 |
+
description="Normalized score strictly in (0.0, 1.0)",
|
| 42 |
+
gt=0.0,
|
| 43 |
+
lt=1.0,
|
| 44 |
+
)
|
| 45 |
done: bool = Field(False)
|