Draken1606 committed on
Commit
f776f88
·
1 Parent(s): f69544d

fix: harden strict score bounds with finite-value guard

Browse files
Files changed (2) hide show
  1. inference.py +16 -5
  2. models.py +6 -1
inference.py CHANGED
@@ -19,6 +19,7 @@ Usage:
19
  import argparse
20
  import asyncio
21
  import json
 
22
  import os
23
  import sys
24
  from typing import List, Optional
@@ -60,6 +61,17 @@ MAX_STEPS = 200
60
  SUCCESS_SCORE_THRESHOLD = 0.5
61
 
62
 
 
 
 
 
 
 
 
 
 
 
 
63
  def log_start(task: str, env: str, model: str) -> None:
64
  print(f'[START] task={task} env={env} model={model}', flush=True)
65
 
@@ -214,9 +226,9 @@ async def run_episode(url: str, difficulty: str = 'medium', use_llm: bool = Fals
214
  resp = json.loads(await ws.recv())
215
  payload = resp.get('data', {})
216
  obs = payload.get('observation', payload)
217
- raw_reward = float(payload.get('reward', obs.get('last_reward', 0.0)) or obs.get('last_reward', 0.0))
218
- # Normalize step reward to strictly (0, 1) as required by the grader
219
- reward = min(max(raw_reward, 0.01), 0.99)
220
  done = payload.get('done', obs.get('done', False))
221
  error = payload.get('error', None)
222
 
@@ -230,8 +242,7 @@ async def run_episode(url: str, difficulty: str = 'medium', use_llm: bool = Fals
230
  await ws.send(json.dumps({'type': 'state'}))
231
  state_resp = json.loads(await ws.recv())
232
  state = state_resp.get('data', {})
233
- score = float(state.get('score', obs.get('score', 0.5)))
234
- score = min(max(score, 0.01), 0.99)
235
 
236
  success = score >= SUCCESS_SCORE_THRESHOLD
237
 
 
19
  import argparse
20
  import asyncio
21
  import json
22
+ import math
23
  import os
24
  import sys
25
  from typing import List, Optional
 
61
  SUCCESS_SCORE_THRESHOLD = 0.5
62
 
63
 
64
+ def _strict_unit_interval(value: object, fallback: float = 0.5) -> float:
65
+ """Clamp to a strict (0, 1) range and guard non-finite values."""
66
+ try:
67
+ v = float(value)
68
+ except (TypeError, ValueError):
69
+ v = fallback
70
+ if not math.isfinite(v):
71
+ v = fallback
72
+ return min(max(v, 0.01), 0.99)
73
+
74
+
75
  def log_start(task: str, env: str, model: str) -> None:
76
  print(f'[START] task={task} env={env} model={model}', flush=True)
77
 
 
226
  resp = json.loads(await ws.recv())
227
  payload = resp.get('data', {})
228
  obs = payload.get('observation', payload)
229
+ raw_reward = payload.get('reward', obs.get('last_reward', 0.0))
230
+ # Normalize step reward to strictly (0, 1) as required by the grader.
231
+ reward = _strict_unit_interval(raw_reward, fallback=0.5)
232
  done = payload.get('done', obs.get('done', False))
233
  error = payload.get('error', None)
234
 
 
242
  await ws.send(json.dumps({'type': 'state'}))
243
  state_resp = json.loads(await ws.recv())
244
  state = state_resp.get('data', {})
245
+ score = _strict_unit_interval(state.get('score', obs.get('score', 0.5)), fallback=0.5)
 
246
 
247
  success = score >= SUCCESS_SCORE_THRESHOLD
248
 
models.py CHANGED
@@ -36,5 +36,10 @@ class ContainerObservation(Observation):
36
  max_height: int = Field(0)
37
  difficulty: str = Field("medium")
38
  last_reward: float = Field(0.0)
39
- score: float = Field(0.5, description="Normalized score strictly in (0.0, 1.0)")
 
 
 
 
 
40
  done: bool = Field(False)
 
36
  max_height: int = Field(0)
37
  difficulty: str = Field("medium")
38
  last_reward: float = Field(0.0)
39
+ score: float = Field(
40
+ 0.5,
41
+ description="Normalized score strictly in (0.0, 1.0)",
42
+ gt=0.0,
43
+ lt=1.0,
44
+ )
45
  done: bool = Field(False)