Charan Sai Mamidala commited on
Commit
dd054aa
·
1 Parent(s): fe47b4f

fix: use 0.001/0.999 bounds — 1e-6 formats as 0.0000 in :.4f stdout which validator reads as 0.0

Browse files
Files changed (2) hide show
  1. env/core.py +5 -5
  2. inference.py +8 -6
env/core.py CHANGED
@@ -287,7 +287,7 @@ class GDPRAuditorEnvironment:
287
  if self._ep is None:
288
  return (
289
  self._error_obs("Environment not reset"),
290
- RewModel(value=1e-6, reason="Environment not initialized", issues_found=0, total_issues=0),
291
  True,
292
  {"error": "Environment not reset. Call /reset first."},
293
  )
@@ -373,10 +373,10 @@ class GDPRAuditorEnvironment:
373
  return None
374
 
375
  def _calculate_reward(self) -> RewModel:
376
- # Scores must be strictly in (0, 1) — never exactly 0.0 or 1.0
377
- _EPSILON = 1e-6
378
- _MIN_SCORE = _EPSILON
379
- _MAX_SCORE = 1.0 - _EPSILON
380
 
381
  task = self._ep.task_config
382
  total_issues = len(task.hidden_issues)
 
287
  if self._ep is None:
288
  return (
289
  self._error_obs("Environment not reset"),
290
+ RewModel(value=0.001, reason="Environment not initialized", issues_found=0, total_issues=0),
291
  True,
292
  {"error": "Environment not reset. Call /reset first."},
293
  )
 
373
  return None
374
 
375
  def _calculate_reward(self) -> RewModel:
376
+ # Scores must be strictly in (0, 1) and visible in :.4f format
377
+ # 1e-6 formats as "0.0000" which the validator reads as 0.0 — use 0.001 minimum
378
+ _MIN_SCORE = 0.001
379
+ _MAX_SCORE = 0.999
380
 
381
  task = self._ep.task_config
382
  total_issues = len(task.hidden_issues)
inference.py CHANGED
@@ -219,7 +219,7 @@ def run_task(client: OpenAI, task_key: str, verbose: bool = False) -> dict:
219
  if not reward_data:
220
  print(f"[DEBUG] No reward in response: {result}", flush=True)
221
  if isinstance(reward_data, dict):
222
- reward = reward_data.get("value", 1e-6) # default to epsilon, not 0.0
223
  else:
224
  reward = float(reward_data)
225
  done = result.get("done", False)
@@ -228,7 +228,7 @@ def run_task(client: OpenAI, task_key: str, verbose: bool = False) -> dict:
228
 
229
  except Exception as exc:
230
  error_msg = str(exc).replace('\n', ' ').replace('\r', '')
231
- reward = 1e-6 # never exactly 0.0 — validator requires strictly (0, 1)
232
  done = True
233
  obs_data = {}
234
 
@@ -240,10 +240,12 @@ def run_task(client: OpenAI, task_key: str, verbose: bool = False) -> dict:
240
  if done:
241
  break
242
 
243
- _EPSILON = 1e-6
244
- score = max(rewards) if rewards else _EPSILON
245
- # Clamp to strictly open interval (0, 1) — validator rejects 0.0 and 1.0
246
- score = max(_EPSILON, min(1.0 - _EPSILON, score))
 
 
247
  success = score >= SUCCESS_SCORE_THRESHOLD
248
 
249
  except Exception as exc:
 
219
  if not reward_data:
220
  print(f"[DEBUG] No reward in response: {result}", flush=True)
221
  if isinstance(reward_data, dict):
222
+ reward = reward_data.get("value", 0.001) # default 0.001, never 0.0
223
  else:
224
  reward = float(reward_data)
225
  done = result.get("done", False)
 
228
 
229
  except Exception as exc:
230
  error_msg = str(exc).replace('\n', ' ').replace('\r', '')
231
+ reward = 0.001 # 1e-6 formats as "0.0000" — use 0.001 so :.4f gives "0.0010"
232
  done = True
233
  obs_data = {}
234
 
 
240
  if done:
241
  break
242
 
243
+ # 1e-6 formats as "0.0000" with :.4f — validator parses that as 0.0 and fails
244
+ # Use 0.001 min so it formats as "0.0010", and 0.999 max so it formats as "0.9990"
245
+ _MIN_SCORE = 0.001
246
+ _MAX_SCORE = 0.999
247
+ score = max(rewards) if rewards else _MIN_SCORE
248
+ score = max(_MIN_SCORE, min(_MAX_SCORE, score))
249
  success = score >= SUCCESS_SCORE_THRESHOLD
250
 
251
  except Exception as exc: