Spaces:
Sleeping
Sleeping
Charan Sai Mamidala committed on
Commit ·
dd054aa
1
Parent(s): fe47b4f
fix: use 0.001/0.999 bounds — 1e-6 formats as 0.0000 in :.4f stdout which validator reads as 0.0
Browse files
- env/core.py +5 -5
- inference.py +8 -6
env/core.py
CHANGED
|
@@ -287,7 +287,7 @@ class GDPRAuditorEnvironment:
|
|
| 287 |
if self._ep is None:
|
| 288 |
return (
|
| 289 |
self._error_obs("Environment not reset"),
|
| 290 |
-
RewModel(value=
|
| 291 |
True,
|
| 292 |
{"error": "Environment not reset. Call /reset first."},
|
| 293 |
)
|
|
@@ -373,10 +373,10 @@ class GDPRAuditorEnvironment:
|
|
| 373 |
return None
|
| 374 |
|
| 375 |
def _calculate_reward(self) -> RewModel:
|
| 376 |
-
# Scores must be strictly in (0, 1)
|
| 377 |
-
|
| 378 |
-
_MIN_SCORE =
|
| 379 |
-
_MAX_SCORE =
|
| 380 |
|
| 381 |
task = self._ep.task_config
|
| 382 |
total_issues = len(task.hidden_issues)
|
|
|
|
| 287 |
if self._ep is None:
|
| 288 |
return (
|
| 289 |
self._error_obs("Environment not reset"),
|
| 290 |
+
RewModel(value=0.001, reason="Environment not initialized", issues_found=0, total_issues=0),
|
| 291 |
True,
|
| 292 |
{"error": "Environment not reset. Call /reset first."},
|
| 293 |
)
|
|
|
|
| 373 |
return None
|
| 374 |
|
| 375 |
def _calculate_reward(self) -> RewModel:
|
| 376 |
+
# Scores must be strictly in (0, 1) and visible in :.4f format
|
| 377 |
+
# 1e-6 formats as "0.0000" which the validator reads as 0.0 — use 0.001 minimum
|
| 378 |
+
_MIN_SCORE = 0.001
|
| 379 |
+
_MAX_SCORE = 0.999
|
| 380 |
|
| 381 |
task = self._ep.task_config
|
| 382 |
total_issues = len(task.hidden_issues)
|
inference.py
CHANGED
|
@@ -219,7 +219,7 @@ def run_task(client: OpenAI, task_key: str, verbose: bool = False) -> dict:
|
|
| 219 |
if not reward_data:
|
| 220 |
print(f"[DEBUG] No reward in response: {result}", flush=True)
|
| 221 |
if isinstance(reward_data, dict):
|
| 222 |
-
reward = reward_data.get("value",
|
| 223 |
else:
|
| 224 |
reward = float(reward_data)
|
| 225 |
done = result.get("done", False)
|
|
@@ -228,7 +228,7 @@ def run_task(client: OpenAI, task_key: str, verbose: bool = False) -> dict:
|
|
| 228 |
|
| 229 |
except Exception as exc:
|
| 230 |
error_msg = str(exc).replace('\n', ' ').replace('\r', '')
|
| 231 |
-
reward = 1e-6
|
| 232 |
done = True
|
| 233 |
obs_data = {}
|
| 234 |
|
|
@@ -240,10 +240,12 @@ def run_task(client: OpenAI, task_key: str, verbose: bool = False) -> dict:
|
|
| 240 |
if done:
|
| 241 |
break
|
| 242 |
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
|
|
|
|
|
|
| 247 |
success = score >= SUCCESS_SCORE_THRESHOLD
|
| 248 |
|
| 249 |
except Exception as exc:
|
|
|
|
| 219 |
if not reward_data:
|
| 220 |
print(f"[DEBUG] No reward in response: {result}", flush=True)
|
| 221 |
if isinstance(reward_data, dict):
|
| 222 |
+
reward = reward_data.get("value", 0.001) # default 0.001, never 0.0
|
| 223 |
else:
|
| 224 |
reward = float(reward_data)
|
| 225 |
done = result.get("done", False)
|
|
|
|
| 228 |
|
| 229 |
except Exception as exc:
|
| 230 |
error_msg = str(exc).replace('\n', ' ').replace('\r', '')
|
| 231 |
+
reward = 0.001 # 1e-6 formats as "0.0000" — use 0.001 so :.4f gives "0.0010"
|
| 232 |
done = True
|
| 233 |
obs_data = {}
|
| 234 |
|
|
|
|
| 240 |
if done:
|
| 241 |
break
|
| 242 |
|
| 243 |
+
# 1e-6 formats as "0.0000" with :.4f — validator parses that as 0.0 and fails
|
| 244 |
+
# Use 0.001 min so it formats as "0.0010", and 0.999 max so it formats as "0.9990"
|
| 245 |
+
_MIN_SCORE = 0.001
|
| 246 |
+
_MAX_SCORE = 0.999
|
| 247 |
+
score = max(rewards) if rewards else _MIN_SCORE
|
| 248 |
+
score = max(_MIN_SCORE, min(_MAX_SCORE, score))
|
| 249 |
success = score >= SUCCESS_SCORE_THRESHOLD
|
| 250 |
|
| 251 |
except Exception as exc:
|