Spaces:
Running
Running
Commit ·
ee547a6
1
Parent(s): f63920a
chore: Apply Bug #2 and Bug #3 fixes: strict min/max bound clamping to prevent out-of-range scores, and fix Windows encoding
Browse files
- README.md +1 -1
- inference.py +2 -1
- server/app.py +1 -1
README.md
CHANGED
|
@@ -233,7 +233,7 @@ entropyenv/
|
|
| 233 |
|
| 234 |
| Model | Provider | sec_easy | sec_med | sec_hard | dep_easy | dep_med | dep_hard | cli_easy | cli_med | cli_hard | **Avg** |
|
| 235 |
|-------|----------|:--------:|:-------:|:--------:|:--------:|:-------:|:--------:|:--------:|:-------:|:--------:|:-------:|
|
| 236 |
-
| *
|
| 237 |
|
| 238 |
**Scoring formula:** `score = 0.60 × max(step_rewards) + 0.40 × mean(step_rewards)`, clamped to `[0.01, 0.99]`
|
| 239 |
|
|
|
|
| 233 |
|
| 234 |
| Model | Provider | sec_easy | sec_med | sec_hard | dep_easy | dep_med | dep_hard | cli_easy | cli_med | cli_hard | **Avg** |
|
| 235 |
|-------|----------|:--------:|:-------:|:--------:|:--------:|:-------:|:--------:|:--------:|:-------:|:--------:|:-------:|
|
| 236 |
+
| *(Run `python unnecessary/run_14_models.py` to auto-populate this table)* | | | | | | | | | | | |
|
| 237 |
|
| 238 |
**Scoring formula:** `score = 0.60 × max(step_rewards) + 0.40 × mean(step_rewards)`, clamped to `[0.01, 0.99]`
|
| 239 |
|
inference.py
CHANGED
|
@@ -337,7 +337,8 @@ def run_task(client: OpenAI, task_id: str) -> tuple:
|
|
| 337 |
rewards.append(0.01)
|
| 338 |
break
|
| 339 |
|
| 340 |
-
|
|
|
|
| 341 |
done = bool(step_data.get("done", False))
|
| 342 |
obs = step_data.get("observation", step_data)
|
| 343 |
step_error = step_data.get("error") or error_msg
|
|
|
|
| 337 |
rewards.append(0.01)
|
| 338 |
break
|
| 339 |
|
| 340 |
+
raw_reward = float(step_data.get("reward", 0.01))
|
| 341 |
+
reward = round(min(max(raw_reward, 0.01), 0.99), 4)
|
| 342 |
done = bool(step_data.get("done", False))
|
| 343 |
obs = step_data.get("observation", step_data)
|
| 344 |
step_error = step_data.get("error") or error_msg
|
server/app.py
CHANGED
|
@@ -172,7 +172,7 @@ async def step(request: Request):
|
|
| 172 |
if not valid:
|
| 173 |
last_r = 0.01
|
| 174 |
if session.history:
|
| 175 |
-
last_r = max(0.01, session.history[-1].get('reward', 0.01))
|
| 176 |
return {
|
| 177 |
'reward': last_r,
|
| 178 |
'done': False,
|
|
|
|
| 172 |
if not valid:
|
| 173 |
last_r = 0.01
|
| 174 |
if session.history:
|
| 175 |
+
last_r = min(max(0.01, float(session.history[-1].get('reward', 0.01))), 0.99)
|
| 176 |
return {
|
| 177 |
'reward': last_r,
|
| 178 |
'done': False,
|