Commit 2c28868
Parent(s): ad826e7
fix: clamp scores to strict (0,1) exclusive — validator requires no 0.0 or 1.0
Files changed:
- inference.py +4 -4
- server/grader.py +2 -2
- sql_query_reviewer/models.py +1 -1
inference.py
CHANGED

@@ -207,7 +207,7 @@ async def run_episode_async(
         except Exception:
             score = sum(rewards) / max(len(rewards), 1) if rewards else 0.0
 
-        score = max(0.0, min(1.0, score))
+        score = max(0.01, min(0.99, score))
         success = score >= SUCCESS_SCORE_THRESHOLD
 
     except Exception as exc:
@@ -283,7 +283,7 @@ def run_episode_sync(
         except Exception:
             score = sum(rewards) / max(len(rewards), 1) if rewards else 0.0
 
-        score = max(0.0, min(1.0, score))
+        score = max(0.01, min(0.99, score))
         success = score >= SUCCESS_SCORE_THRESHOLD
 
     except Exception as exc:
@@ -305,7 +305,7 @@ async def async_main() -> int:
         print("[DEBUG] WARNING: No API key found (HF_TOKEN / API_KEY / OPENAI_API_KEY)", flush=True)
         for tid in ["easy_001", "medium_001", "hard_001"]:
             log_start(task=tid, env=BENCHMARK, model=MODEL_NAME)
-            log_end(success=False, steps=0, score=0.0, rewards=[])
+            log_end(success=False, steps=0, score=0.01, rewards=[])
         return 1
 
     llm_client = OpenAI(api_key=API_KEY, base_url=API_BASE_URL)
@@ -380,7 +380,7 @@ async def async_main() -> int:
         print("[DEBUG] All connection methods exhausted", flush=True)
         for task_id in task_ids:
             log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)
-            log_end(success=False, steps=0, score=0.0, rewards=[])
+            log_end(success=False, steps=0, score=0.01, rewards=[])
         return 1
 
     except Exception as exc:
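All four hunks apply the same pattern: clamp the episode score into [0.01, 0.99] so it can never be exactly 0.0 or 1.0, which the validator rejects. A minimal sketch of the pattern (clamp_score, SCORE_MIN, and SCORE_MAX are illustrative names, not identifiers from this repo):

# Illustrative helper; the commit inlines max(0.01, min(0.99, score)) instead.
SCORE_MIN, SCORE_MAX = 0.01, 0.99  # strict (0, 1): validator rejects exact 0.0 / 1.0

def clamp_score(score: float) -> float:
    # max() lifts scores below the floor; min() caps scores above the ceiling.
    return max(SCORE_MIN, min(SCORE_MAX, score))

assert clamp_score(0.0) == 0.01  # exact zero is lifted to the floor
assert clamp_score(1.0) == 0.99  # exact one is capped at the ceiling
assert clamp_score(0.5) == 0.5   # interior values pass through unchanged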
server/grader.py
CHANGED

@@ -79,7 +79,7 @@ def grade_episode(
     false_positive_count: int,
 ) -> float:
     if not ground_truth_issues:
-        return 1.0 if false_positive_count == 0 else clamp(1.0 - (0.1 * false_positive_count), 0.0, 1.0)
+        return 0.99 if false_positive_count == 0 else clamp(1.0 - (0.1 * false_positive_count), 0.01, 0.99)
 
     total_severity = sum(issue.severity for issue in ground_truth_issues)
     found_severity = sum(issue.severity for issue in ground_truth_issues if issue.id in found_issue_ids)
@@ -87,5 +87,5 @@ def grade_episode(
     efficiency_bonus = max(0.0, 0.1 * (1 - (total_steps / max(max_steps, 1))))
     false_positive_penalty = 0.05 * false_positive_count
     final_score = coverage_score + efficiency_bonus - false_positive_penalty
-    return clamp(final_score, 0.0, 1.0)
+    return clamp(final_score, 0.01, 0.99)
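grade_episode leans on a clamp helper that isn't part of this diff; assuming the conventional (value, lo, hi) signature, a sketch of what it presumably looks like:

def clamp(value: float, lo: float, hi: float) -> float:
    # Assumed shape of the helper called above; its real definition lives
    # elsewhere in server/grader.py and is not shown in this commit.
    return max(lo, min(hi, value))

# With the new bounds, an episode with no ground-truth issues and many false
# positives now bottoms out at 0.01 instead of 0.0:
print(clamp(1.0 - (0.1 * 15), 0.01, 0.99))  # 0.01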
sql_query_reviewer/models.py
CHANGED

@@ -85,7 +85,7 @@ class SQLReviewState(StrictModel):
     approved: bool = False
     fixes_suggested: list[str] = Field(default_factory=list)
     false_positive_count: int = Field(default=0, ge=0)
-    final_score: float | None = Field(default=None, ge=0.0, le=1.0)
+    final_score: float | None = Field(default=None, gt=0.0, lt=1.0)
 
 
 class StepResult(StrictModel):
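On the model side, gt/lt make the Field bounds exclusive, so Pydantic rejects a final_score of exactly 0.0 or 1.0 at validation time rather than letting it through. A self-contained sketch of the behavior (assuming Pydantic v2; Demo is a hypothetical stand-in for SQLReviewState):

from pydantic import BaseModel, Field, ValidationError

class Demo(BaseModel):  # hypothetical stand-in for SQLReviewState
    final_score: float | None = Field(default=None, gt=0.0, lt=1.0)

Demo(final_score=0.99)     # accepted: strictly inside (0, 1)
try:
    Demo(final_score=1.0)  # rejected: lt=1.0 is an exclusive bound
except ValidationError as err:
    print(err.errors()[0]["type"])  # -> "less_than"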