hellinferno committed on
Commit
2c28868
·
1 Parent(s): ad826e7

fix: clamp scores to strict (0,1) exclusive — validator requires no 0.0 or 1.0

Browse files
inference.py CHANGED
@@ -207,7 +207,7 @@ async def run_episode_async(
207
  except Exception:
208
  score = sum(rewards) / max(len(rewards), 1) if rewards else 0.0
209
 
210
- score = max(0.0, min(1.0, score))
211
  success = score >= SUCCESS_SCORE_THRESHOLD
212
 
213
  except Exception as exc:
@@ -283,7 +283,7 @@ def run_episode_sync(
283
  except Exception:
284
  score = sum(rewards) / max(len(rewards), 1) if rewards else 0.0
285
 
286
- score = max(0.0, min(1.0, score))
287
  success = score >= SUCCESS_SCORE_THRESHOLD
288
 
289
  except Exception as exc:
@@ -305,7 +305,7 @@ async def async_main() -> int:
305
  print("[DEBUG] WARNING: No API key found (HF_TOKEN / API_KEY / OPENAI_API_KEY)", flush=True)
306
  for tid in ["easy_001", "medium_001", "hard_001"]:
307
  log_start(task=tid, env=BENCHMARK, model=MODEL_NAME)
308
- log_end(success=False, steps=0, score=0.0, rewards=[])
309
  return 1
310
 
311
  llm_client = OpenAI(api_key=API_KEY, base_url=API_BASE_URL)
@@ -380,7 +380,7 @@ async def async_main() -> int:
380
  print("[DEBUG] All connection methods exhausted", flush=True)
381
  for task_id in task_ids:
382
  log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)
383
- log_end(success=False, steps=0, score=0.0, rewards=[])
384
  return 1
385
 
386
  except Exception as exc:
 
207
  except Exception:
208
  score = sum(rewards) / max(len(rewards), 1) if rewards else 0.0
209
 
210
+ score = max(0.01, min(0.99, score))
211
  success = score >= SUCCESS_SCORE_THRESHOLD
212
 
213
  except Exception as exc:
 
283
  except Exception:
284
  score = sum(rewards) / max(len(rewards), 1) if rewards else 0.0
285
 
286
+ score = max(0.01, min(0.99, score))
287
  success = score >= SUCCESS_SCORE_THRESHOLD
288
 
289
  except Exception as exc:
 
305
  print("[DEBUG] WARNING: No API key found (HF_TOKEN / API_KEY / OPENAI_API_KEY)", flush=True)
306
  for tid in ["easy_001", "medium_001", "hard_001"]:
307
  log_start(task=tid, env=BENCHMARK, model=MODEL_NAME)
308
+ log_end(success=False, steps=0, score=0.01, rewards=[])
309
  return 1
310
 
311
  llm_client = OpenAI(api_key=API_KEY, base_url=API_BASE_URL)
 
380
  print("[DEBUG] All connection methods exhausted", flush=True)
381
  for task_id in task_ids:
382
  log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)
383
+ log_end(success=False, steps=0, score=0.01, rewards=[])
384
  return 1
385
 
386
  except Exception as exc:
server/grader.py CHANGED
@@ -79,7 +79,7 @@ def grade_episode(
79
  false_positive_count: int,
80
  ) -> float:
81
  if not ground_truth_issues:
82
- return 1.0 if false_positive_count == 0 else clamp(1.0 - (0.1 * false_positive_count), 0.0, 1.0)
83
 
84
  total_severity = sum(issue.severity for issue in ground_truth_issues)
85
  found_severity = sum(issue.severity for issue in ground_truth_issues if issue.id in found_issue_ids)
@@ -87,5 +87,5 @@ def grade_episode(
87
  efficiency_bonus = max(0.0, 0.1 * (1 - (total_steps / max(max_steps, 1))))
88
  false_positive_penalty = 0.05 * false_positive_count
89
  final_score = coverage_score + efficiency_bonus - false_positive_penalty
90
- return clamp(final_score, 0.0, 1.0)
91
 
 
79
  false_positive_count: int,
80
  ) -> float:
81
  if not ground_truth_issues:
82
+ return 0.99 if false_positive_count == 0 else clamp(1.0 - (0.1 * false_positive_count), 0.01, 0.99)
83
 
84
  total_severity = sum(issue.severity for issue in ground_truth_issues)
85
  found_severity = sum(issue.severity for issue in ground_truth_issues if issue.id in found_issue_ids)
 
87
  efficiency_bonus = max(0.0, 0.1 * (1 - (total_steps / max(max_steps, 1))))
88
  false_positive_penalty = 0.05 * false_positive_count
89
  final_score = coverage_score + efficiency_bonus - false_positive_penalty
90
+ return clamp(final_score, 0.01, 0.99)
91
 
sql_query_reviewer/models.py CHANGED
@@ -85,7 +85,7 @@ class SQLReviewState(StrictModel):
85
  approved: bool = False
86
  fixes_suggested: list[str] = Field(default_factory=list)
87
  false_positive_count: int = Field(default=0, ge=0)
88
- final_score: float | None = Field(default=None, ge=0.0, le=1.0)
89
 
90
 
91
  class StepResult(StrictModel):
 
85
  approved: bool = False
86
  fixes_suggested: list[str] = Field(default_factory=list)
87
  false_positive_count: int = Field(default=0, ge=0)
88
+ final_score: float | None = Field(default=None, gt=0.0, lt=1.0)
89
 
90
 
91
  class StepResult(StrictModel):