Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files
- models.py +11 -6
- server/python_env_environment.py +12 -12
models.py
CHANGED
|
@@ -178,23 +178,28 @@ class RewardSummary(BaseModel):
|
|
| 178 |
class PythonReviewAction(Action):
|
| 179 |
"""Structured review action emitted by a model or trainer."""
|
| 180 |
|
| 181 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
line_number: Optional[int] = Field(default=None, ge=1)
|
| 183 |
issue_type: Optional[IssueType] = None
|
| 184 |
severity: Optional[Severity] = None
|
| 185 |
comment: Optional[str] = None
|
| 186 |
suggestion: Optional[str] = None
|
| 187 |
question: Optional[str] = None
|
| 188 |
-
|
| 189 |
-
# Template compatibility
|
| 190 |
-
operation: str = "submit_findings"
|
| 191 |
-
findings: List[ReviewFinding] = Field(default_factory=list)
|
| 192 |
-
patched_code: Optional[str] = None
|
| 193 |
|
| 194 |
@model_validator(mode="after")
|
| 195 |
def validate_action_shape(self) -> "PythonReviewAction":
|
| 196 |
"""Require the right fields for each action type."""
|
| 197 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
if self.action_type == ActionType.ADD_COMMENT:
|
| 199 |
missing = []
|
| 200 |
if self.line_number is None:
|
|
|
|
| 178 |
class PythonReviewAction(Action):
|
| 179 |
"""Structured review action emitted by a model or trainer."""
|
| 180 |
|
| 181 |
+
# Primary UI Fields (matches CodingEnv style)
|
| 182 |
+
operation: str = Field(default="submit_findings", description="The operation to perform.")
|
| 183 |
+
code: Optional[str] = Field(default=None, description="The fixed source code.")
|
| 184 |
+
findings: List[ReviewFinding] = Field(default_factory=list, description="The structured findings list.")
|
| 185 |
+
|
| 186 |
+
# Optional Review Fields (for benchmark compatibility)
|
| 187 |
+
action_type: ActionType = ActionType.ADD_COMMENT
|
| 188 |
line_number: Optional[int] = Field(default=None, ge=1)
|
| 189 |
issue_type: Optional[IssueType] = None
|
| 190 |
severity: Optional[Severity] = None
|
| 191 |
comment: Optional[str] = None
|
| 192 |
suggestion: Optional[str] = None
|
| 193 |
question: Optional[str] = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
|
| 195 |
@model_validator(mode="after")
|
| 196 |
def validate_action_shape(self) -> "PythonReviewAction":
|
| 197 |
"""Require the right fields for each action type."""
|
| 198 |
|
| 199 |
+
# Skip the per-action-type field checks below unless `operation` equals ActionType.ADD_COMMENT; template-style operations (e.g. the default "submit_findings") return here without validation.
|
| 200 |
+
if self.operation != ActionType.ADD_COMMENT:
|
| 201 |
+
return self
|
| 202 |
+
|
| 203 |
if self.action_type == ActionType.ADD_COMMENT:
|
| 204 |
missing = []
|
| 205 |
if self.line_number is None:
|
server/python_env_environment.py
CHANGED
|
@@ -64,7 +64,7 @@ class ReviewTask:
|
|
| 64 |
descriptor: TaskDescriptor
|
| 65 |
references: tuple[ReferenceFinding, ...]
|
| 66 |
hint: str
|
| 67 |
-
|
| 68 |
|
| 69 |
|
| 70 |
TASK_BANK: Dict[str, ReviewTask] = {
|
|
@@ -96,7 +96,7 @@ TASK_BANK: Dict[str, ReviewTask] = {
|
|
| 96 |
),
|
| 97 |
),
|
| 98 |
hint="Look for state that survives between separate function calls.",
|
| 99 |
-
|
| 100 |
"def add_tag(tag, tags=None):\n"
|
| 101 |
" if tags is None:\n"
|
| 102 |
" tags = []\n"
|
|
@@ -132,7 +132,7 @@ TASK_BANK: Dict[str, ReviewTask] = {
|
|
| 132 |
),
|
| 133 |
),
|
| 134 |
hint="Check how external commands are invoked and whether user input is escaped.",
|
| 135 |
-
|
| 136 |
"import subprocess\n\n"
|
| 137 |
"def run_backup(path):\n"
|
| 138 |
" subprocess.run([\"tar\", \"-czf\", \"backup.tgz\", path], check=True)\n"
|
|
@@ -184,7 +184,7 @@ TASK_BANK: Dict[str, ReviewTask] = {
|
|
| 184 |
),
|
| 185 |
),
|
| 186 |
hint="Consider what happens to the final error after the retry loop finishes.",
|
| 187 |
-
|
| 188 |
"import time\n\n"
|
| 189 |
"def fetch_with_retry(client, url, retries=3):\n"
|
| 190 |
" last_error = None\n"
|
|
@@ -274,12 +274,12 @@ class PythonEnvironment(Environment[PythonAction, PythonObservation, State]):
|
|
| 274 |
if operation == "request_hint":
|
| 275 |
self._hints_used += 1
|
| 276 |
feedback = self._current_task.hint
|
| 277 |
-
evaluation = self._evaluate(self._submitted_findings, action.
|
| 278 |
reward = evaluation.score
|
| 279 |
else:
|
| 280 |
if action.findings:
|
| 281 |
self._submitted_findings.extend(action.findings)
|
| 282 |
-
evaluation = self._evaluate(self._submitted_findings, action.
|
| 283 |
reward = evaluation.score
|
| 284 |
if operation == "finalize":
|
| 285 |
done = True
|
|
@@ -307,7 +307,7 @@ class PythonEnvironment(Environment[PythonAction, PythonObservation, State]):
|
|
| 307 |
feedback=feedback,
|
| 308 |
reward=reward,
|
| 309 |
done=done,
|
| 310 |
-
|
| 311 |
)
|
| 312 |
|
| 313 |
def _build_observation(
|
|
@@ -316,10 +316,10 @@ class PythonEnvironment(Environment[PythonAction, PythonObservation, State]):
|
|
| 316 |
feedback: str,
|
| 317 |
reward: float,
|
| 318 |
done: bool,
|
| 319 |
-
|
| 320 |
) -> PythonObservation:
|
| 321 |
assert self._current_task is not None
|
| 322 |
-
evaluation = self._evaluate(self._submitted_findings,
|
| 323 |
attempts_remaining = max(
|
| 324 |
self._max_steps() - self._state.step_count,
|
| 325 |
0,
|
|
@@ -345,7 +345,7 @@ class PythonEnvironment(Environment[PythonAction, PythonObservation, State]):
|
|
| 345 |
def _evaluate(
|
| 346 |
self,
|
| 347 |
findings: Iterable[ReviewFinding],
|
| 348 |
-
|
| 349 |
) -> TaskEvaluation:
|
| 350 |
assert self._current_task is not None
|
| 351 |
|
|
@@ -372,9 +372,9 @@ class PythonEnvironment(Environment[PythonAction, PythonObservation, State]):
|
|
| 372 |
weighted_recall = min(matched_weight / total_weight, 1.0)
|
| 373 |
|
| 374 |
patch_score = 0.0
|
| 375 |
-
if self._current_task.
|
| 376 |
patch_score = float(
|
| 377 |
-
_normalize_code(
|
| 378 |
)
|
| 379 |
|
| 380 |
raw_score = (
|
|
|
|
| 64 |
descriptor: TaskDescriptor
|
| 65 |
references: tuple[ReferenceFinding, ...]
|
| 66 |
hint: str
|
| 67 |
+
code: Optional[str] = None
|
| 68 |
|
| 69 |
|
| 70 |
TASK_BANK: Dict[str, ReviewTask] = {
|
|
|
|
| 96 |
),
|
| 97 |
),
|
| 98 |
hint="Look for state that survives between separate function calls.",
|
| 99 |
+
code=(
|
| 100 |
"def add_tag(tag, tags=None):\n"
|
| 101 |
" if tags is None:\n"
|
| 102 |
" tags = []\n"
|
|
|
|
| 132 |
),
|
| 133 |
),
|
| 134 |
hint="Check how external commands are invoked and whether user input is escaped.",
|
| 135 |
+
code=(
|
| 136 |
"import subprocess\n\n"
|
| 137 |
"def run_backup(path):\n"
|
| 138 |
" subprocess.run([\"tar\", \"-czf\", \"backup.tgz\", path], check=True)\n"
|
|
|
|
| 184 |
),
|
| 185 |
),
|
| 186 |
hint="Consider what happens to the final error after the retry loop finishes.",
|
| 187 |
+
code=(
|
| 188 |
"import time\n\n"
|
| 189 |
"def fetch_with_retry(client, url, retries=3):\n"
|
| 190 |
" last_error = None\n"
|
|
|
|
| 274 |
if operation == "request_hint":
|
| 275 |
self._hints_used += 1
|
| 276 |
feedback = self._current_task.hint
|
| 277 |
+
evaluation = self._evaluate(self._submitted_findings, action.code)
|
| 278 |
reward = evaluation.score
|
| 279 |
else:
|
| 280 |
if action.findings:
|
| 281 |
self._submitted_findings.extend(action.findings)
|
| 282 |
+
evaluation = self._evaluate(self._submitted_findings, action.code)
|
| 283 |
reward = evaluation.score
|
| 284 |
if operation == "finalize":
|
| 285 |
done = True
|
|
|
|
| 307 |
feedback=feedback,
|
| 308 |
reward=reward,
|
| 309 |
done=done,
|
| 310 |
+
code=action.code,
|
| 311 |
)
|
| 312 |
|
| 313 |
def _build_observation(
|
|
|
|
| 316 |
feedback: str,
|
| 317 |
reward: float,
|
| 318 |
done: bool,
|
| 319 |
+
code: Optional[str] = None,
|
| 320 |
) -> PythonObservation:
|
| 321 |
assert self._current_task is not None
|
| 322 |
+
evaluation = self._evaluate(self._submitted_findings, code)
|
| 323 |
attempts_remaining = max(
|
| 324 |
self._max_steps() - self._state.step_count,
|
| 325 |
0,
|
|
|
|
| 345 |
def _evaluate(
|
| 346 |
self,
|
| 347 |
findings: Iterable[ReviewFinding],
|
| 348 |
+
code: Optional[str],
|
| 349 |
) -> TaskEvaluation:
|
| 350 |
assert self._current_task is not None
|
| 351 |
|
|
|
|
| 372 |
weighted_recall = min(matched_weight / total_weight, 1.0)
|
| 373 |
|
| 374 |
patch_score = 0.0
|
| 375 |
+
if self._current_task.code and code:
|
| 376 |
patch_score = float(
|
| 377 |
+
_normalize_code(code) == _normalize_code(self._current_task.code)
|
| 378 |
)
|
| 379 |
|
| 380 |
raw_score = (
|