"""Shared models for the Python code-review OpenEnv benchmark."""

from __future__ import annotations

from enum import Enum
from typing import Dict, List, Optional

from pydantic import BaseModel, Field, model_validator

from openenv.core.env_server.types import Action, Observation, State

class Difficulty(str, Enum):
    EASY = "easy"
    MEDIUM = "medium"
    HARD = "hard"


class ActionType(str, Enum):
    ADD_COMMENT = "ADD_COMMENT"
    APPROVE = "APPROVE"
    REQUEST_CHANGES = "REQUEST_CHANGES"
    ASK_CONTEXT = "ASK_CONTEXT"
    SKIP_LINE = "SKIP_LINE"


class IssueType(str, Enum):
    STYLE = "STYLE"
    LOGIC = "LOGIC"
    SECURITY = "SECURITY"
    PERFORMANCE = "PERFORMANCE"
    DOCS = "DOCS"


class Severity(str, Enum):
    LOW = "LOW"
    MEDIUM = "MEDIUM"
    HIGH = "HIGH"
    CRITICAL = "CRITICAL"

class GoldIssue(BaseModel):
    """Hidden benchmark annotation for one issue in a snippet."""

    issue_id: str
    line: int = Field(..., ge=1)
    issue_type: IssueType
    severity: Severity
    description: str
    required: bool = True
    explanation_keywords: List[str] = Field(default_factory=list)
    fix_keywords: List[str] = Field(default_factory=list)
    owasp_category: Optional[str] = None
    owasp_keywords: List[str] = Field(default_factory=list)
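
# Illustrative only: one gold issue as it might appear in a snippet's JSON
# file. Field names mirror `GoldIssue` above; the values are invented.
#
#     {
#         "issue_id": "sqli-01",
#         "line": 17,
#         "issue_type": "SECURITY",
#         "severity": "CRITICAL",
#         "description": "User input concatenated into a SQL query.",
#         "required": true,
#         "fix_keywords": ["parameterized", "placeholder"],
#         "owasp_category": "A03:2021-Injection"
#     }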

class ReviewComment(BaseModel):
    """Stored review action visible to the agent in `review_history`."""

    step_index: int = Field(..., ge=1)
    action_type: ActionType
    line_number: Optional[int] = Field(default=None, ge=1)
    issue_type: Optional[IssueType] = None
    severity: Optional[Severity] = None
    comment: Optional[str] = None
    suggestion: Optional[str] = None
    question: Optional[str] = None
    matched_issue_ids: List[str] = Field(default_factory=list)
    reward_delta: float = 0.0


class CodeReviewSnippet(BaseModel):
    """Benchmark sample loaded from JSON."""

    snippet_id: str
    filename: str
    code: str
    context: Optional[str] = None
    diff: Optional[str] = None
    gold_issues: List[GoldIssue]
    must_approve: bool = False
    must_reject: bool = True


class TaskMetadata(BaseModel):
    """Visible task-family metadata."""

    task_id: str
    name: str
    difficulty: Difficulty
    description: str
    snippet_count: int = Field(..., ge=0)
    max_steps: int = Field(..., ge=1)
    min_score: float = Field(default=0.0, ge=0.0, le=1.0)
    max_score: float = Field(default=1.0, ge=0.0, le=1.0)


class ReviewFinding(BaseModel):
    """Compatibility shim for earlier template-derived environment code."""

    title: str = ""
    line: Optional[int] = Field(default=None, ge=1)
    category: str = "bug"
    severity: str = "warning"
    rationale: str = ""
    recommendation: Optional[str] = None
    rule_id: Optional[str] = None


class TaskDescriptor(BaseModel):
    """Compatibility shim for earlier template-derived environment code."""

    task_id: str
    difficulty: str
    title: str
    objective: str
    code: str
    max_steps: int = Field(..., ge=1)
    success_threshold: float = Field(default=0.0, ge=0.0, le=1.0)


class TaskEvaluation(BaseModel):
    """Compatibility shim for earlier template-derived environment code."""

    matched_reference_ids: List[str] = Field(default_factory=list)
    matched_findings: int = 0
    total_findings: int = 0
    false_positives: int = 0
    duplicate_findings: int = 0
    weighted_recall: float = 0.0
    patch_score: float = 0.0
    score: float = 0.0
    passed: bool = False

class PythonEnvConfig(BaseModel):
    """Environment configuration used by the benchmark runtime."""

    task_order: List[str] = Field(
        default_factory=lambda: ["task_easy", "task_medium", "task_hard"]
    )
    max_steps_per_task: int = Field(default=25, ge=1, le=100)
    max_history_entries: int = Field(default=200, ge=1, le=1000)
    rotate_tasks: bool = True

    # Evaluation parameters
    patch_bonus_multiplier: float = 0.2
    false_positive_penalty: float = 0.05
    duplicate_penalty: float = 0.02
    hint_penalty: float = 0.1
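
# A minimal sketch of overriding the defaults; the values are illustrative,
# not tuned recommendations:
#
#     config = PythonEnvConfig(
#         task_order=["task_hard"],
#         max_steps_per_task=40,
#         false_positive_penalty=0.1,
#     )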

class EpisodeMetrics(BaseModel):
    """Current episode metrics for UI, evaluation, and RL logging."""

    precision: float = Field(default=0.0, ge=0.0, le=1.0)
    recall: float = Field(default=0.0, ge=0.0, le=1.0)
    f1: float = Field(default=0.0, ge=0.0, le=1.0)
    true_positives: int = Field(default=0, ge=0)
    false_positives: int = Field(default=0, ge=0)
    missed_issues: int = Field(default=0, ge=0)
    required_found: int = Field(default=0, ge=0)
    required_total: int = Field(default=0, ge=0)
    bonus_found: int = Field(default=0, ge=0)
    duplicate_comments: int = Field(default=0, ge=0)
    context_requests: int = Field(default=0, ge=0)
    skipped_clean_lines: int = Field(default=0, ge=0)
    skipped_issue_lines: int = Field(default=0, ge=0)
    current_score: float = Field(default=0.0, ge=0.0, le=1.0)
    cumulative_reward: float = 0.0
    breakdown: Dict[str, float] = Field(default_factory=dict)
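
# Reference note (not enforced by this model): the fields above are normally
# related by the standard precision/recall definitions, taking `missed_issues`
# as the false-negative count and each ratio as 0.0 when its denominator is 0:
#
#     precision = true_positives / (true_positives + false_positives)
#     recall    = true_positives / (true_positives + missed_issues)
#     f1        = 2 * precision * recall / (precision + recall)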

class RewardSummary(BaseModel):
    """Reward details from the most recent step."""

    step_reward: float = 0.0
    cumulative_reward: float = 0.0
    breakdown: Dict[str, float] = Field(default_factory=dict)
    false_positives: int = Field(default=0, ge=0)
    true_positives: int = Field(default=0, ge=0)
    missed_issues: int = Field(default=0, ge=0)


class PythonReviewAction(Action):
    """Structured review action emitted by a model or trainer."""

    operation: str = Field(default="ADD_COMMENT", description="The operation to perform.")
    findings: List[ReviewFinding] = Field(default_factory=list, description="The findings list.")
    patched_code: Optional[str] = Field(default=None, description="The fixed source code.")
    action_type: ActionType = Field(
        default=ActionType.ADD_COMMENT,
        description=(
            "Choose the review action: comment on a line, skip a clean line, "
            "ask for context, approve, or request changes."
        ),
    )
    line_number: Optional[int] = Field(
        default=None,
        ge=1,
        description="Required for ADD_COMMENT and SKIP_LINE. Enter the code line number you are acting on.",
    )
    issue_type: Optional[IssueType] = Field(
        default=None,
        description="Required for ADD_COMMENT. Pick the issue category for the selected line.",
    )
    severity: Optional[Severity] = Field(
        default=None,
        description="Required for ADD_COMMENT. Pick how serious the issue is.",
    )
    comment: Optional[str] = Field(
        default=None,
        description="Required for ADD_COMMENT. Also used for ASK_CONTEXT if you want to ask a question in plain text.",
    )
    suggestion: Optional[str] = None
    question: Optional[str] = None

    @classmethod
    def model_json_schema(cls, *args, **kwargs):
        """Trim legacy fields from the generated UI schema."""
        schema = super().model_json_schema(*args, **kwargs)
        properties = schema.get("properties", {})
        visible_fields = {
            "action_type",
            "line_number",
            "issue_type",
            "severity",
            "comment",
        }
        schema["properties"] = {
            name: value for name, value in properties.items() if name in visible_fields
        }
        schema["required"] = [
            name for name in schema.get("required", []) if name in visible_fields
        ]
        return schema
    @model_validator(mode="after")
    def validate_action_shape(self) -> "PythonReviewAction":
        """Require the right fields for each action type."""
        # Legacy template actions set `operation` to strings such as
        # `submit_findings`; only default benchmark actions are validated
        # against `action_type`.
        if self.operation != "ADD_COMMENT":
            return self
        if self.action_type == ActionType.ADD_COMMENT:
            missing = []
            if self.line_number is None:
                missing.append("line_number")
            if self.issue_type is None:
                missing.append("issue_type")
            if self.severity is None:
                missing.append("severity")
            if not (self.comment or "").strip():
                missing.append("comment")
            if missing:
                raise ValueError("ADD_COMMENT requires: " + ", ".join(missing))
        elif self.action_type == ActionType.SKIP_LINE:
            if self.line_number is None:
                raise ValueError("SKIP_LINE requires line_number")
        elif self.action_type == ActionType.ASK_CONTEXT:
            if not (self.question or self.comment or "").strip():
                raise ValueError("ASK_CONTEXT requires question or comment")
        elif self.action_type in {ActionType.APPROVE, ActionType.REQUEST_CHANGES}:
            noisy_fields = {
                "line_number": self.line_number,
                "issue_type": self.issue_type,
                "severity": self.severity,
                "comment": self.comment,
                "suggestion": self.suggestion,
                "question": self.question,
            }
            populated = [
                name for name, value in noisy_fields.items() if value not in (None, "")
            ]
            if populated:
                raise ValueError(
                    f"{self.action_type.value} does not accept extra fields: {', '.join(populated)}"
                )
        return self

class PythonReviewObservation(Observation):
    """Observation returned by reset/step, including trainer-visible metrics."""

    snippet_id: str = ""
    code: str = ""
    filename: str = ""
    language: str = "python"
    context: Optional[str] = None
    diff: Optional[str] = None
    line_count: int = Field(default=0, ge=0)
    current_step: int = Field(default=0, ge=0)
    max_steps: int = Field(default=1, ge=1)
    task_id: str = ""
    review_history: List[ReviewComment] = Field(default_factory=list)
    lines: List[str] = Field(default_factory=list)
    reward_summary: RewardSummary = Field(default_factory=RewardSummary)
    metrics: EpisodeMetrics = Field(default_factory=EpisodeMetrics)
    feedback: str = ""
    review_time_ms: float = 0.0

    # Template compatibility
    task: Optional[TaskDescriptor] = None
    instructions: str = ""
    submitted_findings: List[ReviewFinding] = Field(default_factory=list)
    hints_used: int = 0
    attempts_remaining: int = 0
    evaluation: Optional[TaskEvaluation] = None
    score: float = 0.0


class PythonReviewState(State):
    """Full server-side state exposed by `/state`."""

    task_id: Optional[str] = None
    difficulty: Optional[Difficulty] = None
    snippet_id: Optional[str] = None
    current_step: int = Field(default=0, ge=0)
    max_steps: int = Field(default=0, ge=0)
    done: bool = False
    filename: Optional[str] = None
    review_history: List[ReviewComment] = Field(default_factory=list)
    metrics: EpisodeMetrics = Field(default_factory=EpisodeMetrics)
    last_feedback: str = ""


class TaskListResponse(BaseModel):
    tasks: List[TaskMetadata] = Field(default_factory=list)


class MetricsResponse(BaseModel):
    task_id: Optional[str] = None
    snippet_id: Optional[str] = None
    done: bool = False
    metrics: EpisodeMetrics = Field(default_factory=EpisodeMetrics)


class HealthResponse(BaseModel):
    status: str = "ok"
    environment: str = "python_code_review_env"
    task_count: int = Field(default=0, ge=0)
    active_task_id: Optional[str] = None
    active_snippet_id: Optional[str] = None
    active_episode_id: Optional[str] = None


# Backwards-compatible aliases for earlier template-derived names.
PythonAction = PythonReviewAction
PythonObservation = PythonReviewObservation
PythonState = PythonReviewState
CodeReviewAction = PythonReviewAction
CodeReviewObservation = PythonReviewObservation
CodeReviewConfig = PythonEnvConfig
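
# Smoke-test sketch of `validate_action_shape`; run the module directly to try
# it. The example values are illustrative, and it assumes the `Action` base
# class adds no required fields of its own.
if __name__ == "__main__":
    from pydantic import ValidationError

    # A well-formed ADD_COMMENT passes: line, category, severity, and comment.
    ok = PythonReviewAction(
        action_type=ActionType.ADD_COMMENT,
        line_number=12,
        issue_type=IssueType.SECURITY,
        severity=Severity.HIGH,
        comment="Query is built by string concatenation; use parameters.",
    )
    print(ok.action_type)

    # APPROVE must stay bare: any populated extra field raises a ValidationError.
    try:
        PythonReviewAction(action_type=ActionType.APPROVE, comment="LGTM")
    except ValidationError as exc:
        print(exc)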