Spaces:
Sleeping
Sleeping
| """Shared models for the Python code-review OpenEnv benchmark.""" | |
| from __future__ import annotations | |
| from enum import Enum | |
| from typing import Dict, List, Optional | |
| from pydantic import BaseModel, Field, model_validator | |
| from openenv.core.env_server.types import Action, Observation, State | |
class Difficulty(str, Enum):
    """Difficulty tier of a task family (see `TaskMetadata.difficulty`)."""

    EASY = "easy"
    MEDIUM = "medium"
    HARD = "hard"
class ActionType(str, Enum):
    """Review actions an agent can take in one step.

    See `PythonReviewAction.validate_action_shape` for the companion fields
    each action requires.
    """

    ADD_COMMENT = "ADD_COMMENT"          # flag an issue on a specific line
    APPROVE = "APPROVE"                  # verdict: approve the snippet
    REQUEST_CHANGES = "REQUEST_CHANGES"  # verdict: request changes
    ASK_CONTEXT = "ASK_CONTEXT"          # ask a free-text question
    SKIP_LINE = "SKIP_LINE"              # declare a specific line clean
class IssueType(str, Enum):
    """Issue category labels shared by gold annotations and review comments."""

    STYLE = "STYLE"
    LOGIC = "LOGIC"
    SECURITY = "SECURITY"
    PERFORMANCE = "PERFORMANCE"
    DOCS = "DOCS"
class Severity(str, Enum):
    """Issue severity levels, declared from least to most serious."""

    LOW = "LOW"
    MEDIUM = "MEDIUM"
    HIGH = "HIGH"
    CRITICAL = "CRITICAL"
class GoldIssue(BaseModel):
    """Hidden benchmark annotation for one issue in a snippet."""

    # ID referenced back from `ReviewComment.matched_issue_ids`.
    issue_id: str
    # 1-based line number where the issue occurs.
    line: int = Field(..., ge=1)
    issue_type: IssueType
    severity: Severity
    description: str
    # NOTE(review): required issues presumably drive EpisodeMetrics.required_*
    # while non-required ones feed bonus_found — confirm against the scorer.
    required: bool = True
    # Keyword lists, presumably used to fuzzy-match the agent's free text
    # (explanation, fix suggestion, OWASP terminology) — verify in the scorer.
    explanation_keywords: List[str] = Field(default_factory=list)
    fix_keywords: List[str] = Field(default_factory=list)
    owasp_category: Optional[str] = None
    owasp_keywords: List[str] = Field(default_factory=list)
class ReviewComment(BaseModel):
    """Stored review action visible to the agent in `review_history`."""

    # 1-based index of the step that produced this entry.
    step_index: int = Field(..., ge=1)
    action_type: ActionType
    # The following fields mirror PythonReviewAction's per-action fields;
    # they stay None for actions that do not use them.
    line_number: Optional[int] = Field(default=None, ge=1)
    issue_type: Optional[IssueType] = None
    severity: Optional[Severity] = None
    comment: Optional[str] = None
    suggestion: Optional[str] = None
    question: Optional[str] = None
    # `GoldIssue.issue_id` values this comment was credited with finding.
    matched_issue_ids: List[str] = Field(default_factory=list)
    # Reward contribution of this single action.
    reward_delta: float = 0.0
class CodeReviewSnippet(BaseModel):
    """Benchmark sample loaded from JSON."""

    snippet_id: str
    filename: str
    # Full source text the agent reviews.
    code: str
    # Optional extra material shown alongside the code.
    context: Optional[str] = None
    diff: Optional[str] = None
    # Hidden annotations that scoring matches review comments against.
    gold_issues: List[GoldIssue]
    # Expected terminal verdict. NOTE(review): must_reject defaults to True,
    # presumably because most snippets contain issues — confirm in the data.
    must_approve: bool = False
    must_reject: bool = True
class TaskMetadata(BaseModel):
    """Visible task-family metadata."""

    task_id: str
    name: str
    difficulty: Difficulty
    description: str
    # Number of snippets in the family (0 permitted, e.g. placeholder tasks).
    snippet_count: int = Field(..., ge=0)
    # Step budget per episode for this task.
    max_steps: int = Field(..., ge=1)
    # Score band for the task; both values are fractions in [0, 1].
    min_score: float = Field(default=0.0, ge=0.0, le=1.0)
    max_score: float = Field(default=1.0, ge=0.0, le=1.0)
class ReviewFinding(BaseModel):
    """Compatibility shim for earlier template-derived environment code."""

    title: str = ""
    # 1-based line the finding points at, if any.
    line: Optional[int] = Field(default=None, ge=1)
    # Free-form strings here (not the IssueType/Severity enums) to keep the
    # older template's loose schema.
    category: str = "bug"
    severity: str = "warning"
    rationale: str = ""
    recommendation: Optional[str] = None
    rule_id: Optional[str] = None
class TaskDescriptor(BaseModel):
    """Compatibility shim for earlier template-derived environment code."""

    task_id: str
    # Free-form string here; the newer TaskMetadata uses the Difficulty enum.
    difficulty: str
    title: str
    objective: str
    code: str
    max_steps: int = Field(..., ge=1)
    # Fraction in [0, 1]; presumably the minimum score to pass — confirm
    # against the evaluator.
    success_threshold: float = Field(default=0.0, ge=0.0, le=1.0)
class TaskEvaluation(BaseModel):
    """Compatibility shim for earlier template-derived environment code."""

    # Reference-issue IDs that submitted findings were matched to.
    matched_reference_ids: List[str] = Field(default_factory=list)
    # Finding counters.
    matched_findings: int = 0
    total_findings: int = 0
    false_positives: int = 0
    duplicate_findings: int = 0
    # Score components and the final verdict.
    weighted_recall: float = 0.0
    patch_score: float = 0.0
    score: float = 0.0
    passed: bool = False
class PythonEnvConfig(BaseModel):
    """Environment configuration used by the benchmark runtime."""

    # Task IDs played in order; see `rotate_tasks` for wrap-around behavior.
    task_order: List[str] = Field(
        default_factory=lambda: ["task_easy", "task_medium", "task_hard"]
    )
    max_steps_per_task: int = Field(default=25, ge=1, le=100)
    # Cap on stored `review_history` entries.
    max_history_entries: int = Field(default=200, ge=1, le=1000)
    rotate_tasks: bool = True
    # Evaluation parameters: bonus/penalty weights applied by the scorer —
    # exact usage lives in the runtime's evaluation code.
    patch_bonus_multiplier: float = 0.2
    false_positive_penalty: float = 0.05
    duplicate_penalty: float = 0.02
    hint_penalty: float = 0.1
class EpisodeMetrics(BaseModel):
    """Current episode metrics for UI, evaluation, and RL logging."""

    # Classification quality of the agent's comments so far (fractions).
    precision: float = Field(default=0.0, ge=0.0, le=1.0)
    recall: float = Field(default=0.0, ge=0.0, le=1.0)
    f1: float = Field(default=0.0, ge=0.0, le=1.0)
    true_positives: int = Field(default=0, ge=0)
    false_positives: int = Field(default=0, ge=0)
    missed_issues: int = Field(default=0, ge=0)
    # Coverage of required gold issues vs. optional (bonus) finds.
    required_found: int = Field(default=0, ge=0)
    required_total: int = Field(default=0, ge=0)
    bonus_found: int = Field(default=0, ge=0)
    # Behavioral counters.
    duplicate_comments: int = Field(default=0, ge=0)
    context_requests: int = Field(default=0, ge=0)
    skipped_clean_lines: int = Field(default=0, ge=0)
    skipped_issue_lines: int = Field(default=0, ge=0)
    # Aggregates: normalized score in [0, 1], raw reward total, and a
    # per-component reward breakdown keyed by component name.
    current_score: float = Field(default=0.0, ge=0.0, le=1.0)
    cumulative_reward: float = 0.0
    breakdown: Dict[str, float] = Field(default_factory=dict)
class RewardSummary(BaseModel):
    """Reward details from the most recent step."""

    step_reward: float = 0.0
    cumulative_reward: float = 0.0
    # Per-component reward contributions keyed by component name.
    breakdown: Dict[str, float] = Field(default_factory=dict)
    # Match counters — same names as in EpisodeMetrics, presumably mirrored
    # from it each step.
    false_positives: int = Field(default=0, ge=0)
    true_positives: int = Field(default=0, ge=0)
    missed_issues: int = Field(default=0, ge=0)
class PythonReviewAction(Action):
    """Structured review action emitted by a model or trainer.

    Benchmark actions set `action_type` plus whichever companion fields that
    action requires; the shape rules are enforced by `validate_action_shape`.
    Legacy template actions instead set `operation` to something other than
    "ADD_COMMENT" (e.g. `submit_findings`) and use `findings`/`patched_code`;
    those bypass the shape validation entirely.
    """

    # --- Legacy template fields (kept for backward compatibility) ---
    operation: str = Field(default="ADD_COMMENT", description="The operation to perform.")
    findings: List[ReviewFinding] = Field(default_factory=list, description="The findings list.")
    patched_code: Optional[str] = Field(default=None, description="The fixed source code.")

    # --- Benchmark fields ---
    action_type: ActionType = Field(
        default=ActionType.ADD_COMMENT,
        description="Choose the review action: comment on a line, skip a clean line, ask for context, approve, or request changes.",
    )
    line_number: Optional[int] = Field(
        default=None,
        ge=1,
        description="Required for ADD_COMMENT and SKIP_LINE. Enter the code line number you are acting on.",
    )
    issue_type: Optional[IssueType] = Field(
        default=None,
        description="Required for ADD_COMMENT. Pick the issue category for the selected line.",
    )
    severity: Optional[Severity] = Field(
        default=None,
        description="Required for ADD_COMMENT. Pick how serious the issue is.",
    )
    comment: Optional[str] = Field(
        default=None,
        description="Required for ADD_COMMENT. Also used for ASK_CONTEXT if you want to ask a question in plain text.",
    )
    suggestion: Optional[str] = None
    question: Optional[str] = None

    # BUG FIX: pydantic defines `model_json_schema` as a classmethod. The
    # original override omitted @classmethod, so `cls` was actually an
    # instance and class-level calls (how schemas are normally generated,
    # e.g. `PythonReviewAction.model_json_schema()`) raised TypeError.
    @classmethod
    def model_json_schema(cls, *args, **kwargs):
        """Trim legacy fields from the generated UI schema."""
        schema = super().model_json_schema(*args, **kwargs)
        properties = schema.get("properties", {})
        visible_fields = {
            "action_type",
            "line_number",
            "issue_type",
            "severity",
            "comment",
        }
        schema["properties"] = {
            name: value for name, value in properties.items() if name in visible_fields
        }
        schema["required"] = [
            name for name in schema.get("required", []) if name in visible_fields
        ]
        return schema

    # BUG FIX: `model_validator` is imported at the top of this module but was
    # never applied here, so this validator never ran and malformed actions
    # were accepted silently. NOTE(review): with the validator active, a
    # default-constructed action (ADD_COMMENT with no fields) now fails
    # validation — confirm callers always populate the ADD_COMMENT fields.
    @model_validator(mode="after")
    def validate_action_shape(self) -> "PythonReviewAction":
        """Require the right fields for each action type.

        Raises:
            ValueError: when the populated fields do not match `action_type`.
        """
        # Legacy template actions still use string operations like `submit_findings`.
        # Benchmark actions should validate against `action_type`.
        if self.operation != "ADD_COMMENT":
            return self
        if self.action_type == ActionType.ADD_COMMENT:
            missing = []
            if self.line_number is None:
                missing.append("line_number")
            if self.issue_type is None:
                missing.append("issue_type")
            if not (self.comment or "").strip():
                missing.append("comment")
            if self.severity is None:
                missing.append("severity")
            if missing:
                # Report field names in the original declaration order.
                order = ["line_number", "issue_type", "severity", "comment"]
                missing.sort(key=order.index)
                raise ValueError("ADD_COMMENT requires: " + ", ".join(missing))
        elif self.action_type == ActionType.SKIP_LINE:
            if self.line_number is None:
                raise ValueError("SKIP_LINE requires line_number")
        elif self.action_type == ActionType.ASK_CONTEXT:
            # Either free-text field satisfies the requirement.
            if not (self.question or self.comment or "").strip():
                raise ValueError("ASK_CONTEXT requires question or comment")
        elif self.action_type in {ActionType.APPROVE, ActionType.REQUEST_CHANGES}:
            # Verdict actions must be clean: stray per-line fields suggest the
            # caller meant a different action type.
            noisy_fields = {
                "line_number": self.line_number,
                "issue_type": self.issue_type,
                "severity": self.severity,
                "comment": self.comment,
                "suggestion": self.suggestion,
                "question": self.question,
            }
            populated = [
                name for name, value in noisy_fields.items() if value not in (None, "")
            ]
            if populated:
                raise ValueError(
                    f"{self.action_type.value} does not accept extra fields: {', '.join(populated)}"
                )
        return self
class PythonReviewObservation(Observation):
    """Observation returned by reset/step, including trainer-visible metrics."""

    # Snippet under review.
    snippet_id: str = ""
    code: str = ""
    filename: str = ""
    language: str = "python"
    context: Optional[str] = None
    diff: Optional[str] = None
    line_count: int = Field(default=0, ge=0)
    # Episode progress.
    current_step: int = Field(default=0, ge=0)
    max_steps: int = Field(default=1, ge=1)
    task_id: str = ""
    # What the agent has done so far, plus the code pre-split into lines.
    review_history: List[ReviewComment] = Field(default_factory=list)
    lines: List[str] = Field(default_factory=list)
    # Reward/metric signals exposed to trainers.
    reward_summary: RewardSummary = Field(default_factory=RewardSummary)
    metrics: EpisodeMetrics = Field(default_factory=EpisodeMetrics)
    feedback: str = ""
    review_time_ms: float = 0.0
    # Template compatibility
    task: Optional[TaskDescriptor] = None
    instructions: str = ""
    submitted_findings: List[ReviewFinding] = Field(default_factory=list)
    hints_used: int = 0
    attempts_remaining: int = 0
    evaluation: Optional[TaskEvaluation] = None
    score: float = 0.0
class PythonReviewState(State):
    """Full server-side state exposed by `/state`."""

    # All identifiers are None/empty until an episode is active.
    task_id: Optional[str] = None
    difficulty: Optional[Difficulty] = None
    snippet_id: Optional[str] = None
    current_step: int = Field(default=0, ge=0)
    # NOTE(review): ge=0 here (unlike ge=1 on the observation), presumably so
    # an idle server can report max_steps=0 — confirm intended.
    max_steps: int = Field(default=0, ge=0)
    done: bool = False
    filename: Optional[str] = None
    review_history: List[ReviewComment] = Field(default_factory=list)
    metrics: EpisodeMetrics = Field(default_factory=EpisodeMetrics)
    # Feedback string from the most recent step.
    last_feedback: str = ""
class TaskListResponse(BaseModel):
    """Payload carrying the metadata of every available task family."""

    tasks: List[TaskMetadata] = Field(default_factory=list)
class MetricsResponse(BaseModel):
    """Payload exposing the current episode's metrics."""

    # Identifiers of the active task/snippet; None when nothing is loaded.
    task_id: Optional[str] = None
    snippet_id: Optional[str] = None
    done: bool = False
    metrics: EpisodeMetrics = Field(default_factory=EpisodeMetrics)
class HealthResponse(BaseModel):
    """Health-check payload describing the running environment server."""

    status: str = "ok"
    environment: str = "python_code_review_env"
    task_count: int = Field(default=0, ge=0)
    # Identifiers of whatever is currently active; None when idle.
    active_task_id: Optional[str] = None
    active_snippet_id: Optional[str] = None
    active_episode_id: Optional[str] = None
# Backward-compatibility aliases so imports from earlier revisions keep working.
PythonAction = PythonReviewAction
PythonObservation = PythonReviewObservation
PythonState = PythonReviewState
CodeReviewAction = PythonReviewAction
CodeReviewObservation = PythonReviewObservation
# Completes the CodeReview* alias family (Action/Observation/Config existed,
# the State alias was missing).
CodeReviewState = PythonReviewState
CodeReviewConfig = PythonEnvConfig