Spaces:
Sleeping
Sleeping
| # Copyright (c) Meta Platforms, Inc. and affiliates. | |
| # All rights reserved. | |
| # | |
| # This source code is licensed under the BSD-style license found in the | |
| # LICENSE file in the root directory of this source tree. | |
| """Python code-review environment implementation.""" | |
| from __future__ import annotations | |
| from dataclasses import dataclass, field | |
| from datetime import UTC, datetime | |
| from typing import Dict, Iterable, List, Optional | |
| from uuid import uuid4 | |
| from openenv.core.env_server.interfaces import Environment | |
| from openenv.core.env_server.types import State | |
| try: | |
| from ..models import ( | |
| Difficulty, | |
| PythonAction, | |
| PythonEnvConfig, | |
| PythonObservation, | |
| PythonState, | |
| ReviewFinding, | |
| TaskDescriptor, | |
| TaskEvaluation, | |
| TaskMetadata, | |
| ) | |
| except ImportError: | |
| from models import ( # type: ignore | |
| Difficulty, | |
| PythonAction, | |
| PythonEnvConfig, | |
| PythonObservation, | |
| PythonState, | |
| ReviewFinding, | |
| TaskDescriptor, | |
| TaskEvaluation, | |
| TaskMetadata, | |
| ) | |
@dataclass
class ReferenceFinding:
    """Hidden finding metadata used for deterministic grading.

    Instances describe one issue the reviewer is expected to report. They are
    never shown to the agent; they only drive scoring.

    NOTE: the ``@dataclass`` decorator is required — instances are built with
    keyword arguments in TASK_BANK and ``keywords`` uses ``field(...)``.
    """

    # Stable identifier; an exact (normalized) rule_id match is authoritative.
    rule_id: str
    # Human-readable summary; matched by bidirectional substring against titles.
    title: str
    # 1-indexed line in the task's code snippet where the issue occurs.
    line: int
    category: str
    severity: str
    rationale: str
    recommendation: str
    # Relative contribution of this finding to the weighted-recall score.
    weight: float
    # Fallback match terms checked as substrings of the submitted title.
    keywords: List[str] = field(default_factory=list)
@dataclass
class ReviewTask:
    """A visible task plus its hidden grading references.

    Only ``descriptor`` is exposed to the agent; ``references``, ``hint`` and
    ``patched_code`` are consumed by the grader.

    NOTE: the ``@dataclass`` decorator is required — TASK_BANK constructs
    these with keyword arguments.
    """

    # The public task definition sent to the client.
    descriptor: TaskDescriptor
    # Hidden reference findings used for deterministic grading.
    references: tuple[ReferenceFinding, ...]
    # Text returned when the agent performs a "request_hint" action.
    hint: str
    # Reference fix; an exactly-matching (normalized) patch earns a bonus.
    patched_code: Optional[str] = None
# Static, deterministic task bank keyed by task_id. Each entry pairs the
# visible TaskDescriptor with hidden grading references and a reference patch.
# All reference `line` values are 1-indexed positions within the snippet
# stored in `code` (counting the blank line after the import).
TASK_BANK: Dict[str, ReviewTask] = {
    "py-review-easy": ReviewTask(
        descriptor=TaskDescriptor(
            task_id="py-review-easy",
            difficulty="easy",
            title="Mutable default argument",
            objective="Find the correctness issue and explain a safe fix.",
            code=(
                "def add_tag(tag, tags=[]):\n"
                "    tags.append(tag)\n"
                "    return tags\n"
            ),
            max_steps=4,
            success_threshold=0.7,
        ),
        references=(
            ReferenceFinding(
                rule_id="mutable-default",
                title="Mutable default list is shared across calls",
                line=1,  # the `def` line carrying the mutable default
                category="bug",
                severity="warning",
                rationale="The list persists between calls and leaks state.",
                recommendation="Use None as the default and create a new list inside the function.",
                weight=1.0,
                keywords=["mutable", "default", "list", "shared", "persists", "leaks"],
            ),
        ),
        hint="Look for state that survives between separate function calls.",
        patched_code=(
            "def add_tag(tag, tags=None):\n"
            "    if tags is None:\n"
            "        tags = []\n"
            "    tags.append(tag)\n"
            "    return tags\n"
        ),
    ),
    "py-review-medium": ReviewTask(
        descriptor=TaskDescriptor(
            task_id="py-review-medium",
            difficulty="medium",
            title="Unsafe shell invocation",
            objective="Review the snippet for security-sensitive behavior.",
            code=(
                "import os\n\n"
                "def run_backup(path):\n"
                "    os.system(f\"tar -czf backup.tgz {path}\")\n"
            ),
            max_steps=4,
            success_threshold=0.72,
        ),
        references=(
            ReferenceFinding(
                rule_id="shell-injection",
                title="User input is interpolated into a shell command",
                line=4,  # the os.system(...) call
                category="security",
                severity="critical",
                rationale="An attacker can inject shell metacharacters through the path argument.",
                recommendation="Use subprocess with an argument list instead of os.system.",
                # FIX: "unsantized" was a typo that silently broke keyword
                # matching for findings mentioning "unsanitized".
                keywords=["shell", "injection", "os.system", "subprocess", "input", "unsanitized", "escaping"],
                weight=1.0,
            ),
        ),
        hint="Check how external commands are invoked and whether user input is escaped.",
        patched_code=(
            "import subprocess\n\n"
            "def run_backup(path):\n"
            "    subprocess.run([\"tar\", \"-czf\", \"backup.tgz\", path], check=True)\n"
        ),
    ),
    "py-review-hard": ReviewTask(
        descriptor=TaskDescriptor(
            task_id="py-review-hard",
            difficulty="hard",
            title="Retry helper hides failures",
            objective="Identify correctness and maintainability issues in the retry logic.",
            code=(
                "import time\n\n"
                "def fetch_with_retry(client, url, retries=3):\n"
                "    last_error = None\n"
                "    for _ in range(retries):\n"
                "        try:\n"
                "            return client.get(url, timeout=1)\n"
                "        except Exception as exc:\n"
                "            last_error = exc\n"
                "            time.sleep(0.1)\n"
                "    return None\n"
            ),
            max_steps=4,
            success_threshold=0.74,
        ),
        references=(
            ReferenceFinding(
                rule_id="swallowed-error",
                title="Function swallows the final exception and returns None",
                # FIX: was 10 (off by one); "return None" is line 11 of the
                # snippet, so exact line matches never fired for this finding.
                line=11,
                category="bug",
                severity="warning",
                rationale="Callers cannot distinguish a failed request from a valid None result.",
                recommendation="Re-raise the last exception after retries are exhausted.",
                weight=0.65,
                keywords=["swallowed", "exception", "return none", "error handling"],
            ),
            ReferenceFinding(
                rule_id="broad-except",
                title="Broad exception handler catches unexpected failures",
                # FIX: was 7 (off by one); the `except Exception` is line 8.
                line=8,
                category="maintainability",
                severity="info",
                rationale="Catching Exception masks programming errors and interrupts.",
                recommendation="Catch only the client or network exceptions you expect to retry.",
                weight=0.35,
                keywords=["broad", "except", "catch exception"],
            ),
        ),
        hint="Consider what happens to the final error after the retry loop finishes.",
        patched_code=(
            "import time\n\n"
            "def fetch_with_retry(client, url, retries=3):\n"
            "    last_error = None\n"
            "    for _ in range(retries):\n"
            "        try:\n"
            "            return client.get(url, timeout=1)\n"
            "        except client.retryable_exceptions as exc:\n"
            "            last_error = exc\n"
            "            time.sleep(0.1)\n"
            "    if last_error is not None:\n"
            "        raise last_error\n"
        ),
    ),
}
| def _utc_now() -> str: | |
| return datetime.now(UTC).isoformat() | |
| def _normalize_text(value: Optional[str]) -> str: | |
| return " ".join((value or "").strip().lower().split()) | |
| def _normalize_code(value: Optional[str]) -> str: | |
| return "\n".join(line.rstrip() for line in (value or "").strip().splitlines()) | |
class PythonEnvironment(Environment[PythonAction, PythonObservation, State]):
    """Deterministic benchmark environment for Python code review tasks."""

    # NOTE(review): presumably read by the serving layer to permit parallel
    # sessions -- confirm against the Environment base-class contract.
    SUPPORTS_CONCURRENT_SESSIONS: bool = True

    def __init__(self, config: Optional[PythonEnvConfig] = None):
        """Initialize the environment.

        Args:
            config: Scoring/limit configuration; defaults to PythonEnvConfig().
        """
        super().__init__()
        self._config = config or PythonEnvConfig()
        # Generic episode state; step_count drives the attempt budget.
        self._state = State(episode_id=str(uuid4()), step_count=0)
        # Starts at -1 so the first reset() selects task_order[0].
        self._task_cursor = -1
        self._current_task: Optional[ReviewTask] = None
        # Findings accumulate across steps within one episode.
        self._submitted_findings: List[ReviewFinding] = []
        self._hints_used = 0
        self._created_at = _utc_now()

    def reset(
        self,
        seed: Optional[int] = None,
        episode_id: Optional[str] = None,
        **kwargs,
    ) -> PythonObservation:
        """Start the next configured review task.

        Task selection is deterministic (cycles through config.task_order),
        so `seed` is intentionally ignored.
        """
        del seed, kwargs
        # Advance cyclically through the configured task order.
        self._task_cursor = (self._task_cursor + 1) % len(self._config.task_order)
        task_id = self._config.task_order[self._task_cursor]
        # Unknown task ids silently fall back to the easy task.
        self._current_task = TASK_BANK.get(task_id, TASK_BANK["py-review-easy"])
        self._state = State(
            episode_id=episode_id or str(uuid4()),
            step_count=0,
        )
        # Clear per-episode accumulators.
        self._submitted_findings = []
        self._hints_used = 0
        self._created_at = _utc_now()
        return self._build_observation(
            feedback="New review task loaded. Submit findings or request a hint.",
            reward=0.0,
            done=False,
        )

    def step(
        self,
        action: PythonAction,
        timeout_s: Optional[float] = None,
        **kwargs,
    ) -> PythonObservation:
        """Process one review action and return updated feedback."""
        del timeout_s, kwargs
        # Stepping before reset() implicitly starts the first task.
        if self._current_task is None:
            return self.reset()
        self._state.step_count += 1
        operation = action.operation
        feedback = ""
        reward = 0.0
        done = False
        if operation == "request_hint":
            # The hint is returned as feedback; its cost is applied inside
            # _evaluate via the configured hint_penalty. Any findings
            # attached to a hint action are ignored on this branch.
            self._hints_used += 1
            feedback = self._current_task.hint
            evaluation = self._evaluate(self._submitted_findings, action.patched_code)
            reward = evaluation.score
        else:
            # Findings accumulate across steps; duplicates are detected and
            # penalized by the grader rather than deduplicated here.
            if action.findings:
                self._submitted_findings.extend(action.findings)
            evaluation = self._evaluate(self._submitted_findings, action.patched_code)
            reward = evaluation.score
            if operation == "finalize":
                done = True
                feedback = (
                    "Review finalized. "
                    f"Matched {evaluation.matched_findings}/{evaluation.total_findings} "
                    "reference findings."
                )
            else:
                feedback = (
                    f"Progress saved. Matched {evaluation.matched_findings}/"
                    f"{evaluation.total_findings} findings with score {evaluation.score:.2f}."
                )
        # Enforce the per-task step budget regardless of the operation.
        if self._state.step_count >= self._max_steps():
            done = True
            if operation != "finalize":
                feedback = (
                    f"{feedback} Maximum steps reached."
                    if feedback
                    else "Maximum steps reached."
                )
        return self._build_observation(
            feedback=feedback,
            reward=reward,
            done=done,
            patched_code=action.patched_code,
        )

    def _build_observation(
        self,
        *,
        feedback: str,
        reward: float,
        done: bool,
        patched_code: Optional[str] = None,
    ) -> PythonObservation:
        """Assemble the observation returned to the client.

        Re-runs the grader so the observation always carries a fresh
        evaluation snapshot for the current submissions.
        """
        assert self._current_task is not None
        evaluation = self._evaluate(self._submitted_findings, patched_code)
        # Never report a negative remaining-attempt count.
        attempts_remaining = max(
            self._max_steps() - self._state.step_count,
            0,
        )
        return PythonObservation(
            task=self._current_task.descriptor,
            feedback=feedback,
            submitted_findings=list(self._submitted_findings),
            hints_used=self._hints_used,
            attempts_remaining=attempts_remaining,
            evaluation=evaluation,
            score=evaluation.score,
            # Synthetic wall-clock proxy: 125 ms per step keeps runs deterministic.
            review_time_ms=float(self._state.step_count * 125),
            done=done,
            reward=reward,
            metadata={
                "episode_id": self._state.episode_id,
                "created_at": self._created_at,
                "updated_at": _utc_now(),
            },
        )

    def _evaluate(
        self,
        findings: Iterable[ReviewFinding],
        patched_code: Optional[str],
    ) -> TaskEvaluation:
        """Grade submitted findings (and an optional patch) against the
        hidden references, returning a full TaskEvaluation."""
        assert self._current_task is not None
        references = self._current_task.references
        matched_reference_ids: List[str] = []
        matched_weight = 0.0
        false_positives = 0
        duplicate_findings = 0
        seen_ids = set()
        for finding in findings:
            ref_id = self._match_reference(finding, references)
            if ref_id is None:
                # Unmatched findings count against the score.
                false_positives += 1
                continue
            if ref_id in seen_ids:
                # Repeated matches of the same reference are penalized, not rewarded.
                duplicate_findings += 1
                continue
            seen_ids.add(ref_id)
            matched_reference_ids.append(ref_id)
            matched_weight += next(ref.weight for ref in references if ref.rule_id == ref_id)
        # Guard against a zero total weight so the division below is safe.
        total_weight = sum(ref.weight for ref in references) or 1.0
        weighted_recall = min(matched_weight / total_weight, 1.0)
        patch_score = 0.0
        if self._current_task.patched_code and patched_code:
            # All-or-nothing: the patch must equal the reference after
            # whitespace normalization.
            patch_score = float(
                _normalize_code(patched_code) == _normalize_code(self._current_task.patched_code)
            )
        raw_score = (
            weighted_recall
            + (self._config.patch_bonus_multiplier * patch_score)
            - (self._config.false_positive_penalty * false_positives)
            - (self._config.duplicate_penalty * duplicate_findings)
            - (self._config.hint_penalty * self._hints_used)
        )
        # Clamp the final score to [0, 1].
        score = max(0.0, min(raw_score, 1.0))
        return TaskEvaluation(
            matched_reference_ids=matched_reference_ids,
            matched_findings=len(matched_reference_ids),
            total_findings=len(references),
            false_positives=false_positives,
            duplicate_findings=duplicate_findings,
            weighted_recall=weighted_recall,
            patch_score=patch_score,
            score=score,
            passed=score >= self._current_task.descriptor.success_threshold,
        )

    def _match_reference(
        self,
        finding: ReviewFinding,
        references: Iterable[ReferenceFinding],
    ) -> Optional[str]:
        """Return the rule_id of the first reference this finding matches,
        or None if it matches nothing (i.e. it is a false positive)."""
        finding_rule = _normalize_text(finding.rule_id)
        finding_title = _normalize_text(finding.title)
        for reference in references:
            # An exact (normalized) rule-id match is authoritative.
            if finding_rule and finding_rule == _normalize_text(reference.rule_id):
                return reference.rule_id
            line_matches = finding.line is not None and finding.line == reference.line
            category_matches = finding.category == reference.category
            # Bidirectional substring match so partial titles still count.
            title_matches = finding_title and (
                finding_title in _normalize_text(reference.title)
                or _normalize_text(reference.title) in finding_title
            )
            # Keyword match: check if any reference keywords are in the finding text
            keyword_match = any(
                _normalize_text(kw) in finding_title
                for kw in getattr(reference, "keywords", [])
            ) if finding_title else False
            # Relaxed matching: allow matching if the title or keywords match even if the line is missing
            if (line_matches and (category_matches or title_matches)) or title_matches or keyword_match:
                return reference.rule_id
        return None

    def _max_steps(self) -> int:
        """Effective step budget: the tighter of the task's own limit and
        the environment-wide cap."""
        assert self._current_task is not None
        return min(
            self._current_task.descriptor.max_steps,
            self._config.max_steps_per_task,
        )

    def state(self) -> State:
        """Return the current environment state."""
        return self._state
| # Compatibility bridge: | |
| # keep the old module path, but route the actual app/runtime through the | |
| # dataset-backed dense-reward benchmark implementation. | |
| try: | |
| from .review_runtime import ( | |
| PythonReviewRuntime as _BenchmarkPythonEnvironment, | |
| get_current_state, | |
| get_health_response, | |
| get_metrics_response, | |
| get_tasks_response, | |
| ) | |
| except ImportError: | |
| from server.review_runtime import ( # type: ignore | |
| PythonReviewRuntime as _BenchmarkPythonEnvironment, | |
| get_current_state, | |
| get_health_response, | |
| get_metrics_response, | |
| get_tasks_response, | |
| ) | |
# Lazily created singleton shared by all HTTP handlers.
_GLOBAL_ENV: Optional[_BenchmarkPythonEnvironment] = None


def get_environment() -> _BenchmarkPythonEnvironment:
    """Return the shared benchmark environment used by the HTTP app."""
    global _GLOBAL_ENV
    if _GLOBAL_ENV is not None:
        return _GLOBAL_ENV
    _GLOBAL_ENV = _BenchmarkPythonEnvironment()
    return _GLOBAL_ENV


PythonEnvironment = _BenchmarkPythonEnvironment