# python_env/graders/optimization.py
# NOTE: uploaded via huggingface_hub by uvpatel7271 (revision c8e832f, verified)
"""Deterministic grading for optimization and refactor tasks."""
from __future__ import annotations
import json
import subprocess
import sys
import tempfile
from pathlib import Path
from graders.common import clamp_score, compile_tree, nested_loop_depth, style_score
from graders.pytest_runner import run_pytest_suite
from models import TaskGrade
from tasks.task_bank import TaskSpec
def _benchmark_script(task: TaskSpec) -> str:
return f"""import json
import time
from candidate import {task.benchmark_entrypoint}
{task.benchmark_builder}
events = build_benchmark_events()
start = time.perf_counter()
for _ in range({task.benchmark_repeats}):
result = {task.benchmark_entrypoint}(events)
elapsed = time.perf_counter() - start
Path = __import__("pathlib").Path
Path("benchmark.json").write_text(json.dumps({{"elapsed": elapsed, "rows": len(result)}}), encoding="utf-8")
"""
def benchmark_runtime(candidate_code: str, task: TaskSpec) -> tuple[float, bool, str]:
    """Benchmark runtime deterministically against the starter implementation.

    Runs the starter and the candidate benchmark scripts in a fresh temp
    directory and returns ``(runtime_score, timed_out, output)``:

    - ``runtime_score`` — 0..1, scaled from the candidate's speedup over the
      starter (a 4x speedup reaches 1.0).
    - ``timed_out`` — ``True`` when either benchmark subprocess exceeded
      ``task.benchmark_timeout_s``.
    - ``output`` — aggregated subprocess streams plus a timing summary, or an
      error description on failure.
    """
    assert task.benchmark_entrypoint is not None
    try:
        with tempfile.TemporaryDirectory(prefix="python-code-review-bench-") as temp_dir:
            temp_path = Path(temp_dir)
            (temp_path / "candidate.py").write_text(candidate_code, encoding="utf-8")
            (temp_path / "starter.py").write_text(task.starter_code, encoding="utf-8")
            (temp_path / "candidate_runner.py").write_text(_benchmark_script(task), encoding="utf-8")
            # The starter runner is identical except it imports the baseline module.
            starter_script = _benchmark_script(task).replace("from candidate import", "from starter import")
            (temp_path / "starter_runner.py").write_text(starter_script, encoding="utf-8")
            # Both runner scripts write their payload to this same file.
            result_path = temp_path / "benchmark.json"
            try:
                starter_run = subprocess.run(
                    [sys.executable, "starter_runner.py"],
                    cwd=temp_path,
                    capture_output=True,
                    text=True,
                    timeout=task.benchmark_timeout_s,
                    check=False,
                )
                starter_payload = json.loads(result_path.read_text(encoding="utf-8"))
                # Remove the starter's payload before the candidate run: if the
                # candidate subprocess fails to produce one, the read below must
                # raise instead of silently reusing the stale starter payload
                # (which would fabricate a ~1.0x speedup).
                result_path.unlink()
                candidate_run = subprocess.run(
                    [sys.executable, "candidate_runner.py"],
                    cwd=temp_path,
                    capture_output=True,
                    text=True,
                    timeout=task.benchmark_timeout_s,
                    check=False,
                )
                candidate_payload = json.loads(result_path.read_text(encoding="utf-8"))
            except subprocess.TimeoutExpired as exc:
                output = (exc.stdout or "") + (exc.stderr or "")
                return 0.0, True, (output or "benchmark timed out").strip()
            except Exception as exc:  # pragma: no cover
                # Covers missing/invalid benchmark.json (crashed runner) and
                # any other environment failure; reported as a non-timeout error.
                return 0.0, False, str(exc)
            # Guard against zero elapsed times before dividing.
            starter_elapsed = max(float(starter_payload["elapsed"]), 1e-9)
            candidate_elapsed = max(float(candidate_payload["elapsed"]), 1e-9)
            speedup = starter_elapsed / candidate_elapsed
            # Linear ramp: 1x speedup -> 0.0, 4x (or better) -> 1.0.
            runtime_score = clamp_score(min((speedup - 1.0) / 3.0, 1.0))
            output = "\n".join(
                part
                for part in [
                    starter_run.stdout.strip(),
                    starter_run.stderr.strip(),
                    candidate_run.stdout.strip(),
                    candidate_run.stderr.strip(),
                    f"starter={starter_elapsed:.6f}s candidate={candidate_elapsed:.6f}s speedup={speedup:.2f}x",
                ]
                if part
            )
            return runtime_score, False, output
    except Exception as exc:  # pragma: no cover
        return 0.0, False, str(exc)
def ast_quality_score(code: str, task: TaskSpec) -> float:
    """Score maintainability and algorithmic structure.

    Awards 0.2 for a docstring on the first top-level function, 0.4 for
    avoiding deeply nested loops, and 0.2 per quality marker found in the
    source text; the sum is clamped to 0..1. Unparseable code scores 0.0.
    """
    tree, _parse_error = compile_tree(code)
    if tree is None:
        return 0.0
    import ast

    # First top-level (possibly async) function definition, if any.
    function_node = None
    for node in tree.body:
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            function_node = node
            break

    points = 0.0
    # A present, non-empty docstring earns credit.
    if function_node is not None and ast.get_docstring(function_node, clean=False):
        points += 0.2
    # Reward flat algorithmic structure (no loop nested inside another loop).
    if nested_loop_depth(tree) <= 1:
        points += 0.4
    # Each expected quality marker present in the raw source adds credit.
    for quality_marker in task.expected_quality_markers:
        if quality_marker not in code:
            continue
        points += 0.2
    return clamp_score(points)
def grade_optimization_task(candidate_code: str, task: TaskSpec) -> TaskGrade:
    """Grade optimization tasks using correctness, runtime, AST quality, and style.

    The final score is a weighted blend: 50% test pass rate, 30% benchmark
    speedup, 15% AST quality, 5% PEP 8 style. A timeout in either the test
    suite or the benchmark short-circuits to a zero score.
    """
    # Run visible and hidden tests together; correctness dominates the score.
    suite = run_pytest_suite(
        candidate_code,
        [*task.visible_tests, *task.hidden_tests],
        timeout_s=task.benchmark_timeout_s,
    )
    pass_ratio = suite.passed / suite.total if suite.total else 0.0
    if suite.timed_out:
        return TaskGrade(
            score=0.0,
            tests_passed=suite.passed,
            tests_total=suite.total,
            timed_out=True,
            details={"tests": suite.output},
        )

    bench_score, bench_timed_out, bench_output = benchmark_runtime(candidate_code, task)
    if bench_timed_out:
        return TaskGrade(
            score=0.0,
            tests_passed=suite.passed,
            tests_total=suite.total,
            timed_out=True,
            details={"tests": suite.output, "benchmark": bench_output},
        )

    structure_score = ast_quality_score(candidate_code, task)
    lint_score = style_score(candidate_code, task.style_max_line_length)
    blended = clamp_score(
        0.5 * pass_ratio + 0.3 * bench_score + 0.15 * structure_score + 0.05 * lint_score
    )
    return TaskGrade(
        score=blended,
        syntax_score=1.0,
        tests_passed=suite.passed,
        tests_total=suite.total,
        quality_score=structure_score,
        runtime_score=bench_score,
        details={
            "tests": suite.output,
            "benchmark": bench_output,
            "test_fraction": round(pass_ratio, 4),
            "runtime_score": round(bench_score, 4),
            "style_score": round(lint_score, 4),
        },
    )