Spaces:

uvpatel7271
/

python_env

Build error

File size: 6,394 Bytes

c8e832f

"""Deterministic grading for optimization and refactor tasks."""

from __future__ import annotations

import json
import subprocess
import sys
import tempfile
from pathlib import Path

from graders.common import clamp_score, compile_tree, nested_loop_depth, style_score
from graders.pytest_runner import run_pytest_suite
from models import TaskGrade
from tasks.task_bank import TaskSpec


def _benchmark_script(task: TaskSpec) -> str:
    """Render the source code of a standalone timing script for ``task``.

    The generated script imports ``task.benchmark_entrypoint`` from a module
    named ``candidate``, embeds ``task.benchmark_builder`` verbatim (the script
    then calls ``build_benchmark_events()``, so the builder source has to
    define it), invokes the entrypoint ``task.benchmark_repeats`` times on the
    built events, and writes ``{"elapsed": ..., "rows": ...}`` as JSON to
    ``benchmark.json`` in the current working directory.

    The literal text ``from candidate import`` is part of the contract:
    ``benchmark_runtime`` rewrites it to target the starter module.
    """

    # Literal braces of the JSON payload are doubled so ``str.format`` keeps them.
    template = """import json

import time

from candidate import {entrypoint}



{builder}



events = build_benchmark_events()

start = time.perf_counter()

for _ in range({repeats}):

    result = {entrypoint}(events)

elapsed = time.perf_counter() - start

Path = __import__("pathlib").Path

Path("benchmark.json").write_text(json.dumps({{"elapsed": elapsed, "rows": len(result)}}), encoding="utf-8")

"""
    return template.format(
        entrypoint=task.benchmark_entrypoint,
        builder=task.benchmark_builder,
        repeats=task.benchmark_repeats,
    )


def _stream_text(stream: str | bytes | None) -> str:
    """Return captured subprocess output as ``str``.

    ``subprocess.TimeoutExpired`` can carry ``bytes`` (or ``None``) even when
    the process was started with ``text=True``, so every form is accepted.
    """

    if stream is None:
        return ""
    if isinstance(stream, bytes):
        return stream.decode("utf-8", errors="replace")
    return stream


def _run_benchmark_script(
    script_name: str,
    workdir: Path,
    timeout_s: float | None,
) -> tuple[subprocess.CompletedProcess, dict]:
    """Run one generated runner script in ``workdir`` and load its result file.

    Raises:
        subprocess.TimeoutExpired: the script exceeded ``timeout_s``.
        RuntimeError: the script exited non-zero or wrote no ``benchmark.json``;
            the message carries the captured stdout/stderr when there is any.
    """

    result_path = workdir / "benchmark.json"
    # Every runner writes to the same file. Remove the previous result so a
    # crashed run can never be scored with the timing of an earlier run.
    if result_path.exists():
        result_path.unlink()

    completed = subprocess.run(
        [sys.executable, script_name],
        cwd=workdir,
        capture_output=True,
        text=True,
        timeout=timeout_s,
        check=False,
    )
    if completed.returncode != 0 or not result_path.exists():
        captured = (completed.stdout + completed.stderr).strip()
        raise RuntimeError(
            captured
            or f"{script_name} did not produce a valid benchmark result "
            f"(exit code {completed.returncode})"
        )
    return completed, json.loads(result_path.read_text(encoding="utf-8"))


def benchmark_runtime(candidate_code: str, task: TaskSpec) -> tuple[float, bool, str]:
    """Time the candidate against the starter implementation and score the speedup.

    Both implementations are written to a temporary directory and executed in
    separate interpreter processes through the script produced by
    ``_benchmark_script``; the starter runs first, then the candidate.

    Args:
        candidate_code: Source of the submitted solution, saved as ``candidate.py``.
        task: Task description supplying the starter code, the benchmark
            entrypoint/builder/repeat count and the per-run timeout.

    Returns:
        ``(runtime_score, timed_out, output)``. ``runtime_score`` is
        ``clamp_score(min((speedup - 1) / 3, 1.0))`` where ``speedup`` is starter
        time divided by candidate time, so a 4x speedup reaches the cap. When a
        run times out the result is ``(0.0, True, <captured output>)``; any
        other failure yields ``(0.0, False, <error message>)``.

    Raises:
        AssertionError: ``task.benchmark_entrypoint`` is ``None``.
    """

    assert task.benchmark_entrypoint is not None
    try:
        with tempfile.TemporaryDirectory(prefix="python-code-review-bench-") as temp_dir:
            temp_path = Path(temp_dir)
            candidate_script = _benchmark_script(task)
            # The starter runner is the same script pointed at ``starter.py``.
            starter_script = candidate_script.replace("from candidate import", "from starter import")

            (temp_path / "candidate.py").write_text(candidate_code, encoding="utf-8")
            (temp_path / "starter.py").write_text(task.starter_code, encoding="utf-8")
            (temp_path / "candidate_runner.py").write_text(candidate_script, encoding="utf-8")
            (temp_path / "starter_runner.py").write_text(starter_script, encoding="utf-8")

            try:
                starter_run, starter_payload = _run_benchmark_script(
                    "starter_runner.py", temp_path, task.benchmark_timeout_s
                )
                candidate_run, candidate_payload = _run_benchmark_script(
                    "candidate_runner.py", temp_path, task.benchmark_timeout_s
                )
            except subprocess.TimeoutExpired as exc:
                # The captured streams may be bytes here; normalise them so the
                # timeout is reported as a timeout instead of a TypeError.
                output = _stream_text(exc.stdout) + _stream_text(exc.stderr)
                return 0.0, True, (output or "benchmark timed out").strip()
            except Exception as exc:
                # Covers crashed runs (RuntimeError) and unreadable result files.
                return 0.0, False, str(exc)

            # Floor both timings to avoid dividing by zero on a degenerate run.
            starter_elapsed = max(float(starter_payload["elapsed"]), 1e-9)
            candidate_elapsed = max(float(candidate_payload["elapsed"]), 1e-9)
            speedup = starter_elapsed / candidate_elapsed
            runtime_score = clamp_score(min((speedup - 1.0) / 3.0, 1.0))
            output = "\n".join(
                part
                for part in [
                    starter_run.stdout.strip(),
                    starter_run.stderr.strip(),
                    candidate_run.stdout.strip(),
                    candidate_run.stderr.strip(),
                    f"starter={starter_elapsed:.6f}s candidate={candidate_elapsed:.6f}s speedup={speedup:.2f}x",
                ]
                if part
            )
            return runtime_score, False, output
    except Exception as exc:  # pragma: no cover
        return 0.0, False, str(exc)


def ast_quality_score(code: str, task: TaskSpec) -> float:
    """Rate the structure of ``code`` from three additive components.

    * 0.2 when the first top-level function (sync or async) has a non-empty
      docstring.
    * 0.4 when ``nested_loop_depth`` reports a depth of at most 1.
    * 0.2 for every entry of ``task.expected_quality_markers`` that occurs as a
      substring of ``code``.

    The sum is passed through ``clamp_score``. Code for which ``compile_tree``
    yields no tree scores ``0.0``.
    """

    import ast

    tree, _ = compile_tree(code)
    if tree is None:
        return 0.0

    # Only the first function defined at module level is inspected.
    first_function = None
    for statement in tree.body:
        if isinstance(statement, (ast.FunctionDef, ast.AsyncFunctionDef)):
            first_function = statement
            break

    if first_function and ast.get_docstring(first_function, clean=False):
        docstring_points = 0.2
    else:
        docstring_points = 0.0

    if nested_loop_depth(tree) <= 1:
        nested_points = 0.4
    else:
        nested_points = 0.0

    marker_points = 0.0
    for marker in task.expected_quality_markers:
        if marker not in code:
            continue
        marker_points += 0.2

    return clamp_score(docstring_points + nested_points + marker_points)


def grade_optimization_task(candidate_code: str, task: TaskSpec) -> TaskGrade:
    """Produce the overall grade for an optimization/refactor submission.

    The visible and hidden tests run first, then the runtime benchmark. If
    either step times out the grade is returned immediately with a score of
    0.0 and ``timed_out=True``. Otherwise the final score is the clamped
    weighted sum: 50% fraction of tests passed, 30% runtime score, 15% AST
    quality score and 5% style score.
    """

    suite = run_pytest_suite(
        candidate_code,
        [*task.visible_tests, *task.hidden_tests],
        timeout_s=task.benchmark_timeout_s,
    )
    if suite.total:
        test_fraction = suite.passed / suite.total
    else:
        # No tests collected: nothing can count as passed.
        test_fraction = 0.0

    def timed_out_grade(details: dict) -> TaskGrade:
        # Shared shape of the zero-score result for both timeout exits.
        return TaskGrade(
            score=0.0,
            tests_passed=suite.passed,
            tests_total=suite.total,
            timed_out=True,
            details=details,
        )

    if suite.timed_out:
        return timed_out_grade({"tests": suite.output})

    runtime_score, benchmark_timed_out, benchmark_output = benchmark_runtime(candidate_code, task)
    if benchmark_timed_out:
        return timed_out_grade({"tests": suite.output, "benchmark": benchmark_output})

    quality_score = ast_quality_score(candidate_code, task)
    pep8_score = style_score(candidate_code, task.style_max_line_length)

    weighted_total = (
        (0.5 * test_fraction)
        + (0.3 * runtime_score)
        + (0.15 * quality_score)
        + (0.05 * pep8_score)
    )
    final_score = clamp_score(weighted_total)

    report = {
        "tests": suite.output,
        "benchmark": benchmark_output,
        "test_fraction": round(test_fraction, 4),
        "runtime_score": round(runtime_score, 4),
        "style_score": round(pep8_score, 4),
    }
    return TaskGrade(
        score=final_score,
        syntax_score=1.0,
        tests_passed=suite.passed,
        tests_total=suite.total,
        quality_score=quality_score,
        runtime_score=runtime_score,
        details=report,
    )