"""Optimization task grader."""

from __future__ import annotations

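# Relative imports apply when this module is used as part of the package; the
# fallback covers running it directly as a script.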
try:
    from ..models import TaskGrade
    from ..tasks.catalog import ReviewTask
except ImportError:
    from models import TaskGrade
    from tasks.catalog import ReviewTask

from .shared import (
    base_grade,
    benchmark_candidate,
    compile_code,
    component_score,
    execute_cases,
    quality_metrics,
    shaped_score,
    similarity_score,
    summarize_results,
)


def grade_optimization_task(
    task: ReviewTask,
    code: str,
    *,
    include_hidden: bool,
    timeout_s: float = 3.0,
) -> TaskGrade:
    """Grade an optimization/refactor task with correctness, quality, and runtime."""

    compiled, compile_error = compile_code(code)
    quality = quality_metrics(code, task.function_name)
    details = {
        "compile_error": compile_error,
        "quality_notes": quality["quality_notes"],
        "style_score": quality["style_score"],
        "visibility": "full" if include_hidden else "public",
    }

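    # Non-compiling code earns only a small, similarity-based credit against the reference solution.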
    if not compiled:
        progress = 0.02 + 0.1 * similarity_score(code, task.reference_code)
        details["test_results"] = []
        details["test_summary"] = "Code does not compile."
        return base_grade(
            score=shaped_score(progress),
            syntax_score=component_score(0.01),
            tests_passed=0,
            tests_total=len(task.public_cases) + (len(task.hidden_cases) if include_hidden else 0),
            quality_score=component_score(0.01),
            runtime_score=component_score(0.01),
            timed_out=False,
            details=details,
        )

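    # Hidden cases are exercised only when grading with full visibility.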
    cases = task.public_cases + (task.hidden_cases if include_hidden else [])
    result = execute_cases(code, task.function_name, cases, timeout_s=timeout_s)
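    # Timeouts and harness errors short-circuit grading with reduced, quality-weighted credit.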
    if result.get("timed_out"):
        details["test_results"] = []
        details["test_summary"] = result["error"]
        progress = 0.1 + 0.18 * quality["score"]
        return base_grade(
            score=shaped_score(progress),
            syntax_score=component_score(0.95),
            tests_passed=0,
            tests_total=len(cases),
            quality_score=quality["score"],
            runtime_score=component_score(0.01),
            timed_out=True,
            details=details,
        )
    if "error" in result:
        details["test_results"] = []
        details["test_summary"] = result["error"]
        progress = 0.1 + 0.2 * quality["score"]
        return base_grade(
            score=shaped_score(progress),
            syntax_score=component_score(0.95),
            tests_passed=0,
            tests_total=len(cases),
            quality_score=quality["score"],
            runtime_score=component_score(0.01),
            timed_out=False,
            details=details,
        )

    data = result["data"]
    pass_rate = data["passed"] / max(data["total"], 1)
    runtime_score = component_score(0.01)
    benchmark_summary = "Benchmark deferred until hidden evaluation."
    timed_out = False

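    # Benchmark runtime only when every test passes during the hidden evaluation.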
    if include_hidden and pass_rate == 1.0:
        benchmark = benchmark_candidate(task, code, timeout_s=timeout_s)
        runtime_score = benchmark["runtime_score"]
        timed_out = benchmark.get("timed_out", False)
        benchmark_summary = benchmark["details"]
        if timed_out:
            runtime_score = component_score(0.01)

    details["test_results"] = data["results"]
    details["test_summary"] = summarize_results("Test results", data["results"])
    details["benchmark"] = benchmark_summary

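    # Runtime contributes to progress only once the benchmark has actually run.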
    benchmark_deferred = benchmark_summary == "Benchmark deferred until hidden evaluation."
    runtime_progress = 0.0 if benchmark_deferred else runtime_score
    if include_hidden:
        progress = min(1.0, 0.05 + 0.6 * pass_rate + 0.2 * quality["score"] + 0.15 * runtime_progress)
    else:
        progress = min(1.0, 0.05 + 0.7 * pass_rate + 0.25 * quality["score"])

    return base_grade(
        score=shaped_score(progress),
        syntax_score=component_score(0.95),
        tests_passed=data["passed"],
        tests_total=data["total"],
        quality_score=quality["score"],
        runtime_score=runtime_score,
        timed_out=timed_out,
        details=details,
    )