"""
Rubric-based evaluation following the "Rubrics as Rewards" paper.

Implements RaR-Explicit: a weighted sum of individual criterion scores (Equation 1).
"""

from typing import List, Optional

import litellm
from pydantic import BaseModel


class CriterionCheck(BaseModel):
    """Result of checking a single rubric criterion."""

    title: str
    description: str
    weight: int
    satisfied: bool
    reasoning: Optional[str] = None


class RubricEvaluation(BaseModel):
    """Complete rubric-based evaluation result."""

    criterion_checks: List[CriterionCheck]
    raw_score: float  # Unnormalized score
    normalized_score: float  # Score normalized to [0, 1]


CRITERION_PROMPT = """You are evaluating whether a response satisfies a specific evaluation criterion.

Question: {question}

Response to evaluate: {response}

Evaluation Criterion:
{criterion_description}

Your task: Determine if the response satisfies this criterion.

Output a JSON object with:
- "satisfied": true or false
- "reasoning": Brief explanation (1-2 sentences) of why it does or doesn't satisfy the criterion

Be strict but fair. The criterion must be clearly satisfied for you to answer true."""


class RubricData(BaseModel):
    """Rubric data loaded from file."""

    title: str
    description: str
    weight: int
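

# NOTE: this module does not define the on-disk rubric format; the helper below is
# an illustrative sketch that assumes a JSON file holding a list of objects with
# "title", "description", and "weight" keys. Adapt it to the actual rubric format.
def load_rubrics(path: str) -> List[RubricData]:
    """Load rubric criteria from a JSON file (assumed format, see note above)."""
    import json  # local import keeps this illustrative helper self-contained

    with open(path, "r", encoding="utf-8") as f:
        items = json.load(f)
    return [RubricData(**item) for item in items]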


def check_criterion(
    question: str, response: str, criterion: RubricData, model: str = "gpt-4o-mini"
) -> CriterionCheck:
    """
    Check if response satisfies a single criterion.

    Args:
        question: The question being answered
        response: The response to evaluate
        criterion: The rubric criterion to check
        model: LLM model for judging

    Returns:
        CriterionCheck with satisfaction result
    """
    prompt = CRITERION_PROMPT.format(
        question=question,
        response=response,
        criterion_description=criterion.description,
    )

    llm_response = litellm.completion(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "You are an expert evaluator for rubric-based assessment.",
            },
            {"role": "user", "content": prompt},
        ],
        temperature=0.0,
        response_format=CriterionCheck,
    )

    parsed = CriterionCheck.model_validate_json(llm_response.choices[0].message.content)

    # Keep the criterion's own metadata; the judge only decides satisfied/reasoning,
    # so the weight used for scoring always matches the rubric definition rather than
    # whatever the model echoed into the structured output.
    return CriterionCheck(
        title=criterion.title,
        description=criterion.description,
        weight=criterion.weight,
        satisfied=parsed.satisfied,
        reasoning=parsed.reasoning,
    )


def evaluate_with_rubrics(
    question: str,
    response: str,
    rubrics: List[RubricData],
    model: str = "gpt-5-nano",
) -> RubricEvaluation:
    """
    Evaluate response using RaR-Explicit method (weighted sum).

    Implements Equation 1 from the paper:
        r(x, ŷ) = Σ_j w_j · c_j(x, ŷ) / Σ_j w_j
    Here the denominator sums only the positive weights, so negative-weight
    pitfalls can subtract from the score without inflating the maximum.

    Args:
        question: The question being answered
        response: The response to evaluate
        rubrics: List of rubric criteria to check
        model: LLM model for judging

    Returns:
        RubricEvaluation with normalized score
    """
    # Check each criterion independently
    checks = []
    for rubric in rubrics:
        check = check_criterion(question, response, rubric, model)
        checks.append(check)

    # Calculate weighted score (Equation 1)
    # Only positive weights contribute to denominator
    positive_weights = sum(r.weight for r in rubrics if r.weight > 0)

    raw_score = 0.0
    for check in checks:
        if check.satisfied:
            raw_score += check.weight

    # Normalize to [0, 1]
    normalized_score = raw_score / positive_weights if positive_weights > 0 else 0.0
    # Clip to [0, 1] in case pitfalls make it negative
    normalized_score = max(0.0, min(1.0, normalized_score))
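    # Worked example (illustrative): with weights [3, 2, -1] and the first and third
    # criteria satisfied, raw_score = 3 - 1 = 2 and positive_weights = 5, so
    # normalized_score = 0.4.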

    return RubricEvaluation(
        raw_score=raw_score,
        normalized_score=normalized_score,
        criterion_checks=checks,
    )
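

if __name__ == "__main__":
    # Minimal usage sketch with hypothetical rubrics; the question, response, and
    # criteria below are placeholders, and running this makes real LLM calls via
    # litellm (an API key for the chosen model must be configured).
    example_rubrics = [
        RubricData(
            title="Mentions dehydration",
            description="Essential: identifies dehydration as a likely cause.",
            weight=5,
        ),
        RubricData(
            title="Recommends unverified medication",
            description="Pitfall: recommends a specific prescription medication.",
            weight=-2,
        ),
    ]
    evaluation = evaluate_with_rubrics(
        question="A patient reports dizziness after a long run. What is a likely cause?",
        response="Dizziness after prolonged exercise is often caused by dehydration.",
        rubrics=example_rubrics,
    )
    print(f"Normalized score: {evaluation.normalized_score:.2f}")
    for check in evaluation.criterion_checks:
        print(f"- {check.title}: {'satisfied' if check.satisfied else 'not satisfied'}")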