"""
Rubric-based evaluation following the "Rubrics as Rewards" paper.

Implements RaR-Explicit: a weighted sum of individual criterion scores (Equation 1).
"""

from typing import List, Optional

import litellm
from pydantic import BaseModel


class CriterionCheck(BaseModel):
    """Result of checking a single rubric criterion."""

    title: str
    description: str
    weight: int
    satisfied: bool
    reasoning: Optional[str] = None


class RubricEvaluation(BaseModel):
    """Complete rubric-based evaluation result."""

    criterion_checks: List[CriterionCheck]
    raw_score: float  # Unnormalized score
    normalized_score: float  # Score normalized to [0, 1]


CRITERION_PROMPT = """You are evaluating whether a response satisfies a specific evaluation criterion.

Question: {question}

Response to evaluate: {response}

Evaluation Criterion:
{criterion_description}

Your task: Determine if the response satisfies this criterion.

Output a JSON object with:
- "satisfied": true or false
- "reasoning": Brief explanation (1-2 sentences) of why it does or doesn't satisfy the criterion

Be strict but fair. The criterion must be clearly satisfied for you to answer true."""


class RubricData(BaseModel):
    """Rubric data loaded from file."""

    title: str
    description: str
    weight: int
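

# NOTE: this module does not define the on-disk rubric format; the helper below is
# an illustrative sketch that assumes a JSON file holding a list of objects with
# "title", "description", and "weight" keys. Adapt it to the actual rubric format.
def load_rubrics(path: str) -> List[RubricData]:
    """Load rubric criteria from a JSON file (assumed format, see note above)."""
    import json  # local import keeps this illustrative helper self-contained

    with open(path, "r", encoding="utf-8") as f:
        items = json.load(f)
    return [RubricData(**item) for item in items]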


def check_criterion(
    question: str, response: str, criterion: RubricData, model: str = "gpt-4o-mini"
) -> CriterionCheck:
    """
    Check if response satisfies a single criterion.

    Args:
        question: The question being answered
        response: The response to evaluate
        criterion: The rubric criterion to check
        model: LLM model for judging

    Returns:
        CriterionCheck with satisfaction result
    """
    prompt = CRITERION_PROMPT.format(
        question=question,
        response=response,
        criterion_description=criterion.description,
    )

    llm_response = litellm.completion(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "You are an expert evaluator for rubric-based assessment.",
            },
            {"role": "user", "content": prompt},
        ],
        temperature=0.0,
        response_format=CriterionCheck,
    )

    parsed = CriterionCheck.model_validate_json(llm_response.choices[0].message.content)

    # Keep the criterion's own metadata; the judge only decides satisfied/reasoning,
    # so the weight used for scoring always matches the rubric definition rather than
    # whatever the model echoed into the structured output.
    return CriterionCheck(
        title=criterion.title,
        description=criterion.description,
        weight=criterion.weight,
        satisfied=parsed.satisfied,
        reasoning=parsed.reasoning,
    )


def evaluate_with_rubrics(
    question: str,
    response: str,
    rubrics: List[RubricData],
    model: str = "gpt-5-nano",
) -> RubricEvaluation:
    """
    Evaluate response using RaR-Explicit method (weighted sum).

    Implements Equation 1 from the paper:
        r(x, ŷ) = Σ_j w_j · c_j(x, ŷ) / Σ_j w_j
    Here the denominator sums only the positive weights, so negative-weight
    pitfalls can subtract from the score without inflating the maximum.

    Args:
        question: The question being answered
        response: The response to evaluate
        rubrics: List of rubric criteria to check
        model: LLM model for judging

    Returns:
        RubricEvaluation with normalized score
    """
    # Check each criterion independently
    checks = []
    for rubric in rubrics:
        check = check_criterion(question, response, rubric, model)
        checks.append(check)

    # Calculate weighted score (Equation 1)
    # Only positive weights contribute to denominator
    positive_weights = sum(r.weight for r in rubrics if r.weight > 0)

    raw_score = 0.0
    for check in checks:
        if check.satisfied:
            raw_score += check.weight

    # Normalize to [0, 1]
    normalized_score = raw_score / positive_weights if positive_weights > 0 else 0.0
    # Clip to [0, 1] in case pitfalls make it negative
    normalized_score = max(0.0, min(1.0, normalized_score))
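    # Worked example (illustrative): with weights [3, 2, -1] and the first and third
    # criteria satisfied, raw_score = 3 - 1 = 2 and positive_weights = 5, so
    # normalized_score = 0.4.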

    return RubricEvaluation(
        raw_score=raw_score,
        normalized_score=normalized_score,
        criterion_checks=checks,
    )
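

if __name__ == "__main__":
    # Minimal usage sketch with hypothetical rubrics; the question, response, and
    # criteria below are placeholders, and running this makes real LLM calls via
    # litellm (an API key for the chosen model must be configured).
    example_rubrics = [
        RubricData(
            title="Mentions dehydration",
            description="Essential: identifies dehydration as a likely cause.",
            weight=5,
        ),
        RubricData(
            title="Recommends unverified medication",
            description="Pitfall: recommends a specific prescription medication.",
            weight=-2,
        ),
    ]
    evaluation = evaluate_with_rubrics(
        question="A patient reports dizziness after a long run. What is a likely cause?",
        response="Dizziness after prolonged exercise is often caused by dehydration.",
        rubrics=example_rubrics,
    )
    print(f"Normalized score: {evaluation.normalized_score:.2f}")
    for check in evaluation.criterion_checks:
        print(f"- {check.title}: {'satisfied' if check.satisfied else 'not satisfied'}")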