"""
Bloom's Level: Evaluate
Assess or judge two options against evaluation criteria buried in distractor text.
"""
import logging
import os
import random
import time
from typing import List, Dict, Any, Optional

from tqdm import tqdm

from src.generator import generate_text
from src.metrics import exact_match_score, compute_accuracy
from src.utils import ensure_dir, save_jsonl, save_json

logger = logging.getLogger(__name__)
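
# Distractor sentences used to pad the document around the buried criteria and options.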
FILLERS = [
    "The museum houses artifacts from the ancient world.",
    "Coral reefs support diverse marine ecosystems.",
    "Railway gauges vary between countries.",
]

# Each tuple is (option A text, option B text, evaluation criteria, expected answer).
# The expected answer uses the "Option A"/"Option B" labels that the prompt asks
# the model to produce, so the substring check in run_evaluate can score responses.
EVALUATIONS = [
    ("Candidate X has 5 years experience and a strong portfolio.", "Candidate Y has 3 years but excellent references.", "X is more experienced", "Option A"),
    ("Plan A costs $10M with high risk.", "Plan B costs $8M with moderate risk.", "B has lower cost", "Option B"),
    ("Option 1 scores 85 on quality but 60 on price.", "Option 2 scores 70 on quality but 90 on price.", "Option 2 has better price", "Option B"),
]


def run_evaluate(
    model_name: str,
    num_sentences: int,
    num_examples: int,
    out_dir: str,
    depths: Optional[List[float]] = None,
) -> Dict[str, Any]:
    """Run the Bloom's 'Evaluate'-level experiment.

    For each relative depth, builds documents of filler sentences with the
    evaluation criteria buried at that position, asks the model to choose
    the better of two options, and records per-depth accuracy.
    """
    ensure_dir(out_dir)
    if depths is None:
        depths = [0.0, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0]
    results = {}
    start = time.time()

    for depth in depths:
        logger.info(f"[EVALUATE] Depth {depth:.1%}")
        preds = []
        for _ in tqdm(range(num_examples), desc=f"Evaluate {depth:.1%}", leave=False):
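            # Build a document of filler sentences with the evaluation criteria
            # buried at the target relative depth, framed by the two options.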
            sents = [random.choice(FILLERS) for _ in range(num_sentences)]
            option_a, option_b, criteria, expected = random.choice(EVALUATIONS)
            idx = int(depth * len(sents))
            sents.insert(idx, f"Evaluation criteria: {criteria}.")
            sents.insert(0, f"Option A: {option_a}")
            sents.append(f"Option B: {option_b}")
            doc = " ".join(sents)
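            # Prompt the model to choose between the two options.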
            prompt = (
                f"Read the options and evaluation criteria, then choose the better option.\n\n"
                f"{doc}\n\n"
                f"Which option is better? Answer with 'Option A' or 'Option B'."
            )
            ans = generate_text(
                [{"role": "user", "content": prompt}],
                model_name=model_name,
                max_new_tokens=15,
            )
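            # Score by substring match against the expected "Option A"/"Option B" label.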
            correct = 1.0 if expected.lower() in ans.lower() else 0.0
            preds.append({
                "model_answer": ans,
                "correct": correct,
                "expected": expected,
                "depth": depth,
            })

        save_jsonl(os.path.join(out_dir, f"evaluate_depth_{depth}.jsonl"), preds)
        acc = compute_accuracy(preds)
        results[depth] = {"accuracy": acc, "predictions": preds}
        logger.info(f"[EVALUATE] Depth {depth:.1%}: acc={acc:.3f}")

    summary = {
        "experiment": "evaluate",
        "cognitive_level": "evaluate",
        "num_sentences": num_sentences,
        "num_examples": num_examples,
        "depths": {str(d): results[d]["accuracy"] for d in depths},
        "time_minutes": (time.time() - start) / 60,
    }
    save_json(os.path.join(out_dir, "evaluate_summary.json"), summary)
    logger.info(f"[EVALUATE] Time={(time.time()-start)/60:.1f} min")
    return summary
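

# Example invocation for a quick smoke test. The model name and output directory
# are placeholders; substitute whatever identifiers src.generator.generate_text
# and your setup expect.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    result = run_evaluate(
        model_name="<your-model-name>",    # placeholder
        num_sentences=40,
        num_examples=10,
        out_dir="results/evaluate_smoke",  # example path
        depths=[0.0, 0.5, 1.0],
    )
    print(result["depths"])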