| """ |
Bloom's Level: Analyze
Compare and contrast two statements buried at a chosen depth inside filler text.
| """ |
| import logging |
| import os |
| import random |
| import time |
| from typing import List, Dict, Any |
|
|
| from tqdm import tqdm |
|
|
| from src.generator import generate_text |
| from src.metrics import exact_match_score, compute_accuracy |
| from src.utils import ensure_dir, save_jsonl, save_json |
|
|
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(name=__name__)
|
|
# Distractor sentences used to pad each synthetic document; run_analyze draws
# from this pool with random.choice, so a sentence may repeat within a document.
FILLERS: List[str] = [
    "The museum houses artifacts from the ancient world.",
    "Coral reefs support diverse marine ecosystems.",
    "Railway gauges vary between countries.",
]
|
|
# Each entry is (statement A, statement B, expected comparison). The third
# field is the phrase whose words the model's answer is scored against.
COMPARE_PAIRS = [
    (
        "Solar power is renewable but intermittent.",
        "Coal is reliable but polluting.",
        "solar is cleaner",
    ),
    (
        "Online learning is flexible.",
        "Classroom learning is interactive.",
        "online is more flexible",
    ),
    (
        "Electric cars have zero emissions.",
        "Gas cars have longer range.",
        "electric has zero emissions",
    ),
]
|
|
|
|
# Function words that occur in the expected comparisons but carry no signal
# about which option the model chose. Substring-matching on them (e.g. "is"
# inside "this") would mark almost any English answer as correct.
_SCORING_STOPWORDS = frozenset({
    "a", "an", "and", "are", "but", "has", "have", "is", "less", "more",
    "of", "or", "than", "the", "to", "was",
})


def _bury_statements(
    fillers: List[str],
    depth: float,
    stmt_a: str,
    stmt_b: str,
    gap: int = 10,
) -> str:
    """Return a single document with both statements inserted among ``fillers``.

    Statement A goes at ``int(depth * len(fillers))``. Statement B goes ``gap``
    slots later, wrapping modulo the filler count, so B can land before A when
    the offset wraps. With no fillers the document is just A followed by B.
    """
    sents = list(fillers)
    filler_count = len(sents)

    idx_a = int(depth * filler_count)
    sents.insert(idx_a, f"Statement A: {stmt_a}")

    if filler_count:
        # Always < filler_count, hence a valid insert position in the
        # (now one longer) list; no separate append branch is needed.
        idx_b = (idx_a + gap) % filler_count
    else:
        # Avoid a modulo by zero: with no fillers, B simply follows A.
        idx_b = len(sents)
    sents.insert(idx_b, f"Statement B: {stmt_b}")

    return " ".join(sents)


def _score_answer(answer: str, expected_comparison: str) -> float:
    """Return 1.0 if ``answer`` mentions a content keyword of the expected comparison, else 0.0.

    This is a lenient heuristic: a case-insensitive substring match on any word
    of ``expected_comparison`` that is not a stopword.
    """
    tokens = expected_comparison.lower().split()
    # Fall back to every token if the expectation consists solely of stopwords,
    # so such an entry is not scored 0.0 unconditionally.
    keywords = [tok for tok in tokens if tok not in _SCORING_STOPWORDS] or tokens
    # Tolerate an empty or missing answer instead of raising on ``.lower()``.
    text = (answer or "").lower()
    return 1.0 if any(kw in text for kw in keywords) else 0.0


def run_analyze(
    model_name: str,
    num_sentences: int,
    num_examples: int,
    out_dir: str,
    depths: List[float] = None,
) -> Dict[str, Any]:
    """Run the "analyze" experiment: compare two statements buried in filler text.

    For every depth, ``num_examples`` documents are built from ``num_sentences``
    random filler sentences plus one randomly chosen pair from COMPARE_PAIRS.
    The model is asked which statement is the better option, and its answer is
    scored by keyword overlap with the expected comparison.

    Args:
        model_name: Passed through to ``generate_text`` as ``model_name``.
        num_sentences: Number of filler sentences per document; 0 yields a
            document containing only the two statements.
        num_examples: Number of documents generated per depth.
        out_dir: Output directory. It is handed to ``ensure_dir`` first
            (presumably created if missing -- confirm in src.utils). One
            ``analyze_depth_<depth>.jsonl`` file per depth and one
            ``analyze_summary.json`` file are saved under it.
        depths: Relative positions for Statement A, as fractions of the filler
            count. ``None`` selects nine evenly spaced depths from 0.0 to 1.0.

    Returns:
        The summary dict that is also saved to ``analyze_summary.json``:
        experiment metadata, accuracy per depth (keyed by ``str(depth)``) and
        the elapsed time in minutes.
    """
    ensure_dir(out_dir)
    if depths is None:
        depths = [0.0, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0]

    accuracies: Dict[str, Any] = {}
    # perf_counter is monotonic, so the duration cannot be skewed by
    # wall-clock adjustments during a long run.
    start = time.perf_counter()

    for depth in depths:
        depth_label = f"{depth:.1%}"
        logger.info(f"[ANALYZE] Depth {depth_label}")
        preds = []
        for _ in tqdm(range(num_examples), desc=f"Analyze {depth_label}", leave=False):
            # Fillers are drawn before the pair so that a seeded RNG consumes
            # random numbers in the same order as earlier runs.
            fillers = [random.choice(FILLERS) for _ in range(num_sentences)]
            stmt_a, stmt_b, expected_comparison = random.choice(COMPARE_PAIRS)

            doc = _bury_statements(fillers, depth, stmt_a, stmt_b)
            prompt = (
                f"Read the text and compare Statement A and Statement B.\n\n"
                f"{doc}\n\n"
                f"Which statement represents the better option? Explain in one sentence."
            )
            ans = generate_text(
                [{"role": "user", "content": prompt}],
                model_name=model_name,
                max_new_tokens=40,
            )

            preds.append({
                "model_answer": ans,
                "correct": _score_answer(ans, expected_comparison),
                "expected": expected_comparison,
                "depth": depth,
            })

        save_jsonl(os.path.join(out_dir, f"analyze_depth_{depth}.jsonl"), preds)
        acc = compute_accuracy(preds)
        # Only the accuracy is needed for the summary; the predictions are
        # already handed to save_jsonl, so they are not kept in memory.
        accuracies[str(depth)] = acc
        logger.info(f"[ANALYZE] Depth {depth_label}: acc={acc:.3f}")

    elapsed_minutes = (time.perf_counter() - start) / 60
    summary = {
        "experiment": "analyze",
        "cognitive_level": "analyze",
        "num_sentences": num_sentences,
        "num_examples": num_examples,
        "depths": accuracies,
        "time_minutes": elapsed_minutes,
    }
    save_json(os.path.join(out_dir, "analyze_summary.json"), summary)
    logger.info(f"[ANALYZE] Time={elapsed_minutes:.1f} min")
    return summary
|
|