""" Bloom's Level: Understand Require model to explain or summarize a key concept from buried text. """ import logging import os import random import time from typing import List, Dict, Any from tqdm import tqdm from src.generator import generate_text from src.metrics import exact_match_score, compute_accuracy from src.utils import ensure_dir, save_jsonl, save_json logger = logging.getLogger(__name__) FILLERS = [ "The museum houses artifacts from the ancient world.", "Coral reefs support diverse marine ecosystems.", "Railway gauges vary between countries.", "The periodic table organizes elements systematically.", "Cloud formation depends on atmospheric pressure.", ] CONCEPTS = [ ("photosynthesis", "the process by which plants convert light energy into chemical energy"), ("gravity", "the force that attracts objects toward each other"), ("mitosis", "cell division that produces two identical daughter cells"), ("photosynthesis", "plants using sunlight to make food"), ("evaporation", "water changing from liquid to gas"), ] def run_understand( model_name: str, num_sentences: int, num_examples: int, out_dir: str, depths: List[float] = None, ) -> Dict[str, Any]: ensure_dir(out_dir) if depths is None: depths = [0.0, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0] results = {} start = time.time() for depth in depths: logger.info(f"[UNDERSTAND] Depth {depth:.1%}") preds = [] for _ in tqdm(range(num_examples), desc=f"Understand {depth:.1%}", leave=False): sents = [random.choice(FILLERS) for _ in range(num_sentences)] concept, definition = random.choice(CONCEPTS) idx = int(depth * len(sents)) sents.insert(idx, f"{concept} is defined as: {definition}.") doc = " ".join(sents) prompt = ( f"Read the text and explain what {concept} means, using your own words.\n\n" f"{doc}\n\nExplanation:" ) ans = generate_text( [{"role": "user", "content": prompt}], model_name=model_name, max_new_tokens=50, ) # Check if key terms from definition appear in explanation key_terms = definition.split() match_count = sum(1 for term in key_terms[:5] if term.lower() in ans.lower()) correct = 1.0 if match_count >= 3 else 0.0 preds.append({ "model_answer": ans, "correct": correct, "concept": concept, "depth": depth, }) save_jsonl(os.path.join(out_dir, f"understand_depth_{depth}.jsonl"), preds) acc = compute_accuracy(preds) results[depth] = {"accuracy": acc, "predictions": preds} logger.info(f"[UNDERSTAND] Depth {depth:.1%}: acc={acc:.3f}") summary = { "experiment": "understand", "cognitive_level": "understand", "num_sentences": num_sentences, "num_examples": num_examples, "depths": {str(d): results[d]["accuracy"] for d in depths}, "time_minutes": (time.time() - start) / 60, } save_json(os.path.join(out_dir, "understand_summary.json"), summary) logger.info(f"[UNDERSTAND] Time={(time.time()-start)/60:.1f} min") return summary