File size: 2,988 Bytes
9daa0e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
"""
Bloom's Level: Apply
Apply a rule or formula from buried text to a new scenario.
"""
import logging
import os
import random
import re
import time
from typing import Any, Dict, List, Optional

from tqdm import tqdm

from src.generator import generate_text
from src.metrics import compute_accuracy
from src.utils import ensure_dir, save_jsonl, save_json

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Distractor sentences. run_apply samples from this pool (with replacement)
# to build the text that surrounds the rule.
FILLERS: List[str] = [
    "The museum houses artifacts from the ancient world.",
    "Coral reefs support diverse marine ecosystems.",
    "Railway gauges vary between countries.",
]

# Each entry is (rule sentence, test reading, expected action).
#
# run_apply scores an answer with a case-insensitive substring test against
# the expected action, so every expected action must be a fragment of the
# wording the rule itself uses. Otherwise a model that quotes the rule
# verbatim ("Turn on cooling.") is marked wrong. The first entry therefore
# expects "cooling" rather than "cooling on", which still accepts answers
# phrased as "cooling on" as well.
RULES = [
    ("If temperature is above 30C, turn on cooling.", "32", "cooling"),
    ("If stock price drops below $50, sell immediately.", "48", "sell"),
    ("If pH is below 7, add base solution.", "6.2", "add base"),
    ("If battery is below 20%, charge now.", "15", "charge now"),
]


def run_apply(
    model_name: str,
    num_sentences: int,
    num_examples: int,
    out_dir: str,
    depths: Optional[List[float]] = None,
    *,
    seed: Optional[int] = None,
) -> Dict[str, Any]:
    """Run the "apply" experiment and return its summary.

    For each depth, ``num_examples`` documents are built from
    ``num_sentences`` filler sentences drawn from ``FILLERS`` with one rule
    from ``RULES`` inserted at the relative position ``depth``. The model is
    asked to apply the rule to the rule's test reading; an answer counts as
    correct when it contains the expected action (case-insensitive substring
    match).

    Side effects: creates ``out_dir`` via ``ensure_dir`` and writes one
    ``apply_depth_<depth>.jsonl`` file per depth plus ``apply_summary.json``.

    Args:
        model_name: Forwarded to ``generate_text`` as ``model_name``.
        num_sentences: Number of filler sentences per document.
        num_examples: Number of documents generated per depth.
        out_dir: Directory that receives the prediction and summary files.
        depths: Relative insertion positions (0.0 = before the first filler,
            1.0 = after the last). Defaults to nine evenly spaced values
            from 0.0 to 1.0.
        seed: Optional seed for a private random generator so a run can be
            reproduced. When ``None`` the global ``random`` module state is
            used, exactly as before this parameter existed.

    Returns:
        The summary dict that is also saved to ``apply_summary.json``:
        experiment metadata, per-depth accuracy keyed by ``str(depth)``, and
        the elapsed time in minutes.
    """
    ensure_dir(out_dir)
    if depths is None:
        depths = [0.0, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0]

    # Fall back to the module itself (global state) when no seed is given so
    # callers that already call random.seed() keep their behaviour.
    rng = random if seed is None else random.Random(seed)

    accuracies = {}
    start = time.time()

    for depth in depths:
        depth_label = f"{depth:.1%}"
        logger.info("[APPLY] Depth %s", depth_label)
        preds = []
        progress = tqdm(
            range(num_examples), desc=f"Apply {depth_label}", leave=False
        )
        for _ in progress:
            # Draw order (fillers first, then the rule) is kept stable so
            # seeded runs stay comparable.
            sents = [rng.choice(FILLERS) for _ in range(num_sentences)]
            rule, test_value, expected = rng.choice(RULES)
            # depth 0.0 puts the rule first; depth 1.0 appends it at the end.
            idx = int(depth * len(sents))
            sents.insert(idx, f"Rule: {rule}")
            doc = " ".join(sents)
            prompt = (
                "Read the rules below and apply them.\n\n"
                f"{doc}\n\n"
                f"Scenario: The current reading is {test_value}. What should you do? "
                "Answer with only the action."
            )
            ans = generate_text(
                [{"role": "user", "content": prompt}],
                model_name=model_name,
                max_new_tokens=15,
            )
            correct = 1.0 if expected.lower() in ans.lower() else 0.0
            preds.append({
                "model_answer": ans,
                "correct": correct,
                "expected": expected,
                "depth": depth,
            })

        save_jsonl(os.path.join(out_dir, f"apply_depth_{depth}.jsonl"), preds)
        acc = compute_accuracy(preds)
        # Predictions are already on disk; only the accuracy is needed later.
        accuracies[depth] = acc
        logger.info("[APPLY] Depth %s: acc=%.3f", depth_label, acc)

    # Measure once so the saved summary and the log line report the same value.
    elapsed_minutes = (time.time() - start) / 60
    summary = {
        "experiment": "apply",
        "cognitive_level": "apply",
        "num_sentences": num_sentences,
        "num_examples": num_examples,
        "depths": {str(d): accuracies[d] for d in depths},
        "time_minutes": elapsed_minutes,
    }
    save_json(os.path.join(out_dir, "apply_summary.json"), summary)
    logger.info("[APPLY] Time=%.1f min", elapsed_minutes)
    return summary