abhshkp
/

cognitive-hierarchy-position-bias

Model card Files Files and versions

cognitive-hierarchy-position-bias / experiments /apply_task.py

abhshkp's picture

Upload folder using huggingface_hub

9daa0e5 verified 2 days ago

history blame contribute delete

2.99 kB

	"""
	Bloom's Level: Apply
	Apply a rule or formula from buried text to a new scenario.
	"""
	import logging
	import os
	import random
	import re
	import time
	from typing import Any, Dict, List, Optional

	from tqdm import tqdm

	from src.generator import generate_text
	from src.metrics import compute_accuracy
	from src.utils import ensure_dir, save_jsonl, save_json

	# Module-scoped logger named after this module.
	logger = logging.getLogger(__name__)

	# Distractor sentences with no bearing on any rule; run_apply samples from
	# these (with replacement) to pad the document around the rule sentence.
	FILLERS: List[str] = [
	    "The museum houses artifacts from the ancient world.",
	    "Coral reefs support diverse marine ecosystems.",
	    "Railway gauges vary between countries.",
	]

	# One entry per task, laid out as:
	#   (rule sentence, reading shown in the scenario, expected action)
	# The expected action is matched as a case-insensitive substring of the
	# model's answer. Every reading below satisfies its rule's condition, so
	# the rule's action is always the expected answer.
	RULES = [
	    (
	        "If temperature is above 30C, turn on cooling.",
	        "32",
	        "cooling on",
	    ),
	    (
	        "If stock price drops below $50, sell immediately.",
	        "48",
	        "sell",
	    ),
	    (
	        "If pH is below 7, add base solution.",
	        "6.2",
	        "add base",
	    ),
	    (
	        "If battery is below 20%, charge now.",
	        "15",
	        "charge now",
	    ),
	]


	def run_apply(
	    model_name: str,
	    num_sentences: int,
	    num_examples: int,
	    out_dir: str,
	    depths: Optional[List[float]] = None,
	    seed: Optional[int] = None,
	) -> Dict[str, Any]:
	    """Run the Apply-level experiment across rule insertion depths.

	    For each depth, ``num_examples`` documents are built from
	    ``num_sentences`` randomly chosen filler sentences with one randomly
	    chosen rule inserted at the relative position ``depth``. The model is
	    asked to apply the rule to a scenario reading; an answer is scored
	    correct when it contains the expected action (case-insensitive
	    substring match).

	    Args:
	        model_name: Forwarded to ``generate_text`` as ``model_name``.
	        num_sentences: Number of filler sentences per document.
	        num_examples: Number of prompts generated per depth.
	        out_dir: Directory that receives one ``apply_depth_<depth>.jsonl``
	            file per depth plus ``apply_summary.json``; it is passed to
	            ``ensure_dir`` before anything is written.
	        depths: Relative insertion positions (0.0 = before the first
	            sentence, 1.0 = after the last). Defaults to nine evenly
	            spaced values from 0.0 to 1.0.
	        seed: When given, all sampling uses a private
	            ``random.Random(seed)`` so a run can be reproduced. When
	            ``None`` (the default), the module-level ``random`` functions
	            are used, exactly as before this parameter existed.

	    Returns:
	        The summary dict that is also written to ``apply_summary.json``:
	        experiment metadata, per-depth accuracy keyed by ``str(depth)``,
	        and the elapsed time in minutes.
	    """
	    ensure_dir(out_dir)
	    if depths is None:
	        depths = [0.0, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0]

	    # A private generator keeps seeded runs independent of any other code
	    # that touches the global random state; without a seed we fall back to
	    # the module-level functions so existing callers see no change.
	    rng = random.Random(seed) if seed is not None else random

	    # Only the accuracy is needed after each depth has been saved, so the
	    # per-example predictions are not retained across depths.
	    accuracy_by_depth = {}
	    # perf_counter is unaffected by wall-clock adjustments.
	    start = time.perf_counter()

	    for depth in depths:
	        depth_label = f"{depth:.1%}"
	        logger.info("[APPLY] Depth %s", depth_label)
	        preds = []
	        progress = tqdm(
	            range(num_examples),
	            desc=f"Apply {depth_label}",
	            leave=False,
	        )
	        for _ in progress:
	            # Draw order (fillers first, then the rule) is preserved so
	            # that callers who seed the global generator themselves keep
	            # getting the same sample sequence.
	            sents = [rng.choice(FILLERS) for _ in range(num_sentences)]
	            rule, test_value, expected = rng.choice(RULES)
	            # int() truncates; depth 1.0 yields len(sents), i.e. append.
	            idx = int(depth * len(sents))
	            sents.insert(idx, f"Rule: {rule}")
	            doc = " ".join(sents)
	            prompt = (
	                f"Read the rules below and apply them.\n\n"
	                f"{doc}\n\n"
	                f"Scenario: The current reading is {test_value}. What should you do? "
	                f"Answer with only the action."
	            )
	            ans = generate_text(
	                [{"role": "user", "content": prompt}],
	                model_name=model_name,
	                max_new_tokens=15,
	            )
	            correct = 1.0 if expected.lower() in ans.lower() else 0.0
	            preds.append({
	                "model_answer": ans,
	                "correct": correct,
	                "expected": expected,
	                "depth": depth,
	            })

	        save_jsonl(os.path.join(out_dir, f"apply_depth_{depth}.jsonl"), preds)
	        acc = compute_accuracy(preds)
	        accuracy_by_depth[depth] = acc
	        logger.info("[APPLY] Depth %s: acc=%.3f", depth_label, acc)

	    # Measure once so the summary and the log line report the same value.
	    elapsed_minutes = (time.perf_counter() - start) / 60
	    summary = {
	        "experiment": "apply",
	        "cognitive_level": "apply",
	        "num_sentences": num_sentences,
	        "num_examples": num_examples,
	        "depths": {str(d): accuracy_by_depth[d] for d in depths},
	        "time_minutes": elapsed_minutes,
	    }
	    save_json(os.path.join(out_dir, "apply_summary.json"), summary)
	    logger.info("[APPLY] Time=%.1f min", elapsed_minutes)
	    return summary