""" Temporal Order Experiment Tests position bias when information has inherent chronological ordering: Chronological, Reverse-chronological, Scrambled. """ import logging import os import random import re import time from typing import List, Dict, Any from tqdm import tqdm from src.generator import generate_text from src.utils import ensure_dir, save_jsonl, save_json logger = logging.getLogger(__name__) EVENTS = [ "the king issued a decree", "a comet appeared in the sky", "the bridge was completed", "a treaty was signed", "the harvest festival began", "a stranger arrived at the gates", "the library burned down", "a new star was discovered", "the river flooded the town", "the army marched north", "a peace envoy was sent", "the market was opened", "a plague swept the city", "the old temple was restored", "a fleet set sail", "the academy admitted its first students", "a rebellion broke out", "the queen gave birth to twins", "a dragon was spotted", "the great bell tolled", ] def _make_timeline_chronological(n: int, target: str, target_year: int) -> str: """Events in chronological order with target embedded.""" events = random.sample(EVENTS, min(n - 1, len(EVENTS))) while len(events) < n - 1: events.append(f"the people gathered for a ceremony") events.append(target) random.shuffle(events) # Will be sorted by year # Insert target at target_year position return "\n".join(f"Year {1000+i}: {e}." for i, e in enumerate(events)) def _make_timeline_reverse(n: int, target: str, target_pos: float) -> str: events = random.sample(EVENTS, min(n - 1, len(EVENTS))) while len(events) < n - 1: events.append(f"the people gathered for a ceremony") idx = int(target_pos * len(events)) events.insert(idx, target) # Reverse chronological: Year 2000 -> Year 1000 return "\n".join(f"Year {2000-i}: {e}." for i, e in enumerate(events)) def _make_timeline_scrambled(n: int, target: str, target_pos: float) -> str: events = random.sample(EVENTS, min(n - 1, len(EVENTS))) while len(events) < n - 1: events.append(f"the people gathered for a ceremony") idx = int(target_pos * len(events)) events.insert(idx, target) random.shuffle(events) years = random.sample(range(1000, 2000), len(events)) return "\n".join(f"Year {y}: {e}." for y, e in zip(years, events)) def _run_timeline_ordering( model_name: str, num_events: int, num_examples: int, out_dir: str, order_type: str, target_year: int = None, depths: List[float] = None, ) -> Dict[str, Any]: ensure_dir(out_dir) if depths is None: depths = [0.0, 0.25, 0.5, 0.75, 1.0] results = {} start = time.time() for depth in depths: logger.info(f"[{order_type.upper()}] Depth {depth:.1%}") preds = [] for i in tqdm(range(num_examples), desc=f"{order_type} {depth:.1%}", leave=False): target = "a golden statue was unveiled in the central square" if order_type == "chronological": timeline = _make_timeline_chronological(num_events, target, 1000 + int(depth * num_events)) expected = 1000 + int(depth * num_events) elif order_type == "reverse": timeline = _make_timeline_reverse(num_events, target, depth) expected = 2000 - int(depth * num_events) elif order_type == "scrambled": timeline = _make_timeline_scrambled(num_events, target, depth) expected = None # No clear expected year else: raise ValueError(f"Unknown order_type: {order_type}") prompt = ( f"Read the following timeline of events.\n\n{timeline}\n\n" f"Question: In which year was a golden statue unveiled in the central square? " f"Answer with only the year number." ) ans = generate_text( [{"role": "user", "content": prompt}], model_name=model_name, max_new_tokens=15, ) years = re.findall(r"\b\d{4}\b", ans) if expected is not None: correct = 1.0 if any(abs(int(y) - expected) < 5 for y in years) else 0.0 else: # For scrambled, check if any year mentioned is reasonable correct = 1.0 if years else 0.0 preds.append({ "model_answer": ans, "correct": correct, "expected_year": expected, "depth": depth, "order_type": order_type, }) save_jsonl(os.path.join(out_dir, f"{order_type}_depth_{depth}.jsonl"), preds) acc = sum(p["correct"] for p in preds) / len(preds) if preds else 0.0 results[depth] = {"accuracy": acc, "predictions": preds} logger.info(f"[{order_type.upper()}] Depth {depth:.1%}: acc={acc:.3f}") summary = { "experiment": f"temporal_{order_type}", "num_events": num_events, "num_examples": num_examples, "order_type": order_type, "depths": {str(d): results[d]["accuracy"] for d in depths}, "time_minutes": (time.time() - start) / 60, } save_json(os.path.join(out_dir, f"{order_type}_summary.json"), summary) return summary def run_all_temporal( model_name: str, num_events: int, num_examples: int, out_dir: str, ) -> Dict[str, Any]: """Run all three temporal ordering conditions.""" ensure_dir(out_dir) all_results = {} for order in ["chronological", "reverse", "scrambled"]: logger.info(f"\n--- Temporal Ordering: {order.upper()} ---") all_results[order] = _run_timeline_ordering( model_name, num_events, num_examples, os.path.join(out_dir, order), order, ) save_json(os.path.join(out_dir, "temporal_master_summary.json"), all_results) # Compare logger.info("\n--- Temporal Ordering Comparison ---") for order, res in all_results.items(): depths = list(res["depths"].keys()) accs = list(res["depths"].values()) if len(accs) >= 3: mid_idx = len(accs) // 2 pbi = (accs[0] + accs[-1]) / 2 - accs[mid_idx] logger.info(f" {order:15s} PBI={pbi:+.3f}") return all_results