abhshkp commited on
Commit
e1e1ce9
·
verified ·
1 Parent(s): 629c011

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -1,7 +1,3 @@
1
- ---
2
- tags:
3
- - ml-intern
4
- ---
5
  # Lost in the Middle — Benchmark Suite v4
6
 
7
  A modular, reproducible benchmark suite for evaluating **position bias** in long-context language models, extending the original Liu et al. (2023) experiments.
@@ -101,23 +97,3 @@ To add a new experiment:
101
  url={https://huggingface.co/abhshkp/litm-benchmark-suite-v4}
102
  }
103
  ```
104
-
105
- <!-- ml-intern-provenance -->
106
- ## Generated by ML Intern
107
-
108
- This model repository was generated by [ML Intern](https://github.com/huggingface/ml-intern), an agent for machine learning research and development on the Hugging Face Hub.
109
-
110
- - Try ML Intern: https://smolagents-ml-intern.hf.space
111
- - Source code: https://github.com/huggingface/ml-intern
112
-
113
- ## Usage
114
-
115
- ```python
116
- from transformers import AutoModelForCausalLM, AutoTokenizer
117
-
118
- model_id = "abhshkp/litm-benchmark-suite-v4"
119
- tokenizer = AutoTokenizer.from_pretrained(model_id)
120
- model = AutoModelForCausalLM.from_pretrained(model_id)
121
- ```
122
-
123
- For non-causal architectures, replace `AutoModelForCausalLM` with the appropriate `AutoModel` class.
 
 
 
 
 
1
  # Lost in the Middle — Benchmark Suite v4
2
 
3
  A modular, reproducible benchmark suite for evaluating **position bias** in long-context language models, extending the original Liu et al. (2023) experiments.
 
97
  url={https://huggingface.co/abhshkp/litm-benchmark-suite-v4}
98
  }
99
  ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
experiments/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Experiment modules for LITM Benchmark Suite v4."""
experiments/conversation_memory.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Experiment 7: Conversation Memory
3
+ Critical instruction buried in long chat history.
4
+ """
5
+ import logging
6
+ import os
7
+ import random
8
+ import time
9
+ from typing import List, Dict, Any
10
+
11
+ from tqdm import tqdm
12
+
13
+ from src.generator import generate_text
14
+ from src.metrics import exact_match_score, compute_accuracy, position_bias_index
15
+ from src.plotting import plot_curve
16
+ from src.utils import ensure_dir, save_jsonl, save_json
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
# Generic small-talk user turns used as filler around the buried instruction.
USER_MSGS = [
    "Hello, how are you?",
    "What is the weather like today?",
    "Tell me about quantum physics.",
    "Can you recommend a good book?",
    "What are the health benefits of green tea?",
    "Explain how airplanes fly.",
    "What is the history of the internet?",
    "How do I bake sourdough bread?",
    "What are the best hiking trails in Europe?",
    "Explain neural networks simply.",
    "What is blockchain technology?",
    "How does photosynthesis work?",
    "Tell me a joke.",
    "What is the theory of relativity?",
    "How do vaccines work?",
    "What causes earthquakes?",
    "Explain the water cycle.",
    "What is artificial intelligence?",
    "How do I learn a new language?",
    "What are black holes?",
]

# Matching filler replies; sampled independently of USER_MSGS, so turn pairs
# are not guaranteed to be topically aligned (intentional noise).
ASSISTANT_MSGS = [
    "I'm doing well, thank you!",
    "The weather varies by location and season.",
    "Quantum physics studies matter at the smallest scales.",
    "I recommend 'Sapiens' by Yuval Noah Harari.",
    "Green tea contains antioxidants that may boost metabolism.",
    "Airplanes fly due to lift generated by their wings.",
    "The internet evolved from ARPANET in the 1960s.",
    "Sourdough requires flour, water, salt, and a starter culture.",
    "The Tour du Mont Blanc is a spectacular alpine trail.",
    "Neural networks learn patterns from data through layers.",
    "Blockchain is a decentralized digital ledger.",
    "Plants convert sunlight into chemical energy.",
    "Why don't scientists trust atoms? Because they make up everything!",
    "Relativity describes how space and time are interconnected.",
    "Vaccines train the immune system to recognize pathogens.",
    "Earthquakes occur when tectonic plates shift.",
    "Water evaporates, condenses, and precipitates in a cycle.",
    "AI enables machines to perform tasks requiring human intelligence.",
    "Practice daily, immerse yourself, and use spaced repetition.",
    "Black holes have gravitational fields so strong nothing escapes.",
]
65
+
66
+
67
def _make_conversation(num_turns: int, instruction: str, ratio: float) -> str:
    """Build a synthetic chat transcript with *instruction* buried inside it.

    Args:
        num_turns: Number of filler user/assistant turn pairs.
        instruction: The critical user message to bury.
        ratio: Relative depth in [0, 1] at which the instruction is inserted
            (0 = start of the transcript, 1 = end).

    Returns:
        The transcript as "User:"/"Assistant:" lines joined by blank lines.
    """
    convo = []
    for _ in range(num_turns):  # loop index was unused; filler turns only
        convo.append(f"User: {random.choice(USER_MSGS)}")
        convo.append(f"Assistant: {random.choice(ASSISTANT_MSGS)}")

    # Splice in the instruction plus an acknowledgement at the requested depth.
    idx = int(ratio * len(convo))
    convo.insert(idx, f"User: {instruction}")
    convo.insert(idx + 1, "Assistant: I will remember that.")
    return "\n\n".join(convo)
77
+
78
+
79
def run_conversation_memory(
    model_name: str,
    num_turns: int,
    num_examples: int,
    out_dir: str,
    depths: "List[float] | None" = None,
) -> Dict[str, Any]:
    """Run the conversation-memory experiment.

    A unique "favorite color" instruction is buried at varying relative
    depths inside a synthetic chat history; the model is then asked to
    recall it.

    Args:
        model_name: HF model identifier passed to ``generate_text``.
        num_turns: Number of filler user/assistant turn pairs per transcript.
        num_examples: Examples evaluated per depth.
        out_dir: Directory receiving per-depth JSONL dumps, the summary
            JSON, and the accuracy curve plot.
        depths: Relative insertion depths in [0, 1]; defaults to 9 evenly
            spaced points.

    Returns:
        Summary dict with per-depth accuracy, position-bias index ("pbi"),
        and wall-clock time in minutes.
    """
    ensure_dir(out_dir)

    if depths is None:
        depths = [0.0, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0]

    results = {}
    start = time.time()

    for depth in depths:
        logger.info(f"[CONVERSATION] Depth {depth:.1%}")
        preds = []
        for i in tqdm(range(num_examples), desc=f"Conversation {depth:.1%}", leave=False):
            # Unique per-example secret so an exact match cannot occur by chance.
            secret = f"MYFAVCOLOR-{i:03d}"
            instruction = (
                f"Please always remember that my favorite color is {secret}. "
                f"This is very important."
            )
            convo = _make_conversation(num_turns, instruction, depth)
            prompt = (
                f"Here is our conversation history:\n\n{convo}\n\n"
                f"Based on our conversation, what is my favorite color? "
                f"Answer with only the color code."
            )
            ans = generate_text(
                [{"role": "user", "content": prompt}],
                model_name=model_name,
                max_new_tokens=20,
            )
            correct = exact_match_score(ans, secret)
            preds.append({
                "model_answer": ans,
                "correct": correct,
                "secret": secret,
                "depth": depth,
            })

        # Persist raw predictions per depth before aggregating.
        save_jsonl(os.path.join(out_dir, f"conversation_depth_{depth}.jsonl"), preds)
        acc = compute_accuracy(preds)
        results[depth] = {"accuracy": acc, "predictions": preds}
        logger.info(f"[CONVERSATION] Depth {depth:.1%}: acc={acc:.3f}")

    summary = {
        "experiment": "conversation_memory",
        "num_turns": num_turns,
        "num_examples": num_examples,
        # JSON keys must be strings, hence str(d).
        "depths": {str(d): results[d]["accuracy"] for d in depths},
        "pbi": position_bias_index(depths, [results[d]["accuracy"] for d in depths]),
        "time_minutes": (time.time() - start) / 60,
    }

    save_json(os.path.join(out_dir, "conversation_summary.json"), summary)
    plot_curve(
        depths,
        [results[d]["accuracy"] for d in depths],
        f"Exp 7: Conversation Memory ({num_turns} turns)",
        os.path.join(out_dir, "conversation_curve.png"),
        xlabel="Depth in Chat History (0=start, 1=end)",
    )

    logger.info(f"[CONVERSATION] Time={(time.time()-start)/60:.1f} min")
    return summary
experiments/fact_reasoning.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Experiment 4: Fact-Dependent Reasoning
3
+ Math problem requiring a fact hidden at varying depths.
4
+ """
5
+ import logging
6
+ import os
7
+ import random
8
+ import re
9
+ import time
10
+ from typing import List, Dict, Any
11
+
12
+ from tqdm import tqdm
13
+
14
+ from src.generator import generate_text
15
+ from src.metrics import numeric_match, compute_accuracy, position_bias_index
16
+ from src.plotting import plot_curve
17
+ from src.utils import ensure_dir, save_jsonl, save_json
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
# Topically unrelated filler sentences used to pad documents around the
# single pricing fact the model must find.
DISTRACTORS = [
    "The museum opens at 9 AM.",
    "Temperature is recorded hourly.",
    "The container weighs 2,400 kg.",
    "Ordinances ban construction near rivers.",
    "Q3 revenue increased twelve percent.",
    "The database has four million records.",
    "Solar panels generate 45 kWh daily.",
    "The manuscript was translated in the 1800s.",
    "Airport traffic peaks in summer.",
    "The compound melts at 342 Celsius.",
    "Robotic arms have 0.1mm precision.",
    "Fourteen subspecies were identified.",
    "The hall seats 2,800 guests.",
    "Wastewater uses filtration and aeration.",
    "Satellites show drought vegetation.",
]
38
+
39
+
40
def _make_doc(n: int, fact: str, ratio: float) -> str:
    """Build a document of *n* tagged distractor sentences with *fact* inserted.

    The fact lands at relative position ``ratio`` (0 = start, 1 = end).
    """
    sentences = []
    for tag in range(1, n + 1):
        sentences.append(f"{random.choice(DISTRACTORS)} [Doc {tag}]")
    insert_at = int(ratio * n)
    return " ".join(sentences[:insert_at] + [fact] + sentences[insert_at:])
45
+
46
+
47
def run_fact_reasoning(
    model_name: str,
    num_sentences: int,
    num_examples: int,
    out_dir: str,
    depths: "List[float] | None" = None,
) -> Dict[str, Any]:
    """Run the fact-dependent reasoning experiment.

    A pricing fact (unit price + discount) is buried at varying relative
    depths in a distractor document; the model must combine the fact with
    the question's quantity to compute a total cost.

    Args:
        model_name: HF model identifier passed to ``generate_text``.
        num_sentences: Number of distractor sentences per document.
        num_examples: Examples evaluated per depth.
        out_dir: Directory receiving per-depth JSONL dumps, the summary
            JSON, and the accuracy curve plot.
        depths: Relative insertion depths in [0, 1]; defaults to 9 evenly
            spaced points.

    Returns:
        Summary dict with per-depth accuracy, position-bias index ("pbi"),
        and wall-clock time in minutes.
    """
    ensure_dir(out_dir)

    if depths is None:
        depths = [0.0, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0]

    results = {}
    start = time.time()

    for depth in depths:
        logger.info(f"[REASON] Depth {depth:.1%}")
        preds = []
        for i in tqdm(range(num_examples), desc=f"Reason {depth:.1%}", leave=False):
            price = random.randint(2, 15)
            qty = random.randint(3, 20)
            discount = random.randint(5, 30)
            # Ground truth: price * qty with the percentage discount applied.
            answer = round(price * qty * (1 - discount / 100), 2)
            fact = f"For this order, apples cost ${price}/kg with a {discount}% discount."
            doc = _make_doc(num_sentences, fact, depth)
            prompt = (
                f"Use ONLY the document below.\n\n{doc}\n\n"
                f"Question: I buy {qty} kg of apples. What is my total cost? "
                f"Answer with only the dollar amount."
            )
            ans = generate_text(
                [{"role": "user", "content": prompt}],
                model_name=model_name,
                max_new_tokens=30,
            )
            correct = numeric_match(ans, answer, tolerance=0.5)
            # Extract the first number once (the original ran the same
            # findall twice per prediction).
            nums = re.findall(r"[\d,]+\.?\d*", ans.replace(",", ""))
            preds.append({
                "model_answer": ans,
                "predicted": float(nums[0]) if nums else -1.0,
                "correct_answer": answer,
                "correct": correct,
                "depth": depth,
            })

        save_jsonl(os.path.join(out_dir, f"reason_depth_{depth}.jsonl"), preds)
        acc = compute_accuracy(preds)
        results[depth] = {"accuracy": acc, "predictions": preds}
        logger.info(f"[REASON] Depth {depth:.1%}: acc={acc:.3f}")

    summary = {
        "experiment": "fact_reasoning",
        "num_sentences": num_sentences,
        "num_examples": num_examples,
        "depths": {str(d): results[d]["accuracy"] for d in depths},
        "pbi": position_bias_index(depths, [results[d]["accuracy"] for d in depths]),
        "time_minutes": (time.time() - start) / 60,
    }

    save_json(os.path.join(out_dir, "reason_summary.json"), summary)
    plot_curve(
        depths,
        [results[d]["accuracy"] for d in depths],
        f"Exp 4: Fact-Dependent Reasoning ({num_sentences} sentences)",
        os.path.join(out_dir, "reason_curve.png"),
        xlabel="Depth in Document (0=start, 1=end)",
        ylabel="Problem-Solving Accuracy",
    )

    logger.info(f"[REASON] Time={(time.time()-start)/60:.1f} min")
    return summary
experiments/kv_retrieval.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Experiment 1: Key-Value Retrieval
3
+ Replicates Liu et al. (2023) with expanded position granularity.
4
+ Generates UUID key-value pairs, places gold pair at controlled depths.
5
+ """
6
+ import json
7
+ import logging
8
+ import os
9
+ import random
10
+ import time
11
+ import uuid
12
+ from typing import List, Dict, Any
13
+
14
+ from tqdm import tqdm
15
+
16
+ from src.generator import generate_text
17
+ from src.metrics import exact_match_score, compute_accuracy, position_bias_index
18
+ from src.plotting import plot_curve
19
+ from src.utils import ensure_dir, save_jsonl, save_json
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
def _gen_kv_data(num_keys: int, num_examples: int) -> List[Dict[str, Any]]:
    """Generate examples of UUID key/value records, each with one gold pair."""
    examples: List[Dict[str, Any]] = []
    for _ in tqdm(range(num_examples), desc=f"Gen KV data ({num_keys} keys)"):
        pairs: Dict[str, str] = {}
        # uuid4 collisions are effectively impossible; the guard just keeps
        # the record count exact regardless.
        while len(pairs) != num_keys:
            pairs[str(uuid.uuid4())] = str(uuid.uuid4())
        records = list(pairs.items())
        gold_key, gold_value = random.choice(records)
        examples.append({
            "ordered_kv_records": records,
            "key": gold_key,
            "value": gold_value,
        })
    return examples
35
+
36
+
37
+ def _format_prompt(data: List[tuple], key: str) -> str:
38
+ """Format KV data into prompt template."""
39
+ template = """Extract the value corresponding to the specified key in the JSON object below.
40
+
41
+ JSON data:
42
+ {formatted}
43
+
44
+ Key: "{key}"
45
+ Corresponding value:"""
46
+ formatted = ""
47
+ for i, (k, v) in enumerate(data):
48
+ sc = "{" if i == 0 else " "
49
+ ec = ",\n" if i != len(data) - 1 else "}"
50
+ formatted += sc + f'"{k}": "{v}"' + ec
51
+ return template.format(formatted=formatted, key=key)
52
+
53
+
54
+ def _reorder(example: Dict[str, Any], gold_pos: int) -> Dict[str, Any]:
55
+ """Move gold pair to specified position."""
56
+ ordered = example["ordered_kv_records"]
57
+ key = example["key"]
58
+ value = example["value"]
59
+ gi = next(i for i, (k, v) in enumerate(ordered) if k == key)
60
+ new = ordered[:gi] + ordered[gi + 1:]
61
+ new = new[:gold_pos] + [(key, value)] + new[gold_pos:]
62
+ return {"ordered_kv_records": new, "key": key, "value": value}
63
+
64
+
65
def run_kv_retrieval(
    model_name: str,
    num_keys: int,
    num_examples: int,
    out_dir: str,
    positions: "List[int] | None" = None,
    prefix: str = "kv",
) -> Dict[str, Any]:
    """
    Run KV retrieval experiment (Liu et al. 2023 replication).

    Args:
        model_name: HF model identifier
        num_keys: Number of KV pairs
        num_examples: Examples per position
        out_dir: Output directory
        positions: Custom position list (default: 9 evenly spaced positions)
        prefix: Filename prefix for all artifacts

    Returns:
        Summary dict with accuracy per position and PBI
    """
    ensure_dir(out_dir)

    if positions is None:
        # Eighth-step positions across the list; set() dedupes collisions
        # when num_keys is small, sorted() restores order.
        positions = sorted(set([
            0,
            num_keys // 8,
            num_keys // 4,
            3 * num_keys // 8,
            num_keys // 2,
            5 * num_keys // 8,
            3 * num_keys // 4,
            7 * num_keys // 8,
            num_keys - 1,
        ]))

    # Generate data once, then reorder for each position
    data_path = os.path.join(out_dir, f"{prefix}_data.jsonl")
    examples = _gen_kv_data(num_keys, num_examples)
    save_jsonl(data_path, examples)

    results = {}
    start = time.time()

    for pos in positions:
        logger.info(f"[{prefix}] Position {pos}/{num_keys - 1}")
        preds = []
        for ex in tqdm(examples, desc=f"{prefix} pos={pos}", leave=False):
            # Same underlying example at every position — only the gold
            # pair's location changes, isolating position as the variable.
            ro = _reorder(ex, pos)
            prompt = _format_prompt(ro["ordered_kv_records"], ro["key"])
            ans = generate_text(
                [{"role": "user", "content": prompt}],
                model_name=model_name,
                max_new_tokens=80,
            )
            correct = exact_match_score(ans, ro["value"])
            preds.append({
                "model_answer": ans,
                "correct": correct,
                "value": ro["value"],
                "gold_position": pos,
            })

        save_jsonl(os.path.join(out_dir, f"{prefix}_pos_{pos}.jsonl"), preds)
        acc = compute_accuracy(preds)
        results[pos] = {"accuracy": acc, "predictions": preds}
        logger.info(f"[{prefix}] Pos {pos}: acc={acc:.3f}")

    # Summary: normalize positions to [0, 1] for the bias index and plot.
    norm_pos = [p / (num_keys - 1) for p in sorted(results.keys())]
    accs = [results[p]["accuracy"] for p in sorted(results.keys())]
    pbi = position_bias_index(norm_pos, accs)

    summary = {
        "experiment": "kv_retrieval",
        "num_keys": num_keys,
        "num_examples": num_examples,
        "positions": {str(p): results[p]["accuracy"] for p in sorted(results.keys())},
        "pbi": pbi,
        "time_minutes": (time.time() - start) / 60,
    }

    save_json(os.path.join(out_dir, f"{prefix}_summary.json"), summary)
    plot_curve(
        norm_pos, accs,
        f"Exp 1: KV Retrieval ({num_keys} keys)",
        os.path.join(out_dir, f"{prefix}_curve.png"),
    )

    logger.info(f"[{prefix}] PBI={pbi:.3f} | Time={(time.time()-start)/60:.1f} min")
    return summary
experiments/multi_needle.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Experiment 3: Multi-Needle Retrieval
3
+ Tests ability to retrieve ALL of multiple needles placed at start, middle, and end.
4
+ """
5
+ import logging
6
+ import os
7
+ import random
8
+ import time
9
+ from typing import Dict, Any
10
+
11
+ from tqdm import tqdm
12
+
13
+ from src.generator import generate_text
14
+ from src.metrics import exact_match_score, compute_accuracy
15
+ from src.plotting import plot_bar
16
+ from src.utils import ensure_dir, save_json
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ from .needle_in_haystack import FILLERS
21
+
22
+
23
def _make_haystack(n: int) -> str:
    """Generate *n* filler sentences, each tagged with its 1-based index."""
    pieces = []
    for idx in range(1, n + 1):
        pieces.append(f"{random.choice(FILLERS)} [{idx}].")
    return " ".join(pieces)
25
+
26
+
27
def run_multi_needle(
    model_name: str,
    num_sentences: int,
    num_examples: int,
    out_dir: str,
) -> Dict[str, Any]:
    """Run the multi-needle experiment.

    Three secret codes are placed at the start, middle, and end of a filler
    document; the model must list all three. Per-region recall is reported
    separately to expose position bias within a single prompt.

    Args:
        model_name: HF model identifier passed to ``generate_text``.
        num_sentences: Number of filler sentences in the haystack.
        num_examples: Number of prompts evaluated.
        out_dir: Directory receiving the summary JSON and bar plot.

    Returns:
        Summary dict with start/middle/end accuracy and time in minutes.
    """
    ensure_dir(out_dir)

    start = time.time()
    start_ok, mid_ok, end_ok = [], [], []

    for i in tqdm(range(num_examples), desc="Multi-needle"):
        filler = _make_haystack(num_sentences)
        # Re-split the haystack into sentences so needles can be spliced in.
        sents = [s.strip() + "." for s in filler.split(".") if s.strip()]
        n = len(sents)
        # Distinct per-example codes so matches cannot occur by chance.
        ca, cb, cc = f"ALPHA-{i:03d}", f"BETA-{i:03d}", f"GAMMA-{i:03d}"

        sents.insert(0, f"The first secret code is {ca}.")
        sents.insert(n // 2, f"The second secret code is {cb}.")
        sents.append(f"The third secret code is {cc}.")

        prompt = (
            f"Read the text and list ALL three secret codes in order.\n\n"
            f"{' '.join(sents)}\n\nCodes:"
        )
        ans = generate_text(
            [{"role": "user", "content": prompt}],
            model_name=model_name,
            max_new_tokens=60,
        )
        # Each code is scored independently against the single response.
        start_ok.append(exact_match_score(ans, ca))
        mid_ok.append(exact_match_score(ans, cb))
        end_ok.append(exact_match_score(ans, cc))

    summary = {
        "experiment": "multi_needle",
        "num_sentences": num_sentences,
        "num_examples": num_examples,
        "start": compute_accuracy([{"correct": c} for c in start_ok]),
        "middle": compute_accuracy([{"correct": c} for c in mid_ok]),
        "end": compute_accuracy([{"correct": c} for c in end_ok]),
        "time_minutes": (time.time() - start) / 60,
    }

    logger.info(
        f"[MULTI] Start={summary['start']:.3f} Mid={summary['middle']:.3f} End={summary['end']:.3f}"
    )

    save_json(os.path.join(out_dir, "multi_summary.json"), summary)
    plot_bar(
        ["Start", "Middle", "End"],
        [summary["start"], summary["middle"], summary["end"]],
        f"Exp 3: Multi-Needle (n={num_examples})",
        os.path.join(out_dir, "multi_bar.png"),
    )

    return summary
experiments/needle_in_haystack.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Experiment 2: Needle in Haystack (text)
3
+ Tests retrieval of a secret code hidden at varying depths in filler text.
4
+ """
5
+ import logging
6
+ import os
7
+ import random
8
+ import time
9
+ from typing import List, Dict, Any
10
+
11
+ from tqdm import tqdm
12
+
13
+ from src.generator import generate_text
14
+ from src.metrics import exact_match_score, compute_accuracy, position_bias_index
15
+ from src.plotting import plot_curve
16
+ from src.utils import ensure_dir, save_jsonl, save_json
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
# Topically unrelated filler sentences used to build the haystack around the
# secret-code needle. Also imported by experiments.multi_needle.
FILLERS = [
    "The history of pottery spans thousands of years.",
    "Marine biologists study coral reef ecosystems.",
    "Railway engineering requires precise curvature.",
    "The periodic table arranges elements by number.",
    "Clouds are classified as cumulus and stratus.",
    "Beekeeping traditions differ between continents.",
    "The Great Wall was built over many dynasties.",
    "Thermodynamics governs heat transfer.",
    "Impressionist painters captured fleeting light.",
    "Volcanic activity is tracked with seismographs.",
    "The Dewey Decimal System organizes libraries.",
    "Irrigation evolved from canals to drip systems.",
    "Neural networks are inspired by biological brains.",
    "Light speed is 299,792,458 meters per second.",
    "Classical composition follows harmonic rules.",
    "Urban planning addresses zoning and transport.",
    "Photosynthesis converts CO2 into glucose.",
    "The Fibonacci sequence appears in nature.",
    "GPS uses triangulation from satellites.",
    "Cryptography secures digital communication.",
]
42
+
43
+
44
def _make_haystack(n: int) -> str:
    """Generate n sentences of filler text, each tagged with its 1-based index."""
    tagged = [f"{random.choice(FILLERS)} [{pos}]." for pos in range(1, n + 1)]
    return " ".join(tagged)
47
+
48
+
49
+ def _insert_needle(text: str, needle: str, ratio: float) -> str:
50
+ """Insert needle at specified depth ratio."""
51
+ sents = [s.strip() + "." for s in text.split(".") if s.strip()]
52
+ idx = int(ratio * len(sents))
53
+ sents.insert(idx, needle)
54
+ return " ".join(sents)
55
+
56
+
57
def run_needle_in_haystack(
    model_name: str,
    num_sentences: int,
    num_examples: int,
    out_dir: str,
    depths: "List[float] | None" = None,
) -> Dict[str, Any]:
    """Run the needle-in-haystack experiment.

    A secret code sentence is hidden at varying relative depths inside
    filler text; the model is asked to retrieve the code.

    Args:
        model_name: HF model identifier passed to ``generate_text``.
        num_sentences: Number of filler sentences per haystack.
        num_examples: Examples evaluated per depth.
        out_dir: Directory receiving per-depth JSONL dumps, the summary
            JSON, and the accuracy curve plot.
        depths: Relative insertion depths in [0, 1]; defaults to 9 evenly
            spaced points.

    Returns:
        Summary dict with per-depth accuracy, position-bias index ("pbi"),
        and wall-clock time in minutes.
    """
    ensure_dir(out_dir)

    if depths is None:
        depths = [0.0, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0]

    results = {}
    start = time.time()

    for depth in depths:
        logger.info(f"[NEEDLE] Depth {depth:.1%}")
        preds = []
        for i in tqdm(range(num_examples), desc=f"Needle {depth:.1%}", leave=False):
            filler = _make_haystack(num_sentences)
            # Unique per-example code so an exact match cannot occur by chance.
            code = f"SECRET-{i:04d}"
            needle = f"The secret code is {code}."
            text = _insert_needle(filler, needle, depth)
            prompt = (
                f"Read the text and find the secret code.\n\n{text}\n\n"
                f"What is the secret code? Answer with only the code."
            )
            ans = generate_text(
                [{"role": "user", "content": prompt}],
                model_name=model_name,
                max_new_tokens=20,
            )
            correct = exact_match_score(ans, code)
            preds.append({
                "model_answer": ans,
                "correct": correct,
                "secret": code,
                "depth": depth,
            })

        save_jsonl(os.path.join(out_dir, f"needle_depth_{depth}.jsonl"), preds)
        acc = compute_accuracy(preds)
        results[depth] = {"accuracy": acc, "predictions": preds}
        logger.info(f"[NEEDLE] Depth {depth:.1%}: acc={acc:.3f}")

    summary = {
        "experiment": "needle_in_haystack",
        "num_sentences": num_sentences,
        "num_examples": num_examples,
        "depths": {str(d): results[d]["accuracy"] for d in depths},
        "pbi": position_bias_index(depths, [results[d]["accuracy"] for d in depths]),
        "time_minutes": (time.time() - start) / 60,
    }

    save_json(os.path.join(out_dir, "needle_summary.json"), summary)
    plot_curve(
        depths,
        [results[d]["accuracy"] for d in depths],
        f"Exp 2: Needle in Haystack ({num_sentences} sentences)",
        os.path.join(out_dir, "needle_curve.png"),
        xlabel="Depth in Document (0=start, 1=end)",
    )

    logger.info(f"[NEEDLE] Time={(time.time()-start)/60:.1f} min")
    return summary
experiments/semantic_distractors.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Experiment 5: Semantic Similarity Distractors
3
+ Gold fact ("capital of France is Paris") among semantically similar facts.
4
+ """
5
+ import logging
6
+ import os
7
+ import random
8
+ import time
9
+ from typing import List, Dict, Any
10
+
11
+ from tqdm import tqdm
12
+
13
+ from src.generator import generate_text
14
+ from src.metrics import exact_match_score, compute_accuracy, position_bias_index
15
+ from src.plotting import plot_curve
16
+ from src.utils import ensure_dir, save_jsonl, save_json
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
# Fact templates filled with randomly paired slot values below. Pairings are
# deliberately arbitrary, so distractor "facts" may be false — they only need
# to be semantically similar to the gold fact, not true.
TEMPLATES = [
    "The capital of {country} is {city}.",
    "The population of {country} is approximately {num} million.",
    "The official language of {country} is {lang}.",
    "The currency of {country} is the {currency}.",
    "The largest city in {country} is {city}.",
]

# Note: France and Paris are intentionally absent from these pools, so no
# distractor can collide with the gold fact "The capital of France is Paris."
COUNTRIES = [
    "Germany", "Spain", "Italy", "Brazil", "Argentina", "Canada",
    "Australia", "Japan", "China", "India", "Russia", "Egypt",
    "Turkey", "Mexico", "South Korea", "Thailand", "Vietnam",
    "Poland", "Sweden", "Norway", "Denmark", "Finland", "Greece",
    "Portugal", "Ireland", "Austria", "Switzerland", "Belgium",
    "Netherlands", "Czech Republic", "Hungary", "Romania",
]

CITIES = [
    "Berlin", "Madrid", "Rome", "Brasilia", "Buenos Aires", "Ottawa",
    "Canberra", "Tokyo", "Beijing", "New Delhi", "Moscow", "Cairo",
    "Ankara", "Mexico City", "Seoul", "Bangkok", "Hanoi",
    "Warsaw", "Stockholm", "Oslo", "Copenhagen", "Helsinki", "Athens",
    "Lisbon", "Dublin", "Vienna", "Bern", "Brussels",
    "Amsterdam", "Prague", "Budapest", "Bucharest",
]

LANGS = [
    "German", "Spanish", "Italian", "Portuguese", "French",
    "English", "Japanese", "Mandarin", "Hindi", "Russian",
    "Arabic", "Turkish", "Korean", "Thai", "Vietnamese",
    "Polish", "Swedish", "Norwegian", "Danish", "Finnish",
    "Greek", "Irish", "Dutch", "Czech", "Hungarian", "Romanian",
]

CURRENCIES = [
    "Euro", "Peso", "Real", "Dollar", "Yen", "Yuan", "Rupee",
    "Ruble", "Pound", "Won", "Baht", "Dong", "Zloty",
    "Krone", "Krona", "Forint", "Leu", "Franc",
]
59
+
60
+
61
def _make_doc(num_facts: int, gold_fact: str, ratio: float) -> str:
    """Build a numbered fact list with *gold_fact* inserted at relative depth *ratio*."""
    distractors = []
    for _ in range(num_facts):
        template = random.choice(TEMPLATES)
        # All slot values are drawn even when a template uses only some of
        # them; .format ignores unused keyword arguments.
        distractors.append(
            template.format(
                country=random.choice(COUNTRIES),
                city=random.choice(CITIES),
                num=random.randint(10, 1400),
                lang=random.choice(LANGS),
                currency=random.choice(CURRENCIES),
            )
        )

    insert_at = int(ratio * len(distractors))
    distractors.insert(insert_at, gold_fact)
    return "\n".join(f"{pos + 1}. {fact}" for pos, fact in enumerate(distractors))
77
+
78
+
79
def run_semantic_distractors(
    model_name: str,
    num_facts: int,
    num_examples: int,
    out_dir: str,
    depths: "List[float] | None" = None,
) -> Dict[str, Any]:
    """Run the semantic-distractor experiment.

    The fixed gold fact "The capital of France is Paris." is buried among
    semantically similar country facts at varying relative depths.

    Args:
        model_name: HF model identifier passed to ``generate_text``.
        num_facts: Number of distractor facts per document.
        num_examples: Examples evaluated per depth.
        out_dir: Directory receiving per-depth JSONL dumps, the summary
            JSON, and the accuracy curve plot.
        depths: Relative insertion depths in [0, 1]; defaults to 9 evenly
            spaced points.

    Returns:
        Summary dict with per-depth accuracy, position-bias index ("pbi"),
        and wall-clock time in minutes.
    """
    ensure_dir(out_dir)

    if depths is None:
        depths = [0.0, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0]

    results = {}
    start = time.time()

    for depth in depths:
        logger.info(f"[SEMANTIC] Depth {depth:.1%}")
        preds = []
        for i in tqdm(range(num_examples), desc=f"Semantic {depth:.1%}", leave=False):
            gold = "The capital of France is Paris."
            doc = _make_doc(num_facts, gold, depth)
            prompt = (
                f"Read the following list of facts and answer the question.\n\n{doc}\n\n"
                f"Question: What is the capital of France? Answer with only the city name."
            )
            ans = generate_text(
                [{"role": "user", "content": prompt}],
                model_name=model_name,
                max_new_tokens=20,
            )
            # Matched lowercase — presumably exact_match_score normalizes
            # case; verify against src.metrics.
            correct = exact_match_score(ans, "paris")
            preds.append({
                "model_answer": ans,
                "correct": correct,
                "depth": depth,
            })

        save_jsonl(os.path.join(out_dir, f"semantic_depth_{depth}.jsonl"), preds)
        acc = compute_accuracy(preds)
        results[depth] = {"accuracy": acc, "predictions": preds}
        logger.info(f"[SEMANTIC] Depth {depth:.1%}: acc={acc:.3f}")

    summary = {
        "experiment": "semantic_distractors",
        "num_facts": num_facts,
        "num_examples": num_examples,
        "depths": {str(d): results[d]["accuracy"] for d in depths},
        "pbi": position_bias_index(depths, [results[d]["accuracy"] for d in depths]),
        "time_minutes": (time.time() - start) / 60,
    }

    save_json(os.path.join(out_dir, "semantic_summary.json"), summary)
    plot_curve(
        depths,
        [results[d]["accuracy"] for d in depths],
        f"Exp 5: Semantic Similarity Distractors ({num_facts} facts)",
        os.path.join(out_dir, "semantic_curve.png"),
        xlabel="Depth in Document (0=start, 1=end)",
    )

    logger.info(f"[SEMANTIC] Time={(time.time()-start)/60:.1f} min")
    return summary
experiments/temporal_narrative.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Experiment 6: Temporal Narrative
3
+ Recall an event from a long chronological timeline.
4
+ """
5
+ import logging
6
+ import os
7
+ import random
8
+ import re
9
+ import time
10
+ from typing import List, Dict, Any
11
+
12
+ from tqdm import tqdm
13
+
14
+ from src.generator import generate_text
15
+ from src.metrics import exact_match_score, compute_accuracy, position_bias_index
16
+ from src.plotting import plot_curve
17
+ from src.utils import ensure_dir, save_jsonl, save_json
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
# Distinct narrative events sampled (without replacement) to build timelines;
# padded with generic ceremonies when more events are requested than exist here.
EVENTS_POOL = [
    "the king issued a decree",
    "a comet appeared in the sky",
    "the bridge was completed",
    "a treaty was signed",
    "the harvest festival began",
    "a stranger arrived at the gates",
    "the library burned down",
    "a new star was discovered",
    "the river flooded the town",
    "the army marched north",
    "a peace envoy was sent",
    "the market was opened",
    "a plague swept the city",
    "the old temple was restored",
    "a fleet set sail for distant lands",
    "the academy admitted its first students",
    "a rebellion broke out in the east",
    "the queen gave birth to twins",
    "a dragon was spotted in the mountains",
    "the great bell tolled for the first time",
]
43
+
44
+
45
def _make_timeline(num_events: int, target_event: str, ratio: float) -> str:
    """Build a year-stamped timeline with *target_event* inserted at depth *ratio*.

    Years start at 1000 and increase by one per event, so the target's year
    is 1000 plus its insertion index.
    """
    # Sample unique events first; pad with generic ceremonies if the pool
    # is smaller than num_events.
    events = random.sample(EVENTS_POOL, min(num_events, len(EVENTS_POOL)))
    while len(events) < num_events:
        time_of_day = random.choice(['morning', 'evening', 'midday'])
        events.append(f"the people gathered for the {time_of_day} ceremony")

    insert_at = int(ratio * len(events))
    events.insert(insert_at, target_event)
    lines = [f"Year {1000+i}: {e}." for i, e in enumerate(events)]
    return "\n".join(lines)
54
+
55
+
56
def run_temporal_narrative(
    model_name: str,
    num_events: int,
    num_examples: int,
    out_dir: str,
    depths: "List[float] | None" = None,
) -> Dict[str, Any]:
    """Run the temporal-narrative experiment.

    A target event is placed at varying relative depths in a chronological
    timeline; the model must recall the year it occurred.

    Args:
        model_name: HF model identifier passed to ``generate_text``.
        num_events: Number of filler events per timeline.
        num_examples: Examples evaluated per depth.
        out_dir: Directory receiving per-depth JSONL dumps, the summary
            JSON, and the accuracy curve plot.
        depths: Relative insertion depths in [0, 1]; defaults to 9 evenly
            spaced points.

    Returns:
        Summary dict with per-depth accuracy, position-bias index ("pbi"),
        and wall-clock time in minutes.
    """
    ensure_dir(out_dir)

    if depths is None:
        depths = [0.0, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0]

    results = {}
    start = time.time()

    for depth in depths:
        logger.info(f"[NARRATIVE] Depth {depth:.1%}")
        preds = []
        for i in tqdm(range(num_examples), desc=f"Narrative {depth:.1%}", leave=False):
            target = "a golden statue was unveiled in the central square"
            timeline = _make_timeline(num_events, target, depth)
            prompt = (
                f"Read the following timeline of historical events.\n\n{timeline}\n\n"
                f"Question: In which year was a golden statue unveiled in the central square? "
                f"Answer with only the year number."
            )
            ans = generate_text(
                [{"role": "user", "content": prompt}],
                model_name=model_name,
                max_new_tokens=15,
            )
            # _make_timeline inserts the target at int(depth * num_events),
            # and years run 1000 + index, so this reconstruction is exact.
            expected_year = 1000 + int(depth * num_events)
            # Accept any 4-digit year within +/-4 of the expected one.
            years = re.findall(r"\b\d{4}\b", ans)
            correct = 1.0 if any(abs(int(y) - expected_year) < 5 for y in years) else 0.0
            preds.append({
                "model_answer": ans,
                "correct": correct,
                "expected_year": expected_year,
                "depth": depth,
            })

        save_jsonl(os.path.join(out_dir, f"narrative_depth_{depth}.jsonl"), preds)
        acc = compute_accuracy(preds)
        results[depth] = {"accuracy": acc, "predictions": preds}
        logger.info(f"[NARRATIVE] Depth {depth:.1%}: acc={acc:.3f}")

    summary = {
        "experiment": "temporal_narrative",
        "num_events": num_events,
        "num_examples": num_examples,
        "depths": {str(d): results[d]["accuracy"] for d in depths},
        "pbi": position_bias_index(depths, [results[d]["accuracy"] for d in depths]),
        "time_minutes": (time.time() - start) / 60,
    }

    save_json(os.path.join(out_dir, "narrative_summary.json"), summary)
    plot_curve(
        depths,
        [results[d]["accuracy"] for d in depths],
        f"Exp 6: Temporal Narrative ({num_events} events)",
        os.path.join(out_dir, "narrative_curve.png"),
        xlabel="Depth in Timeline (0=start, 1=end)",
    )

    logger.info(f"[NARRATIVE] Time={(time.time()-start)/60:.1f} min")
    return summary
run_all.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ ================================================================================
4
+ LOST IN THE MIDDLE — Benchmark Suite v4 (Master Runner)
5
+ ================================================================================
6
+ Runs all 7 experiments with configurable model, counts, and output directory.
7
+ Usage:
8
+ python run_all.py --model Qwen/Qwen2.5-1.5B-Instruct --output ./results
9
+ ================================================================================
10
+ """
11
+ import argparse
12
+ import json
13
+ import logging
14
+ import os
15
+ import shutil
16
+ import sys
17
+ import time
18
+
19
+ from experiments.kv_retrieval import run_kv_retrieval
20
+ from experiments.needle_in_haystack import run_needle_in_haystack
21
+ from experiments.multi_needle import run_multi_needle
22
+ from experiments.fact_reasoning import run_fact_reasoning
23
+ from experiments.semantic_distractors import run_semantic_distractors
24
+ from experiments.temporal_narrative import run_temporal_narrative
25
+ from experiments.conversation_memory import run_conversation_memory
26
+ from src.utils import save_json
27
+
28
+ logging.basicConfig(
29
+ format="%(asctime)s - %(levelname)s - %(message)s",
30
+ level=logging.INFO,
31
+ stream=sys.stdout,
32
+ )
33
+ logger = logging.getLogger(__name__)
34
+
35
+
36
def parse_args():
    """Build the CLI parser for the benchmark suite and parse sys.argv."""
    parser = argparse.ArgumentParser(description="LITM Benchmark Suite v4")
    parser.add_argument("--model", default="Qwen/Qwen2.5-1.5B-Instruct", help="HF model name")
    parser.add_argument("--output", default="./results", help="Output directory")
    parser.add_argument("--n-examples", type=int, default=50, help="Examples per position")
    # Per-experiment size knobs: all plain integer flags differing only in default.
    for flag, default in [
        ("--n-keys-100", 100),
        ("--n-keys-200", 200),
        ("--needle-sentences", 500),
        ("--multi-sentences", 300),
        ("--reason-sentences", 300),
        ("--semantic-facts", 80),
        ("--narrative-events", 100),
        ("--convo-turns", 100),
    ]:
        parser.add_argument(flag, type=int, default=default)
    parser.add_argument("--experiments", default="all", help="Comma-separated list or 'all'")
    parser.add_argument("--zip", action="store_true", help="Create zip archive of results")
    return parser.parse_args()
52
+
53
+
54
def main():
    """Run the selected LITM experiments end to end.

    Parses CLI options, runs each requested experiment into its own
    sub-directory of --output, saves a combined master_summary.json,
    prints a PBI summary table, and optionally zips the results.
    """
    args = parse_args()
    model = args.model
    out_root = args.output
    os.makedirs(out_root, exist_ok=True)

    # Fix: strip whitespace so `--experiments "kv100, needle"` also matches;
    # previously " needle" would silently never run.
    if args.experiments != "all":
        wanted = {e.strip() for e in args.experiments.split(",")}
    else:
        wanted = {"all"}

    logger.info("=" * 70)
    logger.info("LITM BENCHMARK SUITE v4")
    logger.info(f"Model: {model} | Output: {out_root}")
    logger.info("=" * 70)

    all_results = {}
    t0 = time.time()

    def should_run(name):
        # "all" enables every experiment; otherwise match by exact name.
        return "all" in wanted or name in wanted

    if should_run("kv100"):
        logger.info("\n--- EXP 1A: KV Retrieval (100 keys) ---")
        all_results["kv_100"] = run_kv_retrieval(
            model_name=model,
            num_keys=args.n_keys_100,
            num_examples=args.n_examples,
            out_dir=os.path.join(out_root, "exp1a_kv100"),
            prefix="kv100",
        )

    if should_run("kv200"):
        logger.info("\n--- EXP 1B: KV Retrieval (200 keys) ---")
        all_results["kv_200"] = run_kv_retrieval(
            model_name=model,
            num_keys=args.n_keys_200,
            num_examples=args.n_examples,
            out_dir=os.path.join(out_root, "exp1b_kv200"),
            prefix="kv200",
        )

    if should_run("needle"):
        logger.info("\n--- EXP 2: Needle in Haystack ---")
        all_results["needle"] = run_needle_in_haystack(
            model_name=model,
            num_sentences=args.needle_sentences,
            num_examples=30,
            out_dir=os.path.join(out_root, "exp2_needle"),
        )

    if should_run("multi"):
        logger.info("\n--- EXP 3: Multi-Needle ---")
        all_results["multi"] = run_multi_needle(
            model_name=model,
            num_sentences=args.multi_sentences,
            num_examples=30,
            out_dir=os.path.join(out_root, "exp3_multi"),
        )

    if should_run("reason"):
        logger.info("\n--- EXP 4: Fact-Dependent Reasoning ---")
        all_results["reason"] = run_fact_reasoning(
            model_name=model,
            num_sentences=args.reason_sentences,
            num_examples=30,
            out_dir=os.path.join(out_root, "exp4_reason"),
        )

    if should_run("semantic"):
        logger.info("\n--- EXP 5: Semantic Similarity Distractors ---")
        all_results["semantic"] = run_semantic_distractors(
            model_name=model,
            num_facts=args.semantic_facts,
            num_examples=30,
            out_dir=os.path.join(out_root, "exp5_semantic"),
        )

    if should_run("narrative"):
        logger.info("\n--- EXP 6: Temporal Narrative ---")
        all_results["narrative"] = run_temporal_narrative(
            model_name=model,
            num_events=args.narrative_events,
            num_examples=30,
            out_dir=os.path.join(out_root, "exp6_narrative"),
        )

    if should_run("conversation"):
        logger.info("\n--- EXP 7: Conversation Memory ---")
        all_results["conversation"] = run_conversation_memory(
            model_name=model,
            num_turns=args.convo_turns,
            num_examples=30,
            out_dir=os.path.join(out_root, "exp7_conversation"),
        )

    elapsed = (time.time() - t0) / 3600  # wall-clock hours
    logger.info(f"\n{'='*70}")
    logger.info(f"COMPLETE. Total time: {elapsed:.2f} hours")
    logger.info(f"Results: {out_root}")
    logger.info(f"{'='*70}")

    save_json(os.path.join(out_root, "master_summary.json"), all_results)

    # PBI table: only experiment summaries that report a "pbi" field appear.
    logger.info("\n--- Position Bias Index (PBI) Summary ---")
    for k, v in all_results.items():
        if isinstance(v, dict) and "pbi" in v:
            logger.info(f"  {k:20s} PBI = {v['pbi']:+.3f}")

    if args.zip:
        # The archive is written next to the results directory, not inside it,
        # so the zip does not recursively include itself.
        zip_path = os.path.join(os.path.dirname(out_root), "litm_results_all")
        shutil.make_archive(zip_path, "zip", out_root)
        logger.info(f"Zipped: {zip_path}.zip")


if __name__ == "__main__":
    main()
src/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """LITM Benchmark Suite v4 — Core Library"""
2
+ __version__ = "4.0.0"
3
+ __author__ = "abhshkp"
src/generator.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Text generation wrapper with chat-template support."""
2
+ import logging
3
+ import torch
4
+ from typing import List, Dict, Any
5
+ from .model_loader import load_model
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
def generate_text(
    messages: List[Dict[str, str]],
    model_name: str,
    max_new_tokens: int = 80,
    load_in_4bit: bool = True,
) -> str:
    """Run greedy chat generation and return only the newly generated text.

    *messages* is a chat-formatted list of ``{"role", "content"}`` dicts;
    the model/tokenizer pair is obtained via ``load_model`` (which caches).
    """
    model, tokenizer = load_model(model_name, load_in_4bit=load_in_4bit)

    encoded = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        return_tensors="pt",
        add_generation_prompt=True,
        return_dict=True,
    )

    # Move every input tensor onto the device the model weights live on.
    device = next(model.parameters()).device
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    prompt_len = encoded["input_ids"].shape[1]
    with torch.no_grad():
        generated = model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # greedy decoding
            pad_token_id=tokenizer.pad_token_id,
        )

    # Slice off the prompt tokens and decode only the continuation.
    continuation = generated[0][prompt_len:]
    return tokenizer.decode(continuation, skip_special_tokens=True).strip()
src/metrics.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Metrics and scoring utilities."""
2
+ import re
3
+ import statistics
4
+ from typing import List, Dict, Any
5
+
6
+
7
+ def exact_match_score(prediction: str, target: str) -> float:
8
+ """Binary exact-match score."""
9
+ return 1.0 if target.lower() in prediction.lower() else 0.0
10
+
11
+
12
def numeric_match(prediction: str, target: float, tolerance: float = 0.5) -> float:
    """Extract the first number from *prediction* and compare it to *target*.

    Commas are stripped first so "1,234" parses as 1234.0. Returns 1.0 when
    the first number found is within *tolerance* of *target*, else 0.0
    (including when no number is present). Signs are not captured, so "-5"
    parses as 5 — acceptable here since benchmark answers are non-negative.
    """
    cleaned = prediction.replace(",", "")
    # Original pattern was r"[\d,]+\.?\d*"; the "," in the class was dead
    # because commas are removed above, so the plain-digit form is equivalent.
    nums = re.findall(r"\d+\.?\d*", cleaned)
    if not nums:
        return 0.0
    return 1.0 if abs(float(nums[0]) - target) < tolerance else 0.0
19
+
20
+
21
def compute_accuracy(predictions: List[Dict[str, Any]], key: str = "correct") -> float:
    """Mean of the *key* field over prediction records; 0.0 for an empty list."""
    if not predictions:
        return 0.0
    return statistics.mean(record[key] for record in predictions)
25
+
26
+
27
def position_bias_index(positions: List[float], accuracies: List[float]) -> float:
    """
    Position Bias Index (PBI): the mean of the first and last accuracies
    minus the accuracy at the middle index. Larger values indicate a
    stronger U-shaped (lost-in-the-middle) curve. Returns 0.0 when fewer
    than three positions are given, where the index is undefined.
    """
    if len(positions) < 3:
        return 0.0
    middle_acc = accuracies[len(positions) // 2]
    edge_acc = 0.5 * (accuracies[0] + accuracies[-1])
    return edge_acc - middle_acc
src/model_loader.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Model loading with 4-bit quantization for T4/GPU inference."""
2
+ import logging
3
+ import torch
4
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+ _model_cache = {}
9
+ _tok_cache = {}
10
+
11
+
12
def load_model(model_name: str, load_in_4bit: bool = True, device_map: str = "auto"):
    """Load *model_name* (optionally NF4-quantized) plus its tokenizer.

    Results are memoized per (model_name, load_in_4bit, device_map), so
    repeated calls across experiments reuse the already-loaded weights.
    """
    key = f"{model_name}:{load_in_4bit}:{device_map}"
    if key in _model_cache:
        return _model_cache[key], _tok_cache[key]

    logger.info(f"Loading model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    if tokenizer.pad_token is None:
        # Models shipping without a pad token fall back to EOS for padding.
        tokenizer.pad_token = tokenizer.eos_token

    common_kwargs = dict(
        device_map=device_map,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
    )
    if load_in_4bit:
        common_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )
    model = AutoModelForCausalLM.from_pretrained(model_name, **common_kwargs)

    model.eval()
    logger.info(f"Model loaded on {next(model.parameters()).device}")

    _model_cache[key] = model
    _tok_cache[key] = tokenizer
    return model, tokenizer
src/plotting.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Plotting utilities for position-bias curves."""
2
+ import logging
3
+ import matplotlib
4
+ matplotlib.use("Agg")
5
+ import matplotlib.pyplot as plt
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
def plot_curve(
    x_values,
    y_values,
    title: str,
    save_path: str,
    xlabel: str = "Position (0=start, 1=end)",
    ylabel: str = "Accuracy",
    ylim: tuple = (-0.05, 1.05),
    color: str = "#E63946",
):
    """Render a single position-bias accuracy curve and save it as a PNG."""
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.plot(x_values, y_values, marker="o", linewidth=2.5, markersize=10, color=color)
    ax.set_xlabel(xlabel, fontsize=13)
    ax.set_ylabel(ylabel, fontsize=13)
    ax.set_title(title, fontsize=13)
    ax.set_ylim(ylim)
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    fig.savefig(save_path, dpi=200)
    plt.close(fig)
    logger.info(f"Plot saved: {save_path}")
32
+
33
+
34
def plot_bar(categories, values, title: str, save_path: str, ylabel: str = "Accuracy", ylim=(0, 1.05), colors=None):
    """Save a bar chart (e.g. start/middle/end accuracies) to *save_path*."""
    bar_colors = ["#2E86AB", "#E63946", "#2E86AB"] if colors is None else colors
    fig, ax = plt.subplots(figsize=(6, 5))
    ax.bar(categories, values, color=bar_colors, edgecolor="black", linewidth=1.2)
    ax.set_ylabel(ylabel, fontsize=13)
    ax.set_title(title, fontsize=13)
    ax.set_ylim(ylim)
    ax.grid(True, alpha=0.3, axis="y")
    fig.tight_layout()
    fig.savefig(save_path, dpi=200)
    plt.close(fig)
    logger.info(f"Bar plot saved: {save_path}")
48
+
49
+
50
def plot_multi_curves(curves, labels, title, save_path, xlabel="Position", ylabel="Accuracy"):
    """Overlay several accuracy curves on one axes for side-by-side comparison.

    *curves* is a dict holding parallel lists under "x" and "y"; *labels*
    pairs with them positionally.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    palette = plt.get_cmap("tab10")
    for idx, (xs, ys, name) in enumerate(zip(curves["x"], curves["y"], labels)):
        ax.plot(xs, ys, marker="o", linewidth=2.0, markersize=8, label=name, color=palette(idx))
    ax.set_xlabel(xlabel, fontsize=13)
    ax.set_ylabel(ylabel, fontsize=13)
    ax.set_title(title, fontsize=13)
    ax.set_ylim(-0.05, 1.05)
    ax.legend()
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    fig.savefig(save_path, dpi=200)
    plt.close(fig)
    logger.info(f"Multi-curve plot saved: {save_path}")
src/utils.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Common utilities."""
2
+ import json
3
+ import os
4
+ import logging
5
+ from typing import List, Dict, Any
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
def ensure_dir(path: str):
    """Create directory *path* (parents included) if it doesn't exist; no-op when present."""
    os.makedirs(path, exist_ok=True)
13
+
14
+
15
def save_jsonl(path: str, records: List[Dict[str, Any]]):
    """Save *records* to *path* as JSON Lines (one JSON object per line).

    Writes with an explicit UTF-8 encoding so output does not depend on the
    platform's locale default.
    """
    with open(path, "w", encoding="utf-8") as f:
        for record in records:
            f.write(json.dumps(record) + "\n")
+
21
+
22
+ def load_jsonl(path: str) -> List[Dict[str, Any]]:
23
+ """Load JSONL records."""
24
+ records = []
25
+ with open(path) as f:
26
+ for line in f:
27
+ records.append(json.loads(line))
28
+ return records
29
+
30
+
31
def save_json(path: str, data: Any):
    """Save *data* as pretty-printed (indent=2) JSON, UTF-8 encoded.

    The explicit encoding avoids locale-dependent defaults on Windows.
    """
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
+
36
+
37
def load_json(path: str) -> Any:
    """Load and return the JSON document stored at *path* (read as UTF-8)."""
    with open(path, encoding="utf-8") as f:
        return json.load(f)