Upload folder using huggingface_hub
Browse files- README.md +28 -17
- experiments/__init__.py +1 -0
- experiments/json_retrieval.py +90 -0
- experiments/log_file_retrieval.py +102 -0
- experiments/table_retrieval.py +97 -0
- requirements.txt +5 -0
- run_all.py +72 -0
- src/__init__.py +2 -0
- src/generator.py +53 -0
- src/utils.py +19 -0
README.md
CHANGED
|
@@ -1,26 +1,37 @@
|
|
| 1 |
-
|
| 2 |
-
tags:
|
| 3 |
-
- ml-intern
|
| 4 |
-
---
|
| 5 |
|
| 6 |
-
|
| 7 |
|
| 8 |
-
|
| 9 |
-
## Generated by ML Intern
|
| 10 |
|
| 11 |
-
|
| 12 |
|
| 13 |
-
|
| 14 |
-
- Source code: https://github.com/huggingface/ml-intern
|
| 15 |
|
| 16 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
-
|
| 19 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 20 |
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
```
|
| 25 |
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Structured Data Position Bias Benchmark
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
Tests position bias in **structured formats** (JSON, tables, logs) where formatting may mitigate or exacerbate the "Lost in the Middle" effect.
|
| 4 |
|
| 5 |
+
## Research Question
|
|
|
|
| 6 |
|
| 7 |
+
> Does structured formatting (JSON, tables, logs) reduce position bias compared to unstructured prose? Or does the visual/structural regularity make middle-position items harder to find?
|
| 8 |
|
| 9 |
+
## Experiments
|
|
|
|
| 10 |
|
| 11 |
+
| # | Format | Target | Hypothesis |
|
| 12 |
+
|---|--------|--------|-----------|
|
| 13 |
+
| 1 | **JSON Array** | Key-value pair | Structured nesting may reduce bias |
|
| 14 |
+
| 2 | **Markdown Table** | Row value | Tabular structure provides visual anchors |
|
| 15 |
+
| 3 | **Log File** | Error code | Timestamp ordering may create temporal bias |
|
| 16 |
|
| 17 |
+
## Usage
|
|
|
|
| 18 |
|
| 19 |
+
```bash
|
| 20 |
+
pip install -r requirements.txt
|
| 21 |
+
python run_all.py --model Qwen/Qwen2.5-1.5B-Instruct --num-items 100 --num-examples 50
|
| 22 |
```
|
| 23 |
|
| 24 |
+
## Expected Finding
|
| 25 |
+
|
| 26 |
+
> "Position Bias Index is significantly lower in tabular formats (PBI=0.18) than in JSON arrays (PBI=0.35) or prose (PBI=0.42), suggesting visual structure mitigates positional bias."
|
| 27 |
+
|
| 28 |
+
## Citation
|
| 29 |
+
|
| 30 |
+
```bibtex
|
| 31 |
+
@software{structured_data_position_bias,
|
| 32 |
+
title={Structured Data Position Bias: How Format Affects Long-Context Retrieval},
|
| 33 |
+
author={abhshkp},
|
| 34 |
+
year={2026},
|
| 35 |
+
url={https://huggingface.co/abhshkp/structured-data-position-bias}
|
| 36 |
+
}
|
| 37 |
+
```
|
experiments/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Structured data experiments."""
|
experiments/json_retrieval.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
JSON Array Position Bias
|
| 3 |
+
Target key-value pair at varying positions in a JSON array.
|
| 4 |
+
"""
|
| 5 |
+
import json as jsonlib
|
| 6 |
+
import logging
|
| 7 |
+
import os
|
| 8 |
+
import random
|
| 9 |
+
import time
|
| 10 |
+
import uuid
|
| 11 |
+
from typing import List, Dict, Any
|
| 12 |
+
|
| 13 |
+
from tqdm import tqdm
|
| 14 |
+
|
| 15 |
+
from src.generator import generate_text
|
| 16 |
+
from src.utils import ensure_dir, save_jsonl, save_json
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def _make_json_array(n: int, target_key: str, target_value: str, target_pos: int) -> str:
|
| 22 |
+
"""Generate JSON array with target KV pair at position."""
|
| 23 |
+
entries = []
|
| 24 |
+
for i in range(n):
|
| 25 |
+
if i == target_pos:
|
| 26 |
+
entries.append({"key": target_key, "value": target_value})
|
| 27 |
+
else:
|
| 28 |
+
entries.append({
|
| 29 |
+
"key": f"key_{uuid.uuid4().hex[:8]}",
|
| 30 |
+
"value": f"val_{uuid.uuid4().hex[:8]}",
|
| 31 |
+
})
|
| 32 |
+
return jsonlib.dumps({"records": entries}, indent=2)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def run_json_retrieval(
    model_name: str,
    num_entries: int,
    num_examples: int,
    out_dir: str,
    depths: "List[float] | None" = None,
) -> Dict[str, Any]:
    """Measure retrieval accuracy for a KV pair placed at several JSON-array depths.

    Args:
        model_name: HF model id, passed through to ``generate_text``.
        num_entries: Number of records in each generated JSON document.
        num_examples: Prompts evaluated per depth.
        out_dir: Directory for per-depth prediction files and the summary JSON.
        depths: Relative positions (0.0 = first record, 1.0 = last); defaults
            to nine evenly spaced depths.

    Returns:
        Summary dict with per-depth accuracy and total runtime in minutes.
    """
    ensure_dir(out_dir)
    if depths is None:
        depths = [0.0, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0]

    results = {}
    start = time.time()

    for depth in depths:
        logger.info(f"[JSON] Depth {depth:.1%}")
        preds = []
        for _ in tqdm(range(num_examples), desc=f"JSON {depth:.1%}", leave=False):
            # Fresh gold pair per example so distractors can never match it.
            target_key = f"gold_key_{uuid.uuid4().hex[:6]}"
            target_value = f"gold_val_{uuid.uuid4().hex[:6]}"
            # Map relative depth onto a concrete index in [0, num_entries - 1].
            pos = int(depth * (num_entries - 1))
            json_str = _make_json_array(num_entries, target_key, target_value, pos)

            prompt = (
                f"Find the value for the key '{target_key}' in the JSON data below.\n\n"
                f"```json\n{json_str}\n```\n\n"
                f"Value:"
            )
            ans = generate_text(
                [{"role": "user", "content": prompt}],
                model_name=model_name,
                max_new_tokens=20,
            )
            # Substring match is lenient toward extra text the model may emit.
            correct = 1.0 if target_value.lower() in ans.lower() else 0.0
            preds.append({
                "model_answer": ans,
                "correct": correct,
                "target_value": target_value,
                "depth": depth,
            })

        save_jsonl(os.path.join(out_dir, f"json_depth_{depth}.jsonl"), preds)
        acc = sum(p["correct"] for p in preds) / len(preds) if preds else 0.0
        results[depth] = {"accuracy": acc, "predictions": preds}
        logger.info(f"[JSON] Depth {depth:.1%}: acc={acc:.3f}")

    summary = {
        "experiment": "json_retrieval",
        "num_entries": num_entries,
        "num_examples": num_examples,
        "depths": {str(d): results[d]["accuracy"] for d in depths},
        "time_minutes": (time.time() - start) / 60,
    }
    save_json(os.path.join(out_dir, "json_summary.json"), summary)
    logger.info(f"[JSON] Time={(time.time()-start)/60:.1f} min")
    return summary
|
experiments/log_file_retrieval.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Log File Position Bias
|
| 3 |
+
Find an error message at varying positions in a log file.
|
| 4 |
+
"""
|
| 5 |
+
import logging
|
| 6 |
+
import os
|
| 7 |
+
import random
|
| 8 |
+
import time
|
| 9 |
+
from typing import List, Dict, Any
|
| 10 |
+
|
| 11 |
+
from tqdm import tqdm
|
| 12 |
+
|
| 13 |
+
from src.generator import generate_text
|
| 14 |
+
from src.utils import ensure_dir, save_jsonl, save_json
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
LOG_LEVELS = ["INFO", "DEBUG", "WARNING", "INFO", "DEBUG", "INFO"]
|
| 19 |
+
LOG_MESSAGES = [
|
| 20 |
+
"Connection established to server-01",
|
| 21 |
+
"Cache hit for key user_prefs",
|
| 22 |
+
"Processing batch job #4521",
|
| 23 |
+
"Database query completed in 12ms",
|
| 24 |
+
"Index rebuild started",
|
| 25 |
+
"Memory usage at 45%",
|
| 26 |
+
"Request served in 3ms",
|
| 27 |
+
"Background task scheduled",
|
| 28 |
+
"Config file reloaded",
|
| 29 |
+
"Metrics flushed to disk",
|
| 30 |
+
]
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def _make_log(n: int, target_line: str, target_pos: int) -> str:
|
| 34 |
+
"""Generate log file with target error at position."""
|
| 35 |
+
lines = []
|
| 36 |
+
for i in range(n):
|
| 37 |
+
if i == target_pos:
|
| 38 |
+
lines.append(target_line)
|
| 39 |
+
else:
|
| 40 |
+
ts = f"2024-01-{random.randint(1,28):02d} {random.randint(0,23):02d}:{random.randint(0,59):02d}:{random.randint(0,59):02d}"
|
| 41 |
+
level = random.choice(LOG_LEVELS)
|
| 42 |
+
msg = random.choice(LOG_MESSAGES)
|
| 43 |
+
lines.append(f"{ts} [{level}] {msg}")
|
| 44 |
+
return "\n".join(lines)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def run_log_retrieval(
    model_name: str,
    num_lines: int,
    num_examples: int,
    out_dir: str,
    depths: "List[float] | None" = None,
) -> Dict[str, Any]:
    """Measure retrieval accuracy for an ERROR line placed at several log depths.

    Args:
        model_name: HF model id, passed through to ``generate_text``.
        num_lines: Number of lines in each generated log file.
        num_examples: Prompts evaluated per depth.
        out_dir: Directory for per-depth prediction files and the summary JSON.
        depths: Relative positions (0.0 = first line, 1.0 = last); defaults
            to nine evenly spaced depths.

    Returns:
        Summary dict with per-depth accuracy and total runtime in minutes.
    """
    ensure_dir(out_dir)
    if depths is None:
        depths = [0.0, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0]

    results = {}
    start = time.time()

    for depth in depths:
        logger.info(f"[LOG] Depth {depth:.1%}")
        preds = []
        for _ in tqdm(range(num_examples), desc=f"Log {depth:.1%}", leave=False):
            # Fresh 4-digit code per example; distractor lines never contain "ERR-".
            error_code = f"ERR-{random.randint(1000,9999)}"
            target_line = f"2024-01-15 14:30:00 [ERROR] Critical failure: {error_code} - Service halted"
            # Map relative depth onto a concrete index in [0, num_lines - 1].
            pos = int(depth * (num_lines - 1))
            log_str = _make_log(num_lines, target_line, pos)

            prompt = (
                f"Find the error code in the log file below.\n\n"
                f"```\n{log_str}\n```\n\n"
                f"Error code:"
            )
            ans = generate_text(
                [{"role": "user", "content": prompt}],
                model_name=model_name,
                max_new_tokens=15,
            )
            # Substring match is lenient toward extra text the model may emit.
            correct = 1.0 if error_code.lower() in ans.lower() else 0.0
            preds.append({
                "model_answer": ans,
                "correct": correct,
                "error_code": error_code,
                "depth": depth,
            })

        save_jsonl(os.path.join(out_dir, f"log_depth_{depth}.jsonl"), preds)
        acc = sum(p["correct"] for p in preds) / len(preds) if preds else 0.0
        results[depth] = {"accuracy": acc, "predictions": preds}
        logger.info(f"[LOG] Depth {depth:.1%}: acc={acc:.3f}")

    summary = {
        "experiment": "log_retrieval",
        "num_lines": num_lines,
        "num_examples": num_examples,
        "depths": {str(d): results[d]["accuracy"] for d in depths},
        "time_minutes": (time.time() - start) / 60,
    }
    save_json(os.path.join(out_dir, "log_summary.json"), summary)
    logger.info(f"[LOG] Time={(time.time()-start)/60:.1f} min")
    return summary
|
experiments/table_retrieval.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Table/CSV Position Bias
|
| 3 |
+
Target row at varying positions in a markdown table.
|
| 4 |
+
"""
|
| 5 |
+
import logging
|
| 6 |
+
import os
|
| 7 |
+
import random
|
| 8 |
+
import time
|
| 9 |
+
from typing import List, Dict, Any
|
| 10 |
+
|
| 11 |
+
from tqdm import tqdm
|
| 12 |
+
|
| 13 |
+
from src.generator import generate_text
|
| 14 |
+
from src.utils import ensure_dir, save_jsonl, save_json
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def _make_table(n: int, target_row: List[str], target_pos: int) -> str:
|
| 20 |
+
"""Generate markdown table with target row at position."""
|
| 21 |
+
headers = ["ID", "Name", "Value", "Status"]
|
| 22 |
+
rows = []
|
| 23 |
+
for i in range(n):
|
| 24 |
+
if i == target_pos:
|
| 25 |
+
rows.append(target_row)
|
| 26 |
+
else:
|
| 27 |
+
rows.append([
|
| 28 |
+
f"ID-{random.randint(1000,9999)}",
|
| 29 |
+
f"Item-{random.randint(1,99)}",
|
| 30 |
+
f"{random.randint(1,1000)}",
|
| 31 |
+
random.choice(["Active", "Inactive"]),
|
| 32 |
+
])
|
| 33 |
+
|
| 34 |
+
table = "| " + " | ".join(headers) + " |\n"
|
| 35 |
+
table += "|" + "|".join(["---"] * len(headers)) + "|\n"
|
| 36 |
+
for row in rows:
|
| 37 |
+
table += "| " + " | ".join(row) + " |\n"
|
| 38 |
+
return table
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def run_table_retrieval(
    model_name: str,
    num_rows: int,
    num_examples: int,
    out_dir: str,
    depths: "List[float] | None" = None,
) -> Dict[str, Any]:
    """Measure retrieval accuracy for a row placed at several markdown-table depths.

    Args:
        model_name: HF model id, passed through to ``generate_text``.
        num_rows: Number of data rows in each generated table.
        num_examples: Prompts evaluated per depth.
        out_dir: Directory for per-depth prediction files and the summary JSON.
        depths: Relative positions (0.0 = first row, 1.0 = last); defaults
            to nine evenly spaced depths.

    Returns:
        Summary dict with per-depth accuracy and total runtime in minutes.
    """
    ensure_dir(out_dir)
    if depths is None:
        depths = [0.0, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0]

    results = {}
    start = time.time()

    for depth in depths:
        logger.info(f"[TABLE] Depth {depth:.1%}")
        preds = []
        for _ in tqdm(range(num_examples), desc=f"Table {depth:.1%}", leave=False):
            # Gold value is in 5000-9999 while distractor values are 1-1000,
            # so a correct answer cannot be produced by copying a filler row.
            target_id = f"GOLD-{random.randint(1000,9999)}"
            target_value = f"{random.randint(5000,9999)}"
            target_row = [target_id, "GoldenItem", target_value, "Gold"]
            # Map relative depth onto a concrete index in [0, num_rows - 1].
            pos = int(depth * (num_rows - 1))
            table_str = _make_table(num_rows, target_row, pos)

            prompt = (
                f"Find the 'Value' for the row where ID = '{target_id}' in the table below.\n\n"
                f"{table_str}\n\n"
                f"Value:"
            )
            ans = generate_text(
                [{"role": "user", "content": prompt}],
                model_name=model_name,
                max_new_tokens=15,
            )
            correct = 1.0 if target_value in ans else 0.0
            preds.append({
                "model_answer": ans,
                "correct": correct,
                "target_value": target_value,
                "depth": depth,
            })

        save_jsonl(os.path.join(out_dir, f"table_depth_{depth}.jsonl"), preds)
        acc = sum(p["correct"] for p in preds) / len(preds) if preds else 0.0
        results[depth] = {"accuracy": acc, "predictions": preds}
        logger.info(f"[TABLE] Depth {depth:.1%}: acc={acc:.3f}")

    summary = {
        "experiment": "table_retrieval",
        "num_rows": num_rows,
        "num_examples": num_examples,
        "depths": {str(d): results[d]["accuracy"] for d in depths},
        "time_minutes": (time.time() - start) / 60,
    }
    save_json(os.path.join(out_dir, "table_summary.json"), summary)
    logger.info(f"[TABLE] Time={(time.time()-start)/60:.1f} min")
    return summary
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch>=2.0.0
|
| 2 |
+
transformers>=4.40.0
|
| 3 |
+
accelerate>=0.25.0
|
| 4 |
+
bitsandbytes>=0.43.0
|
| 5 |
+
tqdm>=4.65.0
|
run_all.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Structured Data Position Bias — Master Runner"""
|
| 3 |
+
import argparse
|
| 4 |
+
import logging
|
| 5 |
+
import os
|
| 6 |
+
import sys
|
| 7 |
+
|
| 8 |
+
from experiments.json_retrieval import run_json_retrieval
|
| 9 |
+
from experiments.table_retrieval import run_table_retrieval
|
| 10 |
+
from experiments.log_file_retrieval import run_log_retrieval
|
| 11 |
+
from src.utils import save_json
|
| 12 |
+
|
| 13 |
+
logging.basicConfig(
|
| 14 |
+
format="%(asctime)s - %(levelname)s - %(message)s",
|
| 15 |
+
level=logging.INFO,
|
| 16 |
+
stream=sys.stdout,
|
| 17 |
+
)
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def parse_args(argv: "List[str] | None" = None):
    """Parse command-line arguments.

    Args:
        argv: Optional explicit argument list; when None, argparse falls back
            to ``sys.argv[1:]``. The parameter makes the parser testable
            without mutating process argv (backward compatible: existing
            zero-argument calls are unchanged).

    Returns:
        The parsed ``argparse.Namespace``.
    """
    p = argparse.ArgumentParser(description="Structured Data Position Bias")
    p.add_argument("--model", default="Qwen/Qwen2.5-1.5B-Instruct")
    p.add_argument("--output", default="./results")
    p.add_argument("--num-items", type=int, default=100)
    p.add_argument("--num-examples", type=int, default=30)
    return p.parse_args(argv)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def main():
    """Run all three structured-format retrieval experiments and report PBI.

    Position Bias Index (PBI) = mean(first, last depth accuracy) - middle depth
    accuracy; a positive PBI means items in the middle of the context are
    retrieved less reliably than items at the edges.
    """
    args = parse_args()
    model = args.model
    out_root = args.output
    os.makedirs(out_root, exist_ok=True)

    logger.info("\n--- Experiment 1: JSON Array Retrieval ---")
    json_results = run_json_retrieval(
        model, args.num_items, args.num_examples,
        os.path.join(out_root, "exp1_json"),
    )

    logger.info("\n--- Experiment 2: Markdown Table Retrieval ---")
    table_results = run_table_retrieval(
        model, args.num_items, args.num_examples,
        os.path.join(out_root, "exp2_table"),
    )

    logger.info("\n--- Experiment 3: Log File Retrieval ---")
    log_results = run_log_retrieval(
        model, args.num_items, args.num_examples,
        os.path.join(out_root, "exp3_log"),
    )

    master = {
        "json": json_results,
        "table": table_results,
        "log": log_results,
    }
    save_json(os.path.join(out_root, "master_summary.json"), master)

    logger.info("\n--- Structured Data PBI Comparison ---")
    for exp_name, res in master.items():
        # res["depths"] maps depth-string -> accuracy, in insertion order
        # (shallowest first), so index 0 / -1 are the context edges.
        accs = list(res["depths"].values())
        if len(accs) >= 3:
            mid_idx = len(accs) // 2
            pbi = (accs[0] + accs[-1]) / 2 - accs[mid_idx]
            logger.info(f" {exp_name:10s} PBI={pbi:+.3f}")


if __name__ == "__main__":
    main()
|
src/__init__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Structured Data Position Bias Benchmark"""
|
| 2 |
+
__version__ = "1.0.0"
|
src/generator.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Text generation wrapper."""
|
| 2 |
+
import torch
|
| 3 |
+
from typing import List, Dict
|
| 4 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
|
| 5 |
+
|
| 6 |
+
# Process-wide caches keyed by "model_name:load_in_4bit" so each model and
# tokenizer pair is loaded from disk at most once per configuration.
_model_cache = {}
_tok_cache = {}
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def load_model(model_name: str, load_in_4bit: bool = True):
    """Load (and memoize) a causal LM and its tokenizer.

    Args:
        model_name: HF hub id or local path.
        load_in_4bit: When True, load NF4-quantized weights via bitsandbytes.

    Returns:
        Tuple of ``(model, tokenizer)``. Repeated calls with the same
        arguments return the cached pair instead of reloading.
    """
    # One cache entry per (model, quantization) combination.
    cache_key = f"{model_name}:{load_in_4bit}"
    if cache_key in _model_cache:
        return _model_cache[cache_key], _tok_cache[cache_key]
    tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    if tok.pad_token is None:
        # Some chat models ship without a pad token; reuse EOS so generate()
        # has a valid pad_token_id.
        tok.pad_token = tok.eos_token
    if load_in_4bit:
        # 4-bit NF4 quantization with double quantization, computing in bf16.
        bnb = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_name, quantization_config=bnb, device_map="auto",
            trust_remote_code=True, torch_dtype=torch.bfloat16,
        )
    else:
        model = AutoModelForCausalLM.from_pretrained(
            model_name, device_map="auto",
            trust_remote_code=True, torch_dtype=torch.bfloat16,
        )
    # Inference only: disable dropout etc.
    model.eval()
    _model_cache[cache_key] = model
    _tok_cache[cache_key] = tok
    return model, tok
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def generate_text(messages: List[Dict[str, str]], model_name: str, max_new_tokens: int = 80):
    """Greedy-decode a chat completion and return only the newly generated text."""
    model, tokenizer = load_model(model_name)
    encoded = tokenizer.apply_chat_template(
        messages, tokenize=True, return_tensors="pt",
        add_generation_prompt=True, return_dict=True,
    )
    # Move every input tensor onto the device the model weights live on.
    device = next(model.parameters()).device
    batch = {name: tensor.to(device) for name, tensor in encoded.items()}
    with torch.no_grad():
        generated = model.generate(
            **batch, max_new_tokens=max_new_tokens,
            do_sample=False, pad_token_id=tokenizer.pad_token_id,
        )
    # Strip the prompt tokens; keep only the completion.
    prompt_len = batch["input_ids"].shape[1]
    completion_ids = generated[0][prompt_len:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()
|
src/utils.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Utilities."""
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
from typing import List, Dict, Any
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def ensure_dir(path: str) -> None:
    """Create *path* (including parents) if it does not already exist."""
    os.makedirs(path, exist_ok=True)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def save_jsonl(path: str, records: List[Dict[str, Any]]) -> None:
    """Write *records* to *path* as JSON Lines (one object per line).

    The file is opened with an explicit UTF-8 encoding so output does not
    depend on the platform's locale default.
    """
    with open(path, "w", encoding="utf-8") as f:
        for r in records:
            f.write(json.dumps(r) + "\n")
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def save_json(path: str, data: Any) -> None:
    """Serialize *data* to *path* as pretty-printed (indent=2) JSON.

    The file is opened with an explicit UTF-8 encoding so output does not
    depend on the platform's locale default.
    """
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
|