echo-ultimate / scripts /publish_echobench.py
Vikaspandey582003's picture
Upload folder using huggingface_hub
acb327b verified
"""
EchoBench Publisher
Converts ECHO task bank to HuggingFace Dataset and publishes to the Hub.
Usage:
python scripts/publish_echobench.py --token YOUR_HF_TOKEN
python scripts/publish_echobench.py --token YOUR_HF_TOKEN --repo your-username/echobench
"""
import argparse
import sys
import os
# Put the parent of this script's directory (presumably the repository root)
# at the front of sys.path, so the project modules imported inside the
# functions below (``env.task_bank`` and ``config``) can be resolved when the
# script is run directly — TODO confirm those modules live at that level.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
def load_tasks_from_bank():
    """Gather every task stored in ECHO's task bank into one flat list.

    A ``TaskBank`` is created and ``ensure_loaded()`` is called on it, then
    the bank's ``_tasks`` mapping is walked in ``cfg.DOMAINS`` x
    ``cfg.DIFFICULTIES`` order. A per-bucket count and the grand total are
    printed along the way.

    Returns:
        list: the concatenation of all (domain, difficulty) buckets; missing
        domains or difficulties simply contribute nothing.
    """
    from env.task_bank import TaskBank
    from config import cfg

    task_bank = TaskBank()
    print("Loading task bank (downloads datasets if not cached)…")
    task_bank.ensure_loaded()

    collected = []
    for domain in cfg.DOMAINS:
        # Unknown domains fall back to an empty mapping, so every difficulty
        # lookup beneath them yields an empty bucket.
        per_domain = task_bank._tasks.get(domain, {})
        for difficulty in cfg.DIFFICULTIES:
            bucket = per_domain.get(difficulty, [])
            collected += bucket
            print(f" {domain}/{difficulty}: {len(bucket)} tasks")

    print(f"\nTotal tasks: {len(collected)}")
    return collected
def tasks_to_hf_dataset(tasks):
    """Turn task dicts into a ``DatasetDict`` with one split per domain.

    Each task is normalised to a fixed set of string/float/list-of-string
    columns (missing keys get an empty default, ``difficulty_score`` defaults
    to 0.5). The result has one split per distinct domain, added in sorted
    order, followed by an ``"all"`` split holding every row. Row counts are
    printed for each split.

    Args:
        tasks: iterable of dict-like tasks (anything supporting ``.get``).

    Returns:
        datasets.DatasetDict: the per-domain splits plus ``"all"``.
    """
    from datasets import Dataset, DatasetDict

    rows = [
        {
            "id": str(task.get("id", "")),
            "domain": str(task.get("domain", "")),
            "difficulty": str(task.get("difficulty", "")),
            "difficulty_score": float(task.get("difficulty_score", 0.5)),
            "question": str(task.get("question", "")),
            "answer": str(task.get("answer", "")),
            "answer_aliases": [str(alias) for alias in task.get("answer_aliases", [])],
            "source_dataset": str(task.get("source_dataset", "")),
        }
        for task in tasks
    ]

    # Bucket rows by domain in a single pass; input order is kept per bucket.
    by_domain = {}
    for row in rows:
        by_domain.setdefault(row["domain"], []).append(row)

    splits = {}
    for domain in sorted(by_domain):
        members = by_domain[domain]
        splits[domain] = Dataset.from_list(members)
        print(f" Split '{domain}': {len(members)} rows")

    # The catch-all split is added last so it follows the domain splits.
    splits["all"] = Dataset.from_list(rows)
    print(f" Split 'all': {len(rows)} rows")
    return DatasetDict(splits)
# Markdown dataset card uploaded as README.md next to the data.
# The usage snippet selects data with ``split=...`` because
# ``tasks_to_hf_dataset`` publishes a single DatasetDict whose splits are the
# domain names plus "all" — there are no per-domain configs and no "train"
# split, so ``load_dataset(repo, "math")["train"]`` would not work.
_DATASET_CARD = """\
---
license: apache-2.0
task_categories:
- question-answering
- text-classification
language:
- en
tags:
- calibration
- metacognition
- llm-evaluation
- grpo
- openenv
size_categories:
- 10K<n<100K
---
# EchoBench
**The first public benchmark for LLM metacognitive calibration.**
EchoBench contains questions across 7 domains for training and evaluating
whether language models accurately predict their own probability of being correct.
## Domains
| Domain | Source | Description |
|--------|--------|-------------|
| Math | GSM8K | Grade-school math word problems |
| Logic | AI2-ARC | Multiple-choice science reasoning |
| Factual | TriviaQA | Open-domain factual questions |
| Science | SciQ | Multiple-choice science questions |
| Medical | MedMCQA | Medical licensing exam questions |
| Coding | Synthetic | Code output/complexity prediction |
| Creative | Synthetic | Wordplay, synonyms, literary devices |
## Usage
```python
from datasets import load_dataset
# Load all tasks
ds = load_dataset("revti126/echobench", split="all")
# Load a specific domain
math_ds = load_dataset("revti126/echobench", split="math")
print(math_ds[0])
```
## Task Format
Each row contains:
- `id` — unique task identifier (`math_easy_00042`)
- `domain` — one of math/logic/factual/science/medical/coding/creative
- `difficulty` — easy / medium / hard
- `difficulty_score` — float 0.0 (hardest) → 1.0 (easiest)
- `question` — the question text
- `answer` — canonical correct answer
- `answer_aliases` — all accepted answer strings
- `source_dataset` — originating HuggingFace dataset
## Citation
```bibtex
@misc{echobench-2025,
title = {EchoBench: A Benchmark for LLM Metacognitive Calibration},
author = {Tripathi, Revtiraman and Pandey, Vikas Dev},
year = {2025},
url = {https://huggingface.co/datasets/revti126/echobench},
note = {Created for ECHO ULTIMATE — OpenEnv Hackathon 2025}
}
```
*Part of the [ECHO ULTIMATE](https://huggingface.co/spaces/revti126/echo-ultimate) project.*
"""
def publish_to_hub(dataset_dict, repo_id: str, token: str):
    """Push the dataset to the HuggingFace Hub and upload its dataset card.

    The card's usage snippet is rendered for ``repo_id`` before upload, so a
    dataset published to a non-default repository does not tell readers to
    load it from ``revti126/echobench``. The citation and project links in
    the card are left pointing at the canonical EchoBench pages.

    Args:
        dataset_dict: object exposing ``push_to_hub(repo_id, token=...)``,
            e.g. the ``DatasetDict`` built by ``tasks_to_hf_dataset``.
        repo_id: target dataset repository, such as ``"user/echobench"``.
        token: HuggingFace API token with write access.

    Returns:
        str: URL of the published dataset on the Hub.
    """
    from huggingface_hub import HfApi

    api = HfApi(token=token)

    print(f"\nCreating repository: {repo_id}")
    try:
        api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)
    except Exception as exc:
        # Deliberately best-effort: report the problem and keep going. A real
        # failure (e.g. a bad token) is expected to resurface on the push.
        print(f" Note: {exc}")

    print("Pushing dataset…")
    dataset_dict.push_to_hub(repo_id, token=token)

    # Only the load_dataset(...) calls are rewritten; with the default repo
    # this replacement is a no-op and the card is uploaded unchanged.
    card = _DATASET_CARD.replace(
        'load_dataset("revti126/echobench"',
        f'load_dataset("{repo_id}"',
    )

    print("Uploading dataset card…")
    api.upload_file(
        path_or_fileobj=card.encode("utf-8"),
        path_in_repo="README.md",
        repo_id=repo_id,
        repo_type="dataset",
        token=token,
    )

    url = f"https://huggingface.co/datasets/{repo_id}"
    print(f"\n✅ EchoBench published: {url}")
    return url
def main():
    """Command-line entry point: load the task bank, convert it, publish it.

    Flags:
        --token  (required) HuggingFace API write token.
        --repo   target repo ID; defaults to ``revti126/echobench``.
        --quiet  suppresses only the opening banner.

    Exits with status 1 when the task bank yields no tasks.
    """
    cli = argparse.ArgumentParser(
        description="Publish ECHO task bank as EchoBench HuggingFace dataset."
    )
    cli.add_argument("--token", required=True, help="HuggingFace API write token")
    cli.add_argument(
        "--repo",
        default="revti126/echobench",
        help="HuggingFace repo ID (default: revti126/echobench)",
    )
    cli.add_argument("--quiet", action="store_true")
    options = cli.parse_args()

    if not options.quiet:
        print("=== EchoBench Publisher ===\n")

    loaded = load_tasks_from_bank()
    if not loaded:
        print("❌ No tasks loaded. Run `python run.py download` first.")
        sys.exit(1)

    published_url = publish_to_hub(
        tasks_to_hf_dataset(loaded), options.repo, options.token
    )

    # Closing summary, one message per line.
    summary = (
        "\n=== Done ===",
        f"Dataset URL: {published_url}",
        "Add to README.md and openenv.yaml:",
        f" dataset: {options.repo}",
    )
    for message in summary:
        print(message)


# Run the publisher only when the file is executed as a script.
if __name__ == "__main__":
    main()