# Extracted from the Hugging Face Space "Vikaspandey582003/echo-ultimate"
# (revision acb327b; Space status at extraction: Sleeping; file size: 6,000 bytes).

"""
EchoBench Publisher
Converts ECHO task bank to HuggingFace Dataset and publishes to the Hub.

Usage:
  python scripts/publish_echobench.py --token YOUR_HF_TOKEN
  python scripts/publish_echobench.py --token YOUR_HF_TOKEN --repo your-username/echobench
"""

import argparse
import sys
import os

# Put this file's grandparent directory at the front of sys.path. Per the usage
# notes in the module docstring the script lives in scripts/, so this is
# presumably the project root holding the `env` and `config` modules that
# load_tasks_from_bank() imports — confirm against the repository layout.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


def load_tasks_from_bank():
    """Collect every task held by the ECHO task bank into one flat list.

    Walks ``cfg.DOMAINS`` x ``cfg.DIFFICULTIES`` in that nesting order, printing
    the size of each (domain, difficulty) bucket and the grand total.

    Returns:
        A list with the contents of every bucket, concatenated in iteration
        order. Missing domains or difficulties contribute nothing.
    """
    from env.task_bank import TaskBank
    from config import cfg

    task_bank = TaskBank()
    print("Loading task bank (downloads datasets if not cached)…")
    task_bank.ensure_loaded()

    collected = []
    for domain_name in cfg.DOMAINS:
        for level in cfg.DIFFICULTIES:
            # NOTE(review): reads the bank's private `_tasks` mapping directly;
            # a missing domain or difficulty falls back to an empty bucket.
            per_domain = task_bank._tasks.get(domain_name, {})
            bucket = per_domain.get(level, [])
            collected += bucket
            print(f"  {domain_name}/{level}: {len(bucket)} tasks")

    print(f"\nTotal tasks: {len(collected)}")
    return collected


def tasks_to_hf_dataset(tasks):
    """Build a HuggingFace ``DatasetDict`` with one split per domain plus "all".

    Args:
        tasks: Iterable of task mappings (anything supporting ``.get``). Missing
            fields default to an empty string, ``0.5`` for ``difficulty_score``
            and an empty list for ``answer_aliases``.

    Returns:
        A ``DatasetDict`` whose keys are the domain names in sorted order,
        followed by an "all" split containing every row.
    """
    from datasets import Dataset, DatasetDict

    def _as_row(task):
        # Coerce every field to a fixed type so all rows share one schema.
        return {
            "id":               str(task.get("id", "")),
            "domain":           str(task.get("domain", "")),
            "difficulty":       str(task.get("difficulty", "")),
            "difficulty_score": float(task.get("difficulty_score", 0.5)),
            "question":         str(task.get("question", "")),
            "answer":           str(task.get("answer", "")),
            "answer_aliases":   [str(alias) for alias in task.get("answer_aliases", [])],
            "source_dataset":   str(task.get("source_dataset", "")),
        }

    rows = [_as_row(task) for task in tasks]

    # Bucket rows per domain in a single pass; original row order is kept
    # inside each bucket.
    rows_by_domain = {}
    for row in rows:
        rows_by_domain.setdefault(row["domain"], []).append(row)

    split_map = {}
    for name in sorted(rows_by_domain):
        domain_rows = rows_by_domain[name]
        split_map[name] = Dataset.from_list(domain_rows)
        print(f"  Split '{name}': {len(domain_rows)} rows")

    split_map["all"] = Dataset.from_list(rows)
    print(f"  Split 'all':    {len(rows)} rows")
    return DatasetDict(split_map)


# README.md content uploaded next to the dataset (YAML front matter + Markdown).
# The usage example addresses domains via `split=`: tasks_to_hf_dataset() builds
# a single DatasetDict keyed by domain (plus "all"), so those names are splits,
# not configuration names, and no "train" split is created.
_DATASET_CARD = """\
---
license: apache-2.0
task_categories:
- question-answering
- text-classification
language:
- en
tags:
- calibration
- metacognition
- llm-evaluation
- grpo
- openenv
size_categories:
- 10K<n<100K
---

# EchoBench

**The first public benchmark for LLM metacognitive calibration.**

EchoBench contains questions across 7 domains for training and evaluating
whether language models accurately predict their own probability of being correct.

## Domains

| Domain | Source | Description |
|--------|--------|-------------|
| Math | GSM8K | Grade-school math word problems |
| Logic | AI2-ARC | Multiple-choice science reasoning |
| Factual | TriviaQA | Open-domain factual questions |
| Science | SciQ | Multiple-choice science questions |
| Medical | MedMCQA | Medical licensing exam questions |
| Coding | Synthetic | Code output/complexity prediction |
| Creative | Synthetic | Wordplay, synonyms, literary devices |

## Usage

```python
from datasets import load_dataset

# Load all tasks
ds = load_dataset("revti126/echobench", split="all")

# Load a specific domain
math_ds = load_dataset("revti126/echobench", split="math")
print(math_ds[0])
```

## Task Format

Each row contains:
- `id` — unique task identifier (`math_easy_00042`)
- `domain` — one of math/logic/factual/science/medical/coding/creative
- `difficulty` — easy / medium / hard
- `difficulty_score` — float 0.0 (hardest) → 1.0 (easiest)
- `question` — the question text
- `answer` — canonical correct answer
- `answer_aliases` — all accepted answer strings
- `source_dataset` — originating HuggingFace dataset

## Citation

```bibtex
@misc{echobench-2025,
  title  = {EchoBench: A Benchmark for LLM Metacognitive Calibration},
  author = {Tripathi, Revtiraman and Pandey, Vikas Dev},
  year   = {2025},
  url    = {https://huggingface.co/datasets/revti126/echobench},
  note   = {Created for ECHO ULTIMATE — OpenEnv Hackathon 2025}
}
```

*Part of the [ECHO ULTIMATE](https://huggingface.co/spaces/revti126/echo-ultimate) project.*
"""


def publish_to_hub(dataset_dict, repo_id: str, token: str):
    """Upload the dataset and its README card to a HuggingFace dataset repo.

    Args:
        dataset_dict: Object exposing ``push_to_hub(repo_id, token=...)``.
        repo_id: Target repository ID in ``owner/name`` form.
        token: HuggingFace token used for every Hub call.

    Returns:
        The URL of the dataset page, ``https://huggingface.co/datasets/<repo_id>``.
    """
    from huggingface_hub import HfApi

    hub = HfApi(token=token)

    print(f"\nCreating repository: {repo_id}")
    try:
        hub.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)
    except Exception as err:
        # Best effort: report the problem and still attempt the push below.
        print(f"  Note: {err}")

    print("Pushing dataset…")
    dataset_dict.push_to_hub(repo_id, token=token)

    print("Uploading dataset card…")
    # Uploaded after the data push, so this is the README.md that ends up in
    # the repository.
    readme_upload = {
        "path_or_fileobj": _DATASET_CARD.encode(),
        "path_in_repo": "README.md",
        "repo_id": repo_id,
        "repo_type": "dataset",
        "token": token,
    }
    hub.upload_file(**readme_upload)

    dataset_url = f"https://huggingface.co/datasets/{repo_id}"
    print(f"\n✅  EchoBench published: {dataset_url}")
    return dataset_url


def main():
    """Command-line entry point: load the task bank, convert it, publish it.

    Options:
        --token  HuggingFace write token (required).
        --repo   Target dataset repo ID (default ``revti126/echobench``).
        --quiet  Suppress the opening banner; other progress output is unaffected.

    Exits with status 1 when the task bank yields no tasks.
    """
    cli = argparse.ArgumentParser(
        description="Publish ECHO task bank as EchoBench HuggingFace dataset."
    )
    cli.add_argument("--token", required=True, help="HuggingFace API write token")
    cli.add_argument(
        "--repo",
        default="revti126/echobench",
        help="HuggingFace repo ID (default: revti126/echobench)",
    )
    cli.add_argument("--quiet", action="store_true")
    opts = cli.parse_args()

    show_banner = not opts.quiet
    if show_banner:
        print("=== EchoBench Publisher ===\n")

    bank_tasks = load_tasks_from_bank()
    if not bank_tasks:
        # Nothing to publish: tell the user how to fetch the data and stop.
        print("❌  No tasks loaded. Run `python run.py download` first.")
        sys.exit(1)

    hf_dataset = tasks_to_hf_dataset(bank_tasks)
    published_url = publish_to_hub(hf_dataset, opts.repo, opts.token)

    summary_lines = (
        f"\n=== Done ===",
        f"Dataset URL: {published_url}",
        f"Add to README.md and openenv.yaml:",
        f"  dataset: {opts.repo}",
    )
    for line in summary_lines:
        print(line)


# Run the publisher only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()