File size: 6,000 Bytes
acb327b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
"""
EchoBench Publisher
Converts ECHO task bank to HuggingFace Dataset and publishes to the Hub.

Usage:
  python scripts/publish_echobench.py --token YOUR_HF_TOKEN
  python scripts/publish_echobench.py --token YOUR_HF_TOKEN --repo your-username/echobench
"""

import argparse
import sys
import os

# Put the parent of this script's directory at the front of sys.path so the
# function-level imports of `env.task_bank` and `config` further down can
# resolve when the file is run directly (presumably that parent directory is
# the project root -- confirm against the repository layout).
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


def load_tasks_from_bank():
    """Collect every task held by ECHO's task bank into one flat list.

    Buckets are visited domain by domain and, inside each domain, difficulty
    by difficulty, following the order of ``cfg.DOMAINS`` and
    ``cfg.DIFFICULTIES``. The size of each bucket and the grand total are
    printed as progress output.

    Returns:
        A list with the contents of all buckets concatenated in visiting
        order. Buckets missing from the bank contribute nothing.
    """
    from env.task_bank import TaskBank
    from config import cfg

    task_bank = TaskBank()
    print("Loading task bank (downloads datasets if not cached)…")
    task_bank.ensure_loaded()

    collected = []
    for domain in cfg.DOMAINS:
        for difficulty in cfg.DIFFICULTIES:
            # Reads the bank's private mapping directly; an unknown domain or
            # difficulty simply yields an empty bucket.
            per_difficulty = task_bank._tasks.get(domain, {})
            tasks_here = per_difficulty.get(difficulty, [])
            collected.extend(tasks_here)
            print(f"  {domain}/{difficulty}: {len(tasks_here)} tasks")

    total = len(collected)
    print(f"\nTotal tasks: {total}")
    return collected


def tasks_to_hf_dataset(tasks):
    """Build a HuggingFace ``DatasetDict`` from ECHO task dicts.

    Each task is normalised to a fixed set of columns. Missing keys fall back
    to an empty string, except ``difficulty_score`` (``0.5``) and
    ``answer_aliases`` (empty list). One split is created per distinct domain
    value, in sorted order, followed by an ``"all"`` split holding every row.

    Args:
        tasks: Iterable of dict-like task records.

    Returns:
        A ``DatasetDict`` keyed by domain name plus ``"all"``.
    """
    from datasets import Dataset, DatasetDict

    def to_row(task):
        # Coerce every field to a plain type so all rows share one schema.
        return {
            "id":               str(task.get("id", "")),
            "domain":           str(task.get("domain", "")),
            "difficulty":       str(task.get("difficulty", "")),
            "difficulty_score": float(task.get("difficulty_score", 0.5)),
            "question":         str(task.get("question", "")),
            "answer":           str(task.get("answer", "")),
            "answer_aliases":   [str(alias) for alias in task.get("answer_aliases", [])],
            "source_dataset":   str(task.get("source_dataset", "")),
        }

    rows = [to_row(task) for task in tasks]

    # Group rows by domain, keeping the original row order inside each group.
    rows_by_domain = {}
    for row in rows:
        rows_by_domain.setdefault(row["domain"], []).append(row)

    dataset_splits = {}
    for name in sorted(rows_by_domain):
        members = rows_by_domain[name]
        dataset_splits[name] = Dataset.from_list(members)
        print(f"  Split '{name}': {len(members)} rows")

    dataset_splits["all"] = Dataset.from_list(rows)
    print(f"  Split 'all':    {len(rows)} rows")
    return DatasetDict(dataset_splits)


_DATASET_CARD = """\
---
license: apache-2.0
task_categories:
- question-answering
- text-classification
language:
- en
tags:
- calibration
- metacognition
- llm-evaluation
- grpo
- openenv
size_categories:
- 10K<n<100K
---

# EchoBench

**The first public benchmark for LLM metacognitive calibration.**

EchoBench contains questions across 7 domains for training and evaluating
whether language models accurately predict their own probability of being correct.

## Domains

| Domain | Source | Description |
|--------|--------|-------------|
| Math | GSM8K | Grade-school math word problems |
| Logic | AI2-ARC | Multiple-choice science reasoning |
| Factual | TriviaQA | Open-domain factual questions |
| Science | SciQ | Multiple-choice science questions |
| Medical | MedMCQA | Medical licensing exam questions |
| Coding | Synthetic | Code output/complexity prediction |
| Creative | Synthetic | Wordplay, synonyms, literary devices |

## Usage

```python
from datasets import load_dataset

# Load all tasks
ds = load_dataset("revti126/echobench", "all")

# Load a specific domain
math_ds = load_dataset("revti126/echobench", "math")
print(math_ds["train"][0])
```

## Task Format

Each row contains:
- `id` — unique task identifier (`math_easy_00042`)
- `domain` — one of math/logic/factual/science/medical/coding/creative
- `difficulty` — easy / medium / hard
- `difficulty_score` — float 0.0 (hardest) → 1.0 (easiest)
- `question` — the question text
- `answer` — canonical correct answer
- `answer_aliases` — all accepted answer strings
- `source_dataset` — originating HuggingFace dataset

## Citation

```bibtex
@misc{echobench-2025,
  title  = {EchoBench: A Benchmark for LLM Metacognitive Calibration},
  author = {Tripathi, Revtiraman and Pandey, Vikas Dev},
  year   = {2025},
  url    = {https://huggingface.co/datasets/revti126/echobench},
  note   = {Created for ECHO ULTIMATE — OpenEnv Hackathon 2025}
}
```

*Part of the [ECHO ULTIMATE](https://huggingface.co/spaces/revti126/echo-ultimate) project.*
"""


def publish_to_hub(dataset_dict, repo_id: str, token: str):
    """Push the dataset to the HuggingFace Hub and upload its dataset card.

    The card template embeds the default repository id in its usage snippet
    and citation URL; before upload those occurrences are rewritten to
    ``repo_id`` so the published README matches the repository it lives in.

    Args:
        dataset_dict: Object exposing ``push_to_hub(repo_id, token=...)``,
            i.e. the ``DatasetDict`` produced by ``tasks_to_hf_dataset``.
        repo_id: Target dataset repository, e.g. ``"user/echobench"``.
        token: HuggingFace API token with write access.

    Returns:
        The URL of the published dataset page.
    """
    from huggingface_hub import HfApi

    default_repo_id = "revti126/echobench"

    api = HfApi(token=token)

    print(f"\nCreating repository: {repo_id}")
    try:
        api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)
    except Exception as exc:
        # Deliberately non-fatal: report the problem and continue to the push.
        print(f"  Note: {exc}")

    print("Pushing dataset…")
    dataset_dict.push_to_hub(repo_id, token=token)

    # Point the card's examples and citation URL at the repo actually being
    # published; this is a no-op for the default repository.
    card_text = _DATASET_CARD.replace(default_repo_id, repo_id)

    print("Uploading dataset card…")
    api.upload_file(
        path_or_fileobj=card_text.encode(),
        path_in_repo="README.md",
        repo_id=repo_id,
        repo_type="dataset",
        token=token,
    )

    url = f"https://huggingface.co/datasets/{repo_id}"
    print(f"\n✅  EchoBench published: {url}")
    return url


def main():
    """Command-line entry point: load the task bank, convert it, publish it.

    Parses ``--token`` (required), ``--repo`` and ``--quiet``. ``--quiet``
    only suppresses the opening banner. Exits with status 1 when the task
    bank yields no tasks.
    """
    cli = argparse.ArgumentParser(
        description="Publish ECHO task bank as EchoBench HuggingFace dataset."
    )
    cli.add_argument("--token", required=True, help="HuggingFace API write token")
    cli.add_argument(
        "--repo",
        default="revti126/echobench",
        help="HuggingFace repo ID (default: revti126/echobench)",
    )
    cli.add_argument("--quiet", action="store_true")
    options = cli.parse_args()

    if not options.quiet:
        print("=== EchoBench Publisher ===\n")

    tasks = load_tasks_from_bank()
    if not tasks:
        # Nothing to publish; the hint names the command that fetches data.
        print("❌  No tasks loaded. Run `python run.py download` first.")
        sys.exit(1)

    converted = tasks_to_hf_dataset(tasks)
    published_url = publish_to_hub(converted, options.repo, options.token)

    print("\n=== Done ===")
    print(f"Dataset URL: {published_url}")
    print("Add to README.md and openenv.yaml:")
    print(f"  dataset: {options.repo}")


# Run the publisher only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()