Spaces:

Vikaspandey582003
/

echo-ultimate

Sleeping

App Files Files Community

echo-ultimate / scripts /publish_echobench.py

Vikaspandey582003

Upload folder using huggingface_hub

acb327b verified 13 days ago

raw

history blame contribute delete

6 kB

	"""
	EchoBench Publisher
	Converts ECHO task bank to HuggingFace Dataset and publishes to the Hub.

	Usage:
	python scripts/publish_echobench.py --token YOUR_HF_TOKEN
	python scripts/publish_echobench.py --token YOUR_HF_TOKEN --repo your-username/echobench
	"""

	import argparse
	import sys
	import os

	# Prepend the parent of this file's directory to the import path. Presumably
	# that is the repository root, which lets the `env` and `config` imports used
	# by the functions below resolve when the script is run directly — verify
	# against the repo layout.
	sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


	def load_tasks_from_bank():
	    """Gather every task held by ECHO's task bank into one flat list.

	    Instantiates ``TaskBank``, calls ``ensure_loaded()`` (which, going by the
	    progress message, may download datasets that are not cached yet), then
	    walks ``cfg.DOMAINS`` and, inside each domain, ``cfg.DIFFICULTIES``.
	    A count is printed for every domain/difficulty bucket, followed by the
	    grand total.

	    Returns:
	        list: all tasks, ordered by domain and then by difficulty. Buckets
	        missing from the bank contribute nothing.
	    """
	    from env.task_bank import TaskBank
	    from config import cfg

	    task_bank = TaskBank()
	    print("Loading task bank (downloads datasets if not cached)…")
	    task_bank.ensure_loaded()

	    collected = []
	    for domain_name in cfg.DOMAINS:
	        # NOTE: reads the bank's private ``_tasks`` mapping directly
	        # (domain -> difficulty -> list of tasks, judging by this lookup).
	        per_domain = task_bank._tasks.get(domain_name, {})
	        for level in cfg.DIFFICULTIES:
	            chunk = per_domain.get(level, [])
	            collected += chunk
	            print(f" {domain_name}/{level}: {len(chunk)} tasks")

	    print(f"\nTotal tasks: {len(collected)}")
	    return collected


	def tasks_to_hf_dataset(tasks):
	    """Build a HuggingFace ``DatasetDict`` from task dicts, one split per domain.

	    Each task is normalised to a fixed set of string/float/list-of-string
	    fields; missing keys fall back to ``""`` (``0.5`` for
	    ``difficulty_score``, ``[]`` for ``answer_aliases``). One split is created
	    per distinct domain value, in sorted order, plus a final ``"all"`` split
	    holding every row. A row count is printed for each split.

	    Args:
	        tasks: iterable of dict-like tasks (anything supporting ``.get``).

	    Returns:
	        DatasetDict: the per-domain splits followed by ``"all"``.
	    """
	    from datasets import Dataset, DatasetDict

	    rows = [
	        {
	            "id": str(item.get("id", "")),
	            "domain": str(item.get("domain", "")),
	            "difficulty": str(item.get("difficulty", "")),
	            "difficulty_score": float(item.get("difficulty_score", 0.5)),
	            "question": str(item.get("question", "")),
	            "answer": str(item.get("answer", "")),
	            "answer_aliases": [str(alias) for alias in item.get("answer_aliases", [])],
	            "source_dataset": str(item.get("source_dataset", "")),
	        }
	        for item in tasks
	    ]

	    # Group rows by domain, keeping their original relative order.
	    by_domain = {}
	    for row in rows:
	        by_domain.setdefault(row["domain"], []).append(row)

	    dataset_splits = {}
	    for name in sorted(by_domain):
	        members = by_domain[name]
	        dataset_splits[name] = Dataset.from_list(members)
	        print(f" Split '{name}': {len(members)} rows")

	    # The combined split is added last, so it replaces any domain literally
	    # named "all".
	    dataset_splits["all"] = Dataset.from_list(rows)
	    print(f" Split 'all': {len(rows)} rows")
	    return DatasetDict(dataset_splits)


	_DATASET_CARD = """\
	---
	license: apache-2.0
	task_categories:
	- question-answering
	- text-classification
	language:
	- en
	tags:
	- calibration
	- metacognition
	- llm-evaluation
	- grpo
	- openenv
	size_categories:
	- 10K<n<100K
	---

	# EchoBench

	The first public benchmark for LLM metacognitive calibration.

	EchoBench contains questions across 7 domains for training and evaluating
	whether language models accurately predict their own probability of being correct.

	## Domains

	\| Domain \| Source \| Description \|
	\|--------\|--------\|-------------\|
	\| Math \| GSM8K \| Grade-school math word problems \|
	\| Logic \| AI2-ARC \| Multiple-choice science reasoning \|
	\| Factual \| TriviaQA \| Open-domain factual questions \|
	\| Science \| SciQ \| Multiple-choice science questions \|
	\| Medical \| MedMCQA \| Medical licensing exam questions \|
	\| Coding \| Synthetic \| Code output/complexity prediction \|
	\| Creative \| Synthetic \| Wordplay, synonyms, literary devices \|

	## Usage

	```python
	from datasets import load_dataset

	# Load all tasks
	ds = load_dataset("revti126/echobench", "all")

	# Load a specific domain
	math_ds = load_dataset("revti126/echobench", "math")
	print(math_ds["train"][0])
	```

	## Task Format

	Each row contains:
	- `id` — unique task identifier (`math_easy_00042`)
	- `domain` — one of math/logic/factual/science/medical/coding/creative
	- `difficulty` — easy / medium / hard
	- `difficulty_score` — float 0.0 (hardest) → 1.0 (easiest)
	- `question` — the question text
	- `answer` — canonical correct answer
	- `answer_aliases` — all accepted answer strings
	- `source_dataset` — originating HuggingFace dataset

	## Citation

	```bibtex
	@misc{echobench-2025,
	title = {EchoBench: A Benchmark for LLM Metacognitive Calibration},
	author = {Tripathi, Revtiraman and Pandey, Vikas Dev},
	year = {2025},
	url = {https://huggingface.co/datasets/revti126/echobench},
	note = {Created for ECHO ULTIMATE — OpenEnv Hackathon 2025}
	}
	```

	Part of the [ECHO ULTIMATE](https://huggingface.co/spaces/revti126/echo-ultimate) project.
	"""


	def publish_to_hub(dataset_dict, repo_id: str, token: str) -> str:
	    """Push the dataset to the HuggingFace Hub and upload its dataset card.

	    The card text is written against the default repository
	    ``revti126/echobench``; every occurrence of that ID is rewritten to
	    ``repo_id`` before upload so the README of a custom ``--repo`` shows
	    usage snippets and URLs for the repository it actually lives in.

	    Args:
	        dataset_dict: object exposing ``push_to_hub(repo_id, token=...)``;
	            ``main()`` passes the result of ``tasks_to_hf_dataset``.
	        repo_id: target dataset repository, e.g. ``"user/echobench"``.
	        token: HuggingFace API token used for every Hub call.

	    Returns:
	        The URL of the published dataset page.
	    """
	    from huggingface_hub import HfApi

	    # Repository ID hard-coded inside the card text.
	    default_repo_id = "revti126/echobench"

	    api = HfApi(token=token)

	    print(f"\nCreating repository: {repo_id}")
	    try:
	        api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)
	    except Exception as exc:
	        # Deliberately best-effort: report and carry on. A genuine failure
	        # (e.g. a bad token) will presumably resurface in the push below.
	        print(f" Note: {exc}")

	    print("Pushing dataset…")
	    dataset_dict.push_to_hub(repo_id, token=token)

	    print("Uploading dataset card…")
	    # No-op when publishing to the default repository.
	    card_text = _DATASET_CARD.replace(default_repo_id, repo_id)
	    api.upload_file(
	        path_or_fileobj=card_text.encode("utf-8"),
	        path_in_repo="README.md",
	        repo_id=repo_id,
	        repo_type="dataset",
	        token=token,
	    )

	    url = f"https://huggingface.co/datasets/{repo_id}"
	    print(f"\n✅ EchoBench published: {url}")
	    return url


	def main():
	    """Command-line entry point: load the task bank, convert it, publish it.

	    Options:
	        --token  HuggingFace API write token (required).
	        --repo   target dataset repository (default ``revti126/echobench``).
	        --quiet  suppress the opening banner only; other output is unchanged.

	    Exits with status 1 when the task bank yields no tasks.
	    """
	    cli = argparse.ArgumentParser(
	        description="Publish ECHO task bank as EchoBench HuggingFace dataset."
	    )
	    cli.add_argument("--token", required=True, help="HuggingFace API write token")
	    cli.add_argument(
	        "--repo",
	        default="revti126/echobench",
	        help="HuggingFace repo ID (default: revti126/echobench)",
	    )
	    cli.add_argument("--quiet", action="store_true")
	    options = cli.parse_args()

	    if not options.quiet:
	        print("=== EchoBench Publisher ===\n")

	    loaded = load_tasks_from_bank()
	    if not loaded:
	        print("❌ No tasks loaded. Run `python run.py download` first.")
	        sys.exit(1)

	    published_url = publish_to_hub(
	        tasks_to_hf_dataset(loaded), options.repo, options.token
	    )

	    closing_lines = (
	        "\n=== Done ===",
	        f"Dataset URL: {published_url}",
	        "Add to README.md and openenv.yaml:",
	        f" dataset: {options.repo}",
	    )
	    for line in closing_lines:
	        print(line)


	# Run the publisher only when this file is executed as a script, not when
	# it is imported as a module.
	if __name__ == "__main__":
	main()