FasterDFlash
/

Hanrui

Model card Files Files and versions

Hanrui / SpecForge-ext /download_datasets.py

Lekr0's picture

Add files using upload-large-folder tool

7c50656 verified 6 days ago

history blame contribute delete

1.65 kB

	#!/usr/bin/env python3
	"""
	Download the GSM8K and HumanEval datasets to local disk.
	"""
	import os
	import json
	import requests
	from datasets import load_dataset

	# Root directory that receives one sub-directory per dataset.
	DATA_DIR = "/workspace/hanrui/datasets"
	os.makedirs(DATA_DIR, exist_ok=True)

	# (display name, sub-directory under DATA_DIR, positional args for load_dataset)
	_DATASETS = (
	    ("GSM8K", "gsm8k", ("gsm8k", "main")),
	    ("HumanEval", "humaneval", ("openai_humaneval",)),
	)


	def _print_banner(title, *, leading_blank=True):
	    """Print *title* framed by two 60-character ``=`` rules.

	    Args:
	        title: Text printed between the two rules.
	        leading_blank: When true, the first rule is preceded by a newline
	            so consecutive sections are visually separated.
	    """
	    rule = "=" * 60
	    print(("\n" if leading_blank else "") + rule)
	    print(title)
	    print(rule)


	def _export_test_split(label, subdir, load_args):
	    """Load the ``test`` split of one dataset and dump it as JSON Lines.

	    The split is written to ``<DATA_DIR>/<subdir>/test.jsonl``, one JSON
	    object per line.

	    Args:
	        label: Dataset name used in the progress and error messages.
	        subdir: Directory name created under ``DATA_DIR``.
	        load_args: Positional arguments forwarded to ``load_dataset``
	            (dataset id, optionally followed by a config name).

	    Returns:
	        True when the file was written, False when any step raised.
	    """
	    # Deliberately broad: a failure for one dataset is reported and must not
	    # prevent the remaining datasets from being attempted.
	    try:
	        target_dir = os.path.join(DATA_DIR, subdir)
	        os.makedirs(target_dir, exist_ok=True)

	        print(f"Loading {label} from HuggingFace...")
	        dataset = load_dataset(*load_args, split="test")

	        output_file = os.path.join(target_dir, "test.jsonl")
	        # Explicit encoding keeps the output independent of the locale.
	        with open(output_file, "w", encoding="utf-8") as f:
	            f.writelines(json.dumps(item) + "\n" for item in dataset)

	        print(f"✓ {label} saved to {output_file}")
	        print(f" Total samples: {len(dataset)}")
	    except Exception as e:
	        print(f"✗ {label} download failed: {e}")
	        return False
	    return True


	failed = []
	for position, (label, subdir, load_args) in enumerate(_DATASETS):
	    # Only the very first banner is printed without a preceding blank line.
	    _print_banner(f"下载 {label} 数据集", leading_blank=position > 0)
	    if not _export_test_split(label, subdir, load_args):
	        failed.append(label)

	_print_banner("下载完成")
	print(f"数据保存在: {DATA_DIR}")

	if failed:
	    # Surface failures in the summary and in the exit status so shell
	    # pipelines and CI jobs can tell that the data is incomplete.
	    print(f"✗ Failed datasets: {', '.join(failed)}")
	    raise SystemExit(1)