Hanrui / SpecForge-ext /download_datasets.py
Lekr0's picture
Add files using upload-large-folder tool
7c50656 verified
#!/usr/bin/env python3
"""
Download the GSM8K and HumanEval test splits from the Hugging Face Hub and
save each one locally as a JSONL file.
"""
import os
import json
import requests
from datasets import load_dataset
# Root directory; each dataset is written to its own sub-directory below it.
DATA_DIR = "/workspace/hanrui/datasets"
os.makedirs(DATA_DIR, exist_ok=True)


def _save_split_as_jsonl(label, subdir, path, name=None, split="test"):
    """Download one dataset split and store it as ``<DATA_DIR>/<subdir>/test.jsonl``.

    The download is best-effort: any failure is reported on stdout and
    swallowed so that one dataset failing does not prevent the next one from
    being fetched.

    Args:
        label: Human-readable dataset name used in progress messages.
        subdir: Name of the sub-directory of ``DATA_DIR`` to write into.
        path: Dataset identifier passed to ``load_dataset``.
        name: Optional dataset configuration name passed to ``load_dataset``.
        split: Split to download; defaults to ``"test"``.
    """
    try:
        target_dir = os.path.join(DATA_DIR, subdir)
        os.makedirs(target_dir, exist_ok=True)

        print(f"Loading {label} from HuggingFace...")
        dataset = load_dataset(path, name, split=split)

        # Save as JSONL: one JSON object per line.
        output_file = os.path.join(target_dir, "test.jsonl")
        # Write to a temporary sibling first and rename on success, so a
        # failure part-way through never truncates or corrupts an existing
        # test.jsonl from a previous run.
        tmp_file = output_file + ".tmp"
        try:
            with open(tmp_file, "w", encoding="utf-8") as f:
                f.writelines(json.dumps(item) + "\n" for item in dataset)
            os.replace(tmp_file, output_file)
        finally:
            # Only still present if the write or the rename failed.
            if os.path.exists(tmp_file):
                os.remove(tmp_file)

        print(f"✓ {label} saved to {output_file}")
        print(f" Total samples: {len(dataset)}")
    except Exception as e:
        # Deliberately broad: report the failure and carry on with the
        # remaining datasets instead of aborting the whole script.
        print(f"✗ {label} download failed: {e}")


print("=" * 60)
print("下载 GSM8K 数据集")
print("=" * 60)
_save_split_as_jsonl("GSM8K", "gsm8k", "gsm8k", "main")

print("\n" + "=" * 60)
print("下载 HumanEval 数据集")
print("=" * 60)
_save_split_as_jsonl("HumanEval", "humaneval", "openai_humaneval")

print("\n" + "=" * 60)
print("下载完成")
print("=" * 60)
print(f"数据保存在: {DATA_DIR}")