#!/usr/bin/env python3
"""Download the GSM8K and HumanEval test splits and save them locally as JSONL.

Each dataset is loaded with ``datasets.load_dataset`` and written, one JSON
object per line, to ``<DATA_DIR>/<dataset>/test.jsonl``.  A failure on one
dataset is reported and does not prevent the other from being attempted; the
process exit status is non-zero if any dataset failed.
"""

import json
import os
import sys

import requests  # noqa: F401  (not used below; retained from the original import list)
from datasets import load_dataset

# Root directory under which one sub-directory per dataset is created.
DATA_DIR = "/workspace/hanrui/datasets"

_RULE = "=" * 60


def _print_banner(title, leading_blank=False):
    """Print *title* framed by two rule lines.

    Args:
        title: Text shown between the rules.
        leading_blank: When true, emit an empty line before the first rule.
    """
    print(("\n" if leading_blank else "") + _RULE)
    print(title)
    print(_RULE)


def _download_split(label, subdir, hf_path, config_name=None):
    """Load the ``test`` split of one dataset and write it to ``test.jsonl``.

    Args:
        label: Human-readable dataset name used in progress messages.
        subdir: Directory name created under ``DATA_DIR`` for this dataset.
        hf_path: Dataset identifier passed to ``load_dataset``.
        config_name: Optional configuration name passed as the second
            positional argument of ``load_dataset`` (``None`` when the
            dataset is loaded without one).

    Returns:
        ``True`` if the file was written, ``False`` if any step raised.
    """
    # Broad catch is deliberate: this is the script's top-level boundary and
    # one dataset failing must not stop the remaining downloads.
    try:
        target_dir = os.path.join(DATA_DIR, subdir)
        os.makedirs(target_dir, exist_ok=True)

        print(f"Loading {label} from HuggingFace...")
        dataset = load_dataset(hf_path, config_name, split="test")

        # Save as JSONL: one record per line.  The encoding is pinned so the
        # result does not depend on the platform's default locale.
        output_file = os.path.join(target_dir, "test.jsonl")
        with open(output_file, "w", encoding="utf-8") as f:
            for item in dataset:
                f.write(json.dumps(item) + "\n")

        print(f"✓ {label} saved to {output_file}")
        print(f" Total samples: {len(dataset)}")
    except Exception as e:
        print(f"✗ {label} download failed: {e}")
        return False
    return True


def main():
    """Download both datasets and return a process exit status.

    Returns:
        ``0`` if every dataset was saved, ``1`` if at least one failed.
    """
    os.makedirs(DATA_DIR, exist_ok=True)

    _print_banner("下载 GSM8K 数据集")
    gsm8k_ok = _download_split("GSM8K", "gsm8k", "gsm8k", "main")

    _print_banner("下载 HumanEval 数据集", leading_blank=True)
    humaneval_ok = _download_split("HumanEval", "humaneval", "openai_humaneval")

    _print_banner("下载完成", leading_blank=True)
    print(f"数据保存在: {DATA_DIR}")

    # Report failure to the caller instead of always exiting with status 0.
    return 0 if gsm8k_ok and humaneval_ok else 1


if __name__ == "__main__":
    sys.exit(main())