"""
Download the GSM8K and HumanEval datasets to local disk.

For each dataset the ``test`` split is loaded with ``datasets.load_dataset``
and written, one JSON object per line, to ``<DATA_DIR>/<subdir>/test.jsonl``.
A failure while handling one dataset is reported on stdout and does not stop
the script from attempting the next one.
"""
import os
import json
import requests  # NOTE(review): not referenced in the visible code; kept in case other code relies on it
from datasets import load_dataset

# Root directory that receives one sub-directory per dataset.
DATA_DIR = "/workspace/hanrui/datasets"
os.makedirs(DATA_DIR, exist_ok=True)


def _download_test_split(label, subdir, *load_args):
    """Load a dataset's ``test`` split and dump it as JSON Lines.

    Args:
        label: Human-readable dataset name used in the progress messages.
        subdir: Directory name created under ``DATA_DIR`` for this dataset.
        *load_args: Positional arguments forwarded to ``load_dataset``
            (dataset identifier and, optionally, its configuration name).

    Returns:
        ``True`` when the file was written, ``False`` when any step raised.
        Errors are printed rather than propagated so that one failing
        dataset does not prevent the remaining downloads.
    """
    try:
        target_dir = os.path.join(DATA_DIR, subdir)
        os.makedirs(target_dir, exist_ok=True)

        print(f"Loading {label} from HuggingFace...")
        dataset = load_dataset(*load_args, split="test")

        output_file = os.path.join(target_dir, "test.jsonl")
        # Pin the encoding so the output does not depend on the platform's
        # default locale encoding.
        with open(output_file, 'w', encoding="utf-8") as f:
            f.writelines(json.dumps(item) + '\n' for item in dataset)

        print(f"✓ {label} saved to {output_file}")
        print(f" Total samples: {len(dataset)}")
    except Exception as e:
        # Deliberately best-effort: report the problem and carry on.
        print(f"✗ {label} download failed: {e}")
        return False
    return True


print("=" * 60)
print("下载 GSM8K 数据集")
print("=" * 60)
_download_test_split("GSM8K", "gsm8k", "gsm8k", "main")

print("\n" + "=" * 60)
print("下载 HumanEval 数据集")
print("=" * 60)
_download_test_split("HumanEval", "humaneval", "openai_humaneval")

print("\n" + "=" * 60)
print("下载完成")
print("=" * 60)
print(f"数据保存在: {DATA_DIR}")