import argparse
import json
from pathlib import Path
|
|
def merge_json_dataset(dataset_dir: str, output_name: str = "ChemQA") -> None:
    """Merge the entries of every JSON file in ``dataset_dir`` into one file.

    Input files are processed in sorted path order. Each file must contain a
    JSON array of objects that carry an ``"index"`` key. Every entry's index
    is shifted by the number of entries merged from the preceding files, and
    the combined list is written to ``<dataset_dir>/<output_name>.json``.
    A file with that name already present in the directory is never used as
    input, so the function can be re-run on the same directory.

    Args:
        dataset_dir: Directory containing the ``.json`` files to merge.
        output_name: Name of the merged dataset, without the ``.json``
            extension (defaults to ``"ChemQA"``).

    Raises:
        FileNotFoundError: If no input JSON file is found in ``dataset_dir``.
        TypeError: If a file's top-level value is not an array, or one of its
            entries is not an object.
        KeyError: If an entry has no ``"index"`` key.
    """
    dataset_path = Path(dataset_dir)
    output_json = dataset_path / f"{output_name}.json"

    # Skip a previously generated output so a re-run does not merge it into itself.
    json_files = [
        path for path in sorted(dataset_path.glob("*.json")) if path != output_json
    ]
    if not json_files:
        raise FileNotFoundError("未找到任何JSON文件")

    merged_data = []
    for json_file in json_files:
        with open(json_file, "r", encoding="utf-8") as f:
            part_data = json.load(f)

        # Fail with the offending file name instead of an opaque indexing error.
        if not isinstance(part_data, list):
            raise TypeError(
                f"{json_file}: 顶层必须是JSON数组,实际为 {type(part_data).__name__}"
            )

        # Entries merged so far == offset to apply to this file's indices.
        offset = len(merged_data)
        for position, item in enumerate(part_data):
            if not isinstance(item, dict):
                raise TypeError(
                    f"{json_file}: 第 {position} 个条目必须是JSON对象,"
                    f"实际为 {type(item).__name__}"
                )
            item["index"] = offset + item["index"]

        merged_data.extend(part_data)

    # Written only after every input was processed, so a bad input never
    # leaves a partial output file behind.
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(merged_data, f, indent=2, ensure_ascii=False)

    print(f"\n合并完成!共处理 {len(json_files)} 个JSON文件")
    print(f"生成数据集: {output_name}.json")
    print(f"- 总条目数: {len(merged_data)} 条")
|
|
if __name__ == "__main__":
    # Command-line entry point: read the input directory and the dataset
    # name from the arguments, then run the merge.
    parser = argparse.ArgumentParser(description="合并JSON格式的问答数据集")
    # "-i" keeps working; the long form mirrors "-o/--output_name".
    parser.add_argument(
        "-i",
        "--dataset_dir",
        dest="dataset_dir",
        required=True,
        help="JSON文件所在的目录路径",
    )
    parser.add_argument(
        "-o",
        "--output_name",
        default="ChemQA",
        help="输出数据集名称(默认为ChemQA)",
    )

    args = parser.parse_args()

    merge_json_dataset(
        dataset_dir=args.dataset_dir,
        output_name=args.output_name,
    )