| import os |
| import json |
| import shutil |
|
|
| |
# Comma-separated lookup table; each usable line has exactly four fields:
# keyword, department, task, modality.
keyword_file = '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI-MMbench-CoT/output_multi_column.txt'

# Maps keyword -> {'department': ..., 'task': ..., 'modality': ...}.
# A keyword that appears on several lines keeps the values of its last line.
keyword_dict = {}

with open(keyword_file, 'r', encoding='utf-8') as table:
    for raw_line in table:
        line = raw_line.strip()
        if not line:
            # Blank lines are ignored silently.
            continue
        fields = [field.strip() for field in line.split(',')]
        if len(fields) != 4:
            # Anything that is not exactly four fields is reported and skipped.
            print(f"格式错误,跳过此行:{line}")
            continue
        keyword, department, task, modality = fields
        keyword_dict[keyword] = dict(
            department=department,
            task=task,
            modality=modality,
        )

print(f"总共加载了 {len(keyword_dict)} 个关键词。")
|
|
| |
# Departments whose files are copied. The order of this list is also the
# order in which per-department counts are reported at the end of the script.
departments = [
    'Cardiovascular Surgery',
    'Dermatology',
    'Endocrinology',
    'Gastroenterology and Hepatology',
    'General Surgery',
    'Hematology',
    'Infectious Diseases',
    'Laboratory Medicine and Pathology',
    'Nephrology and Hypertension',
    'Neurosurgery',
    'Obstetrics and Gynecology',
    'Oncology (Medical)',
    'Ophthalmology',
    'Orthopedic Surgery',
    'Otolaryngology (ENT)/Head and Neck Surgery',
    'Pulmonary Medicine',
    'Sports Medicine',
    'Urology',
]
|
|
| |
def get_department_dir_name(department):
    """Return the directory name to use for *department*.

    Every department maps to itself except
    'Otolaryngology (ENT)/Head and Neck Surgery', which is shortened to
    'Otolaryngology (ENT)'. That name is the only one handled specially and
    it contains a '/'; presumably the shortening keeps the name usable as a
    single path component.
    """
    ent_full_name = 'Otolaryngology (ENT)/Head and Neck Surgery'
    return 'Otolaryngology (ENT)' if department == ent_full_name else department
|
|
| |
# Set form of the department list, for constant-time membership tests.
departments_set = set(departments)

# Directory trees that are walked for *.json files.
source_dirs = [
    '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI/cls_2d',
    '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI/det_2d',
    '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI/semantic_seg_2d',
    '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI/semantic_seg_3d',
]

# Root under which one sub-directory per department is created.
destination_root = '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI-MMbench-CoT'

# Run-wide counters reported in the final summary.
total_files_processed = files_matched = images_copied = 0

# Per-department counter, starting at zero for every listed department.
department_file_counts = dict.fromkeys(departments, 0)

# JSON keys that may hold an image path to copy alongside the JSON file.
image_keys = ['img_mask_path', 'img_contour_path', 'img_bbox_path', 'img_path']
|
|
| |
# Common root of every entry in source_dirs. Copied files keep their path
# relative to this root underneath the department directory.
source_root = '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI'


def _parse_options(options, source_file_path):
    """Build a {letter: text} mapping from options shaped like "A. text".

    An option is accepted when it is a string longer than two characters
    whose second character is '.'. Anything else is reported (with
    *source_file_path* for context) and left out of the result.
    """
    option_dict = {}
    for opt in options:
        if isinstance(opt, str) and len(opt) > 2 and opt[1] == '.':
            # Slice right after the dot and strip, so both "A. text" and
            # "A.text" yield "text" without losing the first character.
            option_dict[opt[0]] = opt[2:].strip()
        else:
            print(f"选项格式错误,文件:{source_file_path},选项:{opt}")
    return option_dict


def _copy_into_tree(source_path, destination_path, created_label, copied_label):
    """Copy *source_path* to *destination_path*, creating parent directories.

    *created_label* and *copied_label* are the message prefixes printed when
    a directory is created and when the copy has finished, respectively.
    Errors from the filesystem calls propagate to the caller.
    """
    destination_dir = os.path.dirname(destination_path)
    if not os.path.exists(destination_dir):
        # exist_ok guards against the directory appearing between the check
        # and the call (e.g. a concurrent run).
        os.makedirs(destination_dir, exist_ok=True)
        print(f"{created_label}:{destination_dir}")
    shutil.copy2(source_path, destination_path)
    print(f"{copied_label}:{destination_path}")


# Walk every source tree. For each JSON file, resolve the text of the correct
# answer option, look it up in keyword_dict, and when its department is one
# of the listed ones copy the JSON file and its referenced images into
# <destination_root>/<department dir>/<path relative to source_root>.
for source_dir in source_dirs:
    print(f"正在遍历目录:{source_dir}")
    for root, _dirs, files in os.walk(source_dir):
        for file_name in files:
            if not file_name.endswith('.json'):
                continue
            total_files_processed += 1
            source_file_path = os.path.join(root, file_name)
            # One bad file must not stop the run: any failure is reported
            # and the walk moves on to the next file.
            try:
                with open(source_file_path, 'r', encoding='utf-8') as json_file:
                    data = json.load(json_file)

                # `or` also treats explicit nulls as missing fields.
                answer_letter = (data.get('answer') or '').strip()
                options = data.get('options') or []
                if not answer_letter or not options:
                    print(f"文件缺少 'answer' 或 'options' 字段,跳过:{source_file_path}")
                    continue

                option_dict = _parse_options(options, source_file_path)

                # The keyword is the text of the option named by the answer.
                keyword = option_dict.get(answer_letter)
                if not keyword:
                    print(f"答案字母 '{answer_letter}' 在选项中未找到,文件:{source_file_path}")
                    continue
                print(f"处理文件:{source_file_path}")
                print(f"关键词:'{keyword}'")

                department_info = keyword_dict.get(keyword)
                if department_info is None:
                    print(f"关键词 '{keyword}' 不在关键词列表中。")
                    continue

                department = department_info['department']
                print(f"关键词 '{keyword}' 的科室为:'{department}'")
                if department not in departments_set:
                    print(f"科室 '{department}' 不在处理列表中,不复制文件。")
                    continue

                department_dir_name = get_department_dir_name(department)
                destination_base = os.path.join(destination_root, department_dir_name)

                relative_path = os.path.relpath(source_file_path, source_root)
                destination_file_path = os.path.join(destination_base, relative_path)
                _copy_into_tree(
                    source_file_path, destination_file_path,
                    "创建目录", "已复制文件到",
                )

                # Count only after the JSON copy succeeded, and bump both
                # counters together so the overall total always equals the
                # sum of the per-department totals.
                files_matched += 1
                department_file_counts[department] += 1

                for image_key in image_keys:
                    if image_key not in data:
                        continue
                    image_path = data[image_key]
                    if not isinstance(image_path, str) or not image_path:
                        # A null/empty/non-string entry cannot name a file;
                        # skip it instead of failing on the remaining keys.
                        print(f"图片路径无效,跳过:{image_key}={image_path!r}")
                        continue

                    source_image_path = os.path.join(source_dir, 'images', image_path)
                    if not os.path.exists(source_image_path):
                        print(f"源图片不存在,跳过:{source_image_path}")
                        continue

                    relative_image_path = os.path.relpath(source_image_path, source_root)
                    destination_image_path = os.path.join(destination_base, relative_image_path)
                    _copy_into_tree(
                        source_image_path, destination_image_path,
                        "创建图片目录", "已复制图片到",
                    )
                    images_copied += 1
            except Exception as e:
                print(f"处理文件 {source_file_path} 时发生错误:{e}")
|
|
# Overall totals, one per line.
print(
    f"总共处理了 {total_files_processed} 个 JSON 文件。",
    f"总共匹配并复制了 {files_matched} 个 JSON 文件。",
    f"总共复制了 {images_copied} 张图片。",
    sep='\n',
)

# Per-department totals, listed in the order of `departments` and labelled
# with the directory name each department is stored under.
print("每个科室匹配并复制的文件数量:")
for dept in departments:
    print(f"{get_department_dir_name(dept)}: {department_file_counts[dept]} 个文件")
|
|