V2 / extract_questions.py

Upload extract_questions.py with huggingface_hub

a7d4c7b verified over 1 year ago

8.27 kB

	import os
	import json
	import shutil

	# Load the keyword file and build the keyword -> metadata lookup table.
	# Every non-empty line must hold exactly four comma-separated fields:
	# keyword, department, task, modality. Lines with any other field count are
	# reported and skipped; a keyword that appears twice keeps its last entry.
	keyword_file = '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI-MMbench-CoT/output_multi_column.txt'
	keyword_dict = {}

	with open(keyword_file, 'r', encoding='utf-8') as f:
	    for line in f:
	        line = line.strip()
	        if not line:
	            continue  # skip blank lines
	        parts = line.split(',')
	        if len(parts) != 4:
	            # Malformed row: log it and carry on with the next line.
	            print(f"格式错误，跳过此行：{line}")
	            continue
	        keyword, department, task, modality = [p.strip() for p in parts]
	        keyword_dict[keyword] = {
	            'department': department,
	            'task': task,
	            'modality': modality,
	        }

	print(f"总共加载了 {len(keyword_dict)} 个关键词。")

	# Departments whose questions should be extracted. The order of this list
	# is also the order used when the per-department summary is printed.
	departments = [
	    'Cardiovascular Surgery',
	    'Dermatology',
	    'Endocrinology',
	    'Gastroenterology and Hepatology',
	    'General Surgery',
	    'Hematology',
	    'Infectious Diseases',
	    'Laboratory Medicine and Pathology',
	    'Nephrology and Hypertension',
	    'Neurosurgery',
	    'Obstetrics and Gynecology',
	    'Oncology (Medical)',
	    'Ophthalmology',
	    'Orthopedic Surgery',
	    'Otolaryngology (ENT)/Head and Neck Surgery',
	    'Pulmonary Medicine',
	    'Sports Medicine',
	    'Urology',
	]

	# Map a department name to the directory name used for it on disk.
	def get_department_dir_name(department: str) -> str:
	    """Return the directory name to use for *department*.

	    Every department maps to itself except
	    'Otolaryngology (ENT)/Head and Neck Surgery', which is shortened to
	    'Otolaryngology (ENT)'.
	    """
	    # NOTE(review): presumably shortened because the '/' in the full name
	    # would be treated as a path separator when joined into a path - confirm.
	    if department == 'Otolaryngology (ENT)/Head and Neck Surgery':
	        return 'Otolaryngology (ENT)'
	    return department

	# Set view of the department list, used for membership tests.
	departments_set = set(departments)

	# Source directories that are walked for question JSON files.
	source_dirs = [
	    '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI/cls_2d',
	    '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI/det_2d',
	    '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI/semantic_seg_2d',
	    '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI/semantic_seg_3d',
	]

	# Base directory under which the per-department copies are written.
	destination_root = '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI-MMbench-CoT'

	# Running totals for statistics and debugging.
	total_files_processed = files_matched = images_copied = 0

	# Matched-file count per department, starting at zero for each one.
	department_file_counts = dict.fromkeys(departments, 0)

	# JSON keys whose values name image files to copy with the question.
	image_keys = ['img_mask_path', 'img_contour_path', 'img_bbox_path', 'img_path']

	# Walk every source directory. For each question JSON, look up the text of
	# the correct option in keyword_dict to find its department; when that
	# department is in departments_set, copy the JSON and the images it names
	# into destination_root/<department dir>/, mirroring the layout under the
	# GMAI root.

	# Common root of the source directories; destination paths are built from
	# source paths taken relative to this directory.
	gmai_root = '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI'

	for source_dir in source_dirs:
	    print(f"正在遍历目录：{source_dir}")
	    for root, _dirs, files in os.walk(source_dir):
	        for file in files:
	            if not file.endswith('.json'):
	                continue
	            total_files_processed += 1
	            source_file_path = os.path.join(root, file)
	            # Per-file boundary: any failure is reported and the walk goes on
	            # with the next file instead of aborting the whole run.
	            try:
	                with open(source_file_path, 'r', encoding='utf-8') as f:
	                    data = json.load(f)

	                answer_letter = data.get('answer', '').strip()
	                options = data.get('options', [])
	                if not answer_letter or not options:
	                    print(f"文件缺少 'answer' 或 'options' 字段，跳过：{source_file_path}")
	                    continue

	                # Build the option-letter -> option-text mapping.
	                option_dict = {}
	                for opt in options:
	                    if isinstance(opt, str) and len(opt) > 2 and opt[1] == '.':
	                        # Slice right after the dot and strip, so both
	                        # "A. text" and "A.text" yield "text" (slicing at 3
	                        # would drop the first character of the latter).
	                        option_dict[opt[0]] = opt[2:].strip()
	                    else:
	                        print(f"选项格式错误，文件：{source_file_path}，选项：{opt}")

	                # The keyword is the text of the correct option.
	                keyword = option_dict.get(answer_letter)
	                if not keyword:
	                    print(f"答案字母 '{answer_letter}' 在选项中未找到，文件：{source_file_path}")
	                    continue
	                print(f"处理文件：{source_file_path}")
	                print(f"关键词：'{keyword}'")

	                if keyword not in keyword_dict:
	                    print(f"关键词 '{keyword}' 不在关键词列表中。")
	                    continue
	                department = keyword_dict[keyword]['department']
	                print(f"关键词 '{keyword}' 的科室为：'{department}'")
	                if department not in departments_set:
	                    print(f"科室 '{department}' 不在处理列表中，不复制文件。")
	                    continue

	                destination_base = os.path.join(
	                    destination_root, get_department_dir_name(department)
	                )

	                # Copy the JSON file, keeping its path relative to the GMAI root.
	                relative_path = os.path.relpath(source_file_path, gmai_root)
	                destination_file_path = os.path.join(destination_base, relative_path)
	                destination_dir = os.path.dirname(destination_file_path)
	                if not os.path.exists(destination_dir):
	                    # exist_ok avoids a crash if the directory appears between
	                    # the check and the call.
	                    os.makedirs(destination_dir, exist_ok=True)
	                    print(f"创建目录：{destination_dir}")
	                shutil.copy2(source_file_path, destination_file_path)
	                print(f"已复制文件到：{destination_file_path}")

	                # Count the file only once its copy has succeeded, and update
	                # both tallies together so the overall and per-department
	                # numbers stay consistent even if an image copy fails below.
	                files_matched += 1
	                department_file_counts[department] += 1

	                # Copy every image the JSON refers to.
	                for image_key in image_keys:
	                    if image_key not in data:
	                        continue
	                    # Image paths are relative to <source_dir>/images.
	                    source_image_path = os.path.join(source_dir, 'images', data[image_key])
	                    if not os.path.exists(source_image_path):
	                        print(f"源图片不存在，跳过：{source_image_path}")
	                        continue
	                    # Relative to the GMAI root, so the 'images' directory is
	                    # part of the mirrored path.
	                    relative_image_path = os.path.relpath(source_image_path, gmai_root)
	                    destination_image_path = os.path.join(destination_base, relative_image_path)
	                    destination_image_dir = os.path.dirname(destination_image_path)
	                    if not os.path.exists(destination_image_dir):
	                        os.makedirs(destination_image_dir, exist_ok=True)
	                        print(f"创建图片目录：{destination_image_dir}")
	                    shutil.copy2(source_image_path, destination_image_path)
	                    images_copied += 1
	                    print(f"已复制图片到：{destination_image_path}")
	            except Exception as e:
	                print(f"处理文件 {source_file_path} 时发生错误：{e}")

	# Report the overall totals gathered during the walk.
	print(f"总共处理了 {total_files_processed} 个 JSON 文件。")
	print(f"总共匹配并复制了 {files_matched} 个 JSON 文件。")
	print(f"总共复制了 {images_copied} 张图片。")

	# Report the per-department counts, labelled with the directory name each
	# department was written to.
	print("每个科室匹配并复制的文件数量：")
	for dept in departments:
	    count = department_file_counts[dept]
	    dept_dir_name = get_department_dir_name(dept)
	    print(f"{dept_dir_name}: {count} 个文件")