File size: 8,266 Bytes
a7d4c7b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 |
import os
import json
import shutil
# Read the keyword file and build the keyword lookup table.
# Every non-empty line must hold exactly four comma-separated fields:
#   keyword, department, task, modality
# Lines with any other field count are reported and skipped. When a keyword
# occurs more than once, the last occurrence wins.
keyword_file = '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI-MMbench-CoT/output_multi_column.txt'
keyword_dict = {}
with open(keyword_file, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line == '':
            # Blank lines carry no record.
            continue
        fields = [field.strip() for field in line.split(',')]
        if len(fields) != 4:
            print(f"格式错误,跳过此行:{line}")
            continue
        keyword, department, task, modality = fields
        keyword_dict[keyword] = dict(
            department=department,
            task=task,
            modality=modality,
        )
print(f"总共加载了 {len(keyword_dict)} 个关键词。")
# Departments whose files are selected for copying. The order of this list is
# also the order in which the per-department summary is printed.
departments = [
    'Cardiovascular Surgery',
    'Dermatology',
    'Endocrinology',
    'Gastroenterology and Hepatology',
    'General Surgery',
    'Hematology',
    'Infectious Diseases',
    'Laboratory Medicine and Pathology',
    'Nephrology and Hypertension',
    'Neurosurgery',
    'Obstetrics and Gynecology',
    'Oncology (Medical)',
    'Ophthalmology',
    'Orthopedic Surgery',
    'Otolaryngology (ENT)/Head and Neck Surgery',
    'Pulmonary Medicine',
    'Sports Medicine',
    'Urology',
]
# Map a department name to the directory name used for it, handling the one
# special case.
def get_department_dir_name(department):
    """Return the directory name to use for *department*.

    Every department maps to itself except
    'Otolaryngology (ENT)/Head and Neck Surgery', which is shortened to
    'Otolaryngology (ENT)' (presumably so the '/' in the name does not act as
    a path separator when the name is joined onto a directory path).
    """
    ent_full_name = 'Otolaryngology (ENT)/Head and Neck Surgery'
    if department != ent_full_name:
        return department
    return 'Otolaryngology (ENT)'
# Set view of the department list, used for membership tests.
departments_set = {*departments}

# Directories that are walked recursively in search of JSON files.
source_dirs = [
    '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI/cls_2d',
    '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI/det_2d',
    '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI/semantic_seg_2d',
    '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI/semantic_seg_3d',
]

# Base directory under which one sub-directory per department is populated.
destination_root = '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI-MMbench-CoT'

# Run-wide counters used for the final statistics / debugging output.
total_files_processed = files_matched = images_copied = 0

# Number of matched files per department, every department starting at zero.
department_file_counts = dict.fromkeys(departments, 0)

# JSON keys whose values are image paths that get copied along with the JSON.
image_keys = ['img_mask_path', 'img_contour_path', 'img_bbox_path', 'img_path']
# Walk every source directory, pick the JSON files whose correct answer maps
# (through keyword_dict) to one of the selected departments, and copy each such
# JSON file together with the images it references into
#   destination_root/<department dir>/<path relative to the GMAI root>

# Common parent of all source directories. Copied paths are made relative to
# it so the cls_2d/det_2d/... layout is kept below each department directory.
gmai_root = '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI'


def _build_option_dict(options, source_file_path):
    """Map option letters to option texts for entries shaped like 'A. text'.

    Entries that are not strings or do not have a period as their second
    character are reported (with *source_file_path* for context) and left out
    of the returned dict.
    """
    option_dict = {}
    for opt in options:
        if isinstance(opt, str) and len(opt) > 2 and opt[1] == '.':
            # Slice right after the period and strip, so both 'A. text' and
            # 'A.text' yield 'text' (slicing at 3 ate the first character of
            # the text whenever the space was missing).
            option_dict[opt[0]] = opt[2:].strip()
        else:
            print(f"选项格式错误,文件:{source_file_path},选项:{opt}")
    return option_dict


def _copy_under(source_path, destination_base, created_label):
    """Copy *source_path* below *destination_base* and return the new path.

    The destination keeps the file's path relative to gmai_root. The parent
    directory is created when missing, in which case a message prefixed with
    *created_label* is printed. Errors from the file system propagate to the
    caller.
    """
    relative_path = os.path.relpath(source_path, gmai_root)
    destination_path = os.path.join(destination_base, relative_path)
    destination_dir = os.path.dirname(destination_path)
    if not os.path.exists(destination_dir):
        # exist_ok avoids a crash if the directory appears between the check
        # above and this call.
        os.makedirs(destination_dir, exist_ok=True)
        print(f"{created_label}:{destination_dir}")
    shutil.copy2(source_path, destination_path)
    return destination_path


for source_dir in source_dirs:
    print(f"正在遍历目录:{source_dir}")
    for root, dirs, files in os.walk(source_dir):
        for file in files:
            if not file.endswith('.json'):
                continue
            total_files_processed += 1
            source_file_path = os.path.join(root, file)
            try:
                with open(source_file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                # A null or non-string answer is treated like a missing one
                # instead of failing on .strip().
                answer_letter = str(data.get('answer') or '').strip()
                options = data.get('options', [])
                if not answer_letter or not options:
                    print(f"文件缺少 'answer' 或 'options' 字段,跳过:{source_file_path}")
                    continue

                # The text of the correct option is the lookup keyword.
                option_dict = _build_option_dict(options, source_file_path)
                keyword = option_dict.get(answer_letter)
                if not keyword:
                    print(f"答案字母 '{answer_letter}' 在选项中未找到,文件:{source_file_path}")
                    continue
                print(f"处理文件:{source_file_path}")
                print(f"关键词:'{keyword}'")

                department_info = keyword_dict.get(keyword)
                if department_info is None:
                    print(f"关键词 '{keyword}' 不在关键词列表中。")
                    continue
                department = department_info['department']
                print(f"关键词 '{keyword}' 的科室为:'{department}'")
                if department not in departments_set:
                    print(f"科室 '{department}' 不在处理列表中,不复制文件。")
                    continue

                destination_base = os.path.join(
                    destination_root, get_department_dir_name(department)
                )

                # Copy the JSON file itself.
                destination_file_path = _copy_under(
                    source_file_path, destination_base, "创建目录"
                )
                print(f"已复制文件到:{destination_file_path}")
                # Count the file only once it has really been copied, and bump
                # both counters together so the two summaries cannot disagree.
                files_matched += 1
                department_file_counts[department] += 1

                # Copy every image the JSON references.
                for image_key in image_keys:
                    image_path = data.get(image_key)
                    if not isinstance(image_path, str) or not image_path:
                        # Key absent, null or empty: nothing to copy.
                        continue
                    # Image paths are relative to <source_dir>/images.
                    source_image_path = os.path.join(source_dir, 'images', image_path)
                    if not os.path.exists(source_image_path):
                        print(f"源图片不存在,跳过:{source_image_path}")
                        continue
                    destination_image_path = _copy_under(
                        source_image_path, destination_base, "创建图片目录"
                    )
                    images_copied += 1
                    print(f"已复制图片到:{destination_image_path}")
            except Exception as e:
                # Deliberate best-effort boundary: one unreadable or malformed
                # file must not abort the whole run.
                print(f"处理文件 {source_file_path} 时发生错误:{e}")
# Final report: overall totals first, then one line per department in the
# order of the departments list, labelled with the department's directory name.
print(f"总共处理了 {total_files_processed} 个 JSON 文件。")
print(f"总共匹配并复制了 {files_matched} 个 JSON 文件。")
print(f"总共复制了 {images_copied} 张图片。")
print("每个科室匹配并复制的文件数量:")
for department_name in departments:
    print(f"{get_department_dir_name(department_name)}: {department_file_counts[department_name]} 个文件")
|