import json
import os
import shutil
# Comma-separated lookup table. Every usable row has exactly four fields:
# keyword, department, task, modality.
keyword_file = '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI-MMbench-CoT/output_multi_column.txt'

# keyword -> {'department': ..., 'task': ..., 'modality': ...}
keyword_dict = {}

with open(keyword_file, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Blank lines carry no data.
            continue
        parts = line.split(',')
        if len(parts) != 4:
            # Rows with the wrong number of fields are reported and skipped.
            print(f"格式错误,跳过此行:{line}")
            continue
        keyword, department, task, modality = [p.strip() for p in parts]
        # A keyword listed more than once keeps the values of its last row.
        keyword_dict[keyword] = {
            'department': department,
            'task': task,
            'modality': modality,
        }

print(f"总共加载了 {len(keyword_dict)} 个关键词。")
# Departments whose files are copied; a keyword mapped to any other
# department is reported and its file is left in place.
departments = [
    'Cardiovascular Surgery',
    'Dermatology',
    'Endocrinology',
    'Gastroenterology and Hepatology',
    'General Surgery',
    'Hematology',
    'Infectious Diseases',
    'Laboratory Medicine and Pathology',
    'Nephrology and Hypertension',
    'Neurosurgery',
    'Obstetrics and Gynecology',
    'Oncology (Medical)',
    'Ophthalmology',
    'Orthopedic Surgery',
    'Otolaryngology (ENT)/Head and Neck Surgery',
    'Pulmonary Medicine',
    'Sports Medicine',
    'Urology',
]
def get_department_dir_name(department: str) -> str:
    """Return the directory name to use for *department*.

    Every department maps to its own name except
    'Otolaryngology (ENT)/Head and Neck Surgery', which is shortened to
    'Otolaryngology (ENT)' (presumably because the full name contains a
    '/' that would otherwise be read as a path separator).

    Args:
        department: Department name as it appears in the keyword table.

    Returns:
        The name of the folder for that department.
    """
    if department == 'Otolaryngology (ENT)/Head and Neck Surgery':
        return 'Otolaryngology (ENT)'
    return department
# Set form of the department list, for constant-time membership checks.
departments_set = set(departments)
# Dataset directories that are walked recursively for '.json' files.
source_dirs = [
    '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI/cls_2d',
    '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI/det_2d',
    '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI/semantic_seg_2d',
    '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI/semantic_seg_3d',
]

# Root under which one folder per department is created for the copies.
destination_root = '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI-MMbench-CoT'
# Run-wide counters, reported once the walk has finished.
total_files_processed = 0  # every '.json' file encountered
files_matched = 0          # JSON files copied into a department folder
images_copied = 0          # image files copied alongside those JSON files

# Number of copied JSON files per department, keyed by the full department name.
department_file_counts = {dept: 0 for dept in departments}

# JSON fields that may hold an image path; each one present is copied.
image_keys = ['img_mask_path', 'img_contour_path', 'img_bbox_path', 'img_path']
# Common root of all source directories. Copied files keep their path relative
# to this root underneath the per-department destination folder.
gmai_root = '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI'


def _parse_options(options, source_file_path):
    """Map option letters to option text for entries shaped like "A. text".

    Entries that are not strings, or that lack a '.' in second position, are
    reported and skipped.
    """
    option_dict = {}
    for opt in options:
        if isinstance(opt, str) and len(opt) > 2 and opt[1] == '.':
            # Slice right after the dot and strip, so that both "A. text" and
            # "A.text" keep their complete text.
            option_dict[opt[0]] = opt[2:].strip()
        else:
            print(f"选项格式错误,文件:{source_file_path},选项:{opt}")
    return option_dict


def _copy_preserving_tree(source_path, destination_base, created_message):
    """Copy *source_path* below *destination_base*, keeping its GMAI-relative path.

    Missing parent directories are created, and *created_message* followed by
    the directory is printed when that happens. Returns the destination path.
    """
    relative_path = os.path.relpath(source_path, gmai_root)
    destination_path = os.path.join(destination_base, relative_path)
    destination_dir = os.path.dirname(destination_path)
    if not os.path.isdir(destination_dir):
        # exist_ok avoids a failure if the directory appears between the
        # check and the call.
        os.makedirs(destination_dir, exist_ok=True)
        print(f"{created_message}{destination_dir}")
    shutil.copy2(source_path, destination_path)
    return destination_path


# Walk every source directory. For each JSON file, resolve the text of the
# correct answer option, look up its department in the keyword table and, when
# the department is one of the handled ones, copy the JSON file and the images
# it references into that department's folder.
for source_dir in source_dirs:
    print(f"正在遍历目录:{source_dir}")
    for root, _dirs, files in os.walk(source_dir):
        for file_name in files:
            if not file_name.endswith('.json'):
                continue
            total_files_processed += 1
            source_file_path = os.path.join(root, file_name)
            try:
                with open(source_file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                # A missing or non-string answer is treated like an empty one.
                raw_answer = data.get('answer')
                answer_letter = raw_answer.strip() if isinstance(raw_answer, str) else ''
                options = data.get('options', [])
                if not answer_letter or not options:
                    print(f"文件缺少 'answer' 或 'options' 字段,跳过:{source_file_path}")
                    continue

                # The keyword is the text of the option named by the answer letter.
                keyword = _parse_options(options, source_file_path).get(answer_letter)
                if not keyword:
                    print(f"答案字母 '{answer_letter}' 在选项中未找到,文件:{source_file_path}")
                    continue
                print(f"处理文件:{source_file_path}")
                print(f"关键词:'{keyword}'")

                if keyword not in keyword_dict:
                    print(f"关键词 '{keyword}' 不在关键词列表中。")
                    continue
                department = keyword_dict[keyword]['department']
                print(f"关键词 '{keyword}' 的科室为:'{department}'")
                if department not in departments_set:
                    print(f"科室 '{department}' 不在处理列表中,不复制文件。")
                    continue

                destination_base = os.path.join(
                    destination_root, get_department_dir_name(department)
                )
                destination_file_path = _copy_preserving_tree(
                    source_file_path, destination_base, "创建目录:"
                )
                print(f"已复制文件到:{destination_file_path}")
                # Count only after the JSON copy has succeeded, and update both
                # counters together so the per-department totals always add up
                # to files_matched.
                files_matched += 1
                department_file_counts[department] += 1

                for image_key in image_keys:
                    if image_key not in data:
                        continue
                    image_path = data[image_key]
                    if not isinstance(image_path, str):
                        # A null or list value cannot be joined into a path.
                        print(f"源图片不存在,跳过:{image_path}")
                        continue
                    # Image paths are resolved against <source_dir>/images.
                    source_image_path = os.path.join(source_dir, 'images', image_path)
                    # isfile (not exists) so that a directory, e.g. from an
                    # empty image path, is skipped instead of failing the copy.
                    if not os.path.isfile(source_image_path):
                        print(f"源图片不存在,跳过:{source_image_path}")
                        continue
                    destination_image_path = _copy_preserving_tree(
                        source_image_path, destination_base, "创建图片目录:"
                    )
                    images_copied += 1
                    print(f"已复制图片到:{destination_image_path}")
            except Exception as e:
                # Best effort: one unreadable or malformed file must not stop
                # the run, so report it and move on to the next file.
                print(f"处理文件 {source_file_path} 时发生错误:{e}")
# Overall totals for the run.
print(f"总共处理了 {total_files_processed} 个 JSON 文件。")
print(f"总共匹配并复制了 {files_matched} 个 JSON 文件。")
print(f"总共复制了 {images_copied} 张图片。")

# Per-department breakdown, labelled with the directory name used on disk.
print("每个科室匹配并复制的文件数量:")
for dept in departments:
    count = department_file_counts[dept]
    dept_dir_name = get_department_dir_name(dept)
    print(f"{dept_dir_name}: {count} 个文件")