# pengcc1 / V2
#
# Model card Files Files and versions Community
#
# File size: 8,266 Bytes
#
# a7d4c7b

import os
import json
import shutil

# Read the keyword file and build the keyword lookup dictionary.
# Every non-empty line must contain exactly four comma-separated fields:
# keyword, department, task, modality. Other lines are reported and skipped.
keyword_file = '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI-MMbench-CoT/output_multi_column.txt'
keyword_dict = {}

with open(keyword_file, 'r', encoding='utf-8') as f:
    for raw_line in f:
        line = raw_line.strip()
        if line:  # blank lines are ignored silently
            fields = [field.strip() for field in line.split(',')]
            if len(fields) == 4:
                keyword, department, task, modality = fields
                keyword_dict[keyword] = dict(
                    department=department,
                    task=task,
                    modality=modality,
                )
            else:
                print(f"格式错误，跳过此行：{line}")

print(f"总共加载了 {len(keyword_dict)} 个关键词。")

# Departments whose files should be processed; a keyword that maps to any
# other department is not copied.
departments = [
    "Cardiovascular Surgery",
    "Dermatology",
    "Endocrinology",
    "Gastroenterology and Hepatology",
    "General Surgery",
    "Hematology",
    "Infectious Diseases",
    "Laboratory Medicine and Pathology",
    "Nephrology and Hypertension",
    "Neurosurgery",
    "Obstetrics and Gynecology",
    "Oncology (Medical)",
    "Ophthalmology",
    "Orthopedic Surgery",
    "Otolaryngology (ENT)/Head and Neck Surgery",
    "Pulmonary Medicine",
    "Sports Medicine",
    "Urology",
]

# Map a department name to the directory name used for it on disk.
def get_department_dir_name(department):
    """Return the output directory name for *department*.

    Every name is returned unchanged except the ENT department, whose full
    name contains a "/" (presumably special-cased so the name does not act
    as a path separator when joined into a destination path).
    """
    ent_full_name = 'Otolaryngology (ENT)/Head and Neck Surgery'
    return 'Otolaryngology (ENT)' if department == ent_full_name else department

# Set copy of the department list, used for membership checks.
departments_set = {*departments}

# Source directories that are walked for JSON files.
source_dirs = [
    "/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI/cls_2d",
    "/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI/det_2d",
    "/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI/semantic_seg_2d",
    "/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI/semantic_seg_3d",
]

# Base directory that receives the copied files.
destination_root = "/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI-MMbench-CoT"

# Run-wide counters kept for statistics and debugging.
total_files_processed = files_matched = images_copied = 0

# Number of matched files per department, each starting at zero.
department_file_counts = dict.fromkeys(departments, 0)

# JSON keys whose values are image paths to be processed.
image_keys = ["img_mask_path", "img_contour_path", "img_bbox_path", "img_path"]

# Walk every source directory. For each JSON file, take the text of the option
# selected by its 'answer' letter as the keyword, look the keyword up in
# keyword_dict, and when its department is one of the selected departments,
# copy the JSON and the images it references into that department's folder
# under destination_root, preserving the path relative to the GMAI root.

# Root against which the mirrored relative paths are computed.
gmai_root = '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI'

for source_dir in source_dirs:
    print(f"正在遍历目录：{source_dir}")
    for root, dirs, files in os.walk(source_dir):
        for file in files:
            if not file.endswith('.json'):
                continue
            total_files_processed += 1
            source_file_path = os.path.join(root, file)
            # Best effort per file: any failure is reported and the walk
            # moves on to the next JSON file.
            try:
                with open(source_file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                answer_letter = data.get('answer', '').strip()
                options = data.get('options', [])
                if not answer_letter or not options:
                    print(f"文件缺少 'answer' 或 'options' 字段，跳过：{source_file_path}")
                    continue

                # Map each option letter to its text; a well-formed option is
                # a letter, a dot, then the text (e.g. "A. text").
                option_dict = {}
                for opt in options:
                    if isinstance(opt, str) and len(opt) > 2 and opt[1] == '.':
                        # Slice right after the dot and strip, so "A.text"
                        # (no space after the dot) keeps its first character.
                        option_dict[opt[0]] = opt[2:].strip()
                    else:
                        print(f"选项格式错误，文件：{source_file_path}，选项：{opt}")

                # The keyword is the text of the option named by the answer.
                keyword = option_dict.get(answer_letter)
                if not keyword:
                    print(f"答案字母 '{answer_letter}' 在选项中未找到，文件：{source_file_path}")
                    continue
                print(f"处理文件：{source_file_path}")
                print(f"关键词：'{keyword}'")

                department_info = keyword_dict.get(keyword)
                if department_info is None:
                    print(f"关键词 '{keyword}' 不在关键词列表中。")
                    continue
                department = department_info['department']
                print(f"关键词 '{keyword}' 的科室为：'{department}'")
                if department not in departments_set:
                    print(f"科室 '{department}' 不在处理列表中，不复制文件。")
                    continue

                department_dir_name = get_department_dir_name(department)
                destination_base = os.path.join(destination_root, department_dir_name)

                # Copy the JSON file, creating its target directory if needed.
                relative_path = os.path.relpath(source_file_path, gmai_root)
                destination_file_path = os.path.join(destination_base, relative_path)
                destination_dir = os.path.dirname(destination_file_path)
                if not os.path.exists(destination_dir):
                    os.makedirs(destination_dir, exist_ok=True)
                    print(f"创建目录：{destination_dir}")
                shutil.copy2(source_file_path, destination_file_path)
                print(f"已复制文件到：{destination_file_path}")
                # Count only after the JSON copy succeeded, and keep the total
                # and the per-department counter in step even if an image
                # copy below raises.
                files_matched += 1
                department_file_counts[department] += 1

                # Copy every image the JSON references.
                for image_key in image_keys:
                    if image_key not in data:
                        continue
                    image_path = data[image_key]
                    if not isinstance(image_path, str) or not image_path:
                        # Skip unusable values instead of aborting the
                        # remaining image keys of this file.
                        print(f"图片路径无效，跳过：{image_key}={image_path!r}")
                        continue
                    # Image paths are relative to source_dir + '/images'.
                    source_image_path = os.path.join(source_dir, 'images', image_path)
                    # isfile (not exists) so a directory is never handed to
                    # shutil.copy2.
                    if not os.path.isfile(source_image_path):
                        print(f"源图片不存在，跳过：{source_image_path}")
                        continue
                    # Relative path from the GMAI root, including 'images'.
                    relative_image_path = os.path.relpath(source_image_path, gmai_root)
                    destination_image_path = os.path.join(destination_base, relative_image_path)
                    destination_image_dir = os.path.dirname(destination_image_path)
                    if not os.path.exists(destination_image_dir):
                        os.makedirs(destination_image_dir, exist_ok=True)
                        print(f"创建图片目录：{destination_image_dir}")
                    shutil.copy2(source_image_path, destination_image_path)
                    images_copied += 1
                    print(f"已复制图片到：{destination_image_path}")
            except Exception as e:
                print(f"处理文件 {source_file_path} 时发生错误：{e}")

# Final report: the three overall totals, then one line per department
# (labelled with its directory name) giving the number of matched files.
print(
    f"总共处理了 {total_files_processed} 个 JSON 文件。",
    f"总共匹配并复制了 {files_matched} 个 JSON 文件。",
    f"总共复制了 {images_copied} 张图片。",
    "每个科室匹配并复制的文件数量：",
    sep="\n",
)
for dept in departments:
    print(f"{get_department_dir_name(dept)}: {department_file_counts[dept]} 个文件")