File size: 8,266 Bytes
a7d4c7b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import os
import json
import shutil

# Read the keyword file and build the keyword lookup table.
# Every usable line holds exactly four comma-separated fields:
#   keyword, department, task, modality
# Blank lines are ignored; lines with any other field count are reported
# and skipped. A keyword that appears more than once keeps its last entry.
keyword_file = '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI-MMbench-CoT/output_multi_column.txt'
keyword_dict = {}

with open(keyword_file, 'r', encoding='utf-8') as handle:
    for raw_line in handle:
        entry = raw_line.strip()
        if entry:
            fields = [field.strip() for field in entry.split(',')]
            if len(fields) == 4:
                kw, kw_department, kw_task, kw_modality = fields
                keyword_dict[kw] = dict(
                    department=kw_department,
                    task=kw_task,
                    modality=kw_modality,
                )
            else:
                print(f"格式错误,跳过此行:{entry}")

print(f"总共加载了 {len(keyword_dict)} 个关键词。")

# Departments to process. A JSON file is copied only when the department
# looked up for its answer keyword is one of these names (exact match).
# The order of this list is also the order of the per-department summary
# printed at the end of the script.
departments = [
    'Cardiovascular Surgery',
    'Dermatology',
    'Endocrinology',
    'Gastroenterology and Hepatology',
    'General Surgery',
    'Hematology',
    'Infectious Diseases',
    'Laboratory Medicine and Pathology',
    'Nephrology and Hypertension',
    'Neurosurgery',
    'Obstetrics and Gynecology',
    'Oncology (Medical)',
    'Ophthalmology',
    'Orthopedic Surgery',
    # Contains a '/', so get_department_dir_name() maps it to a shorter
    # directory name.
    'Otolaryngology (ENT)/Head and Neck Surgery',
    'Pulmonary Medicine',
    'Sports Medicine',
    'Urology'
]

# Map a department name to the directory name used for it, handling the
# one special case.
def get_department_dir_name(department):
    """Return the directory name to use for *department*.

    'Otolaryngology (ENT)/Head and Neck Surgery' is shortened to
    'Otolaryngology (ENT)'; every other value is returned unchanged.
    """
    return (
        'Otolaryngology (ENT)'
        if department == 'Otolaryngology (ENT)/Head and Neck Surgery'
        else department
    )

# Set view of the department list for fast membership checks.
departments_set = {*departments}

# Source directories that are walked for JSON files.
source_dirs = [
    '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI/cls_2d',
    '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI/det_2d',
    '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI/semantic_seg_2d',
    '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI/semantic_seg_3d',
]

# Base directory under which one sub-directory per department is created.
destination_root = '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI-MMbench-CoT'

# Run statistics, also useful for debugging.
total_files_processed = files_matched = images_copied = 0

# Number of matched files per department, starting at zero for each one.
department_file_counts = dict.fromkeys(departments, 0)

# JSON keys whose values are image paths that should be copied as well.
image_keys = [
    'img_mask_path',
    'img_contour_path',
    'img_bbox_path',
    'img_path',
]

# Root that every copied path is made relative to, so the tree created under
# each department directory mirrors the layout below GMAI
# (e.g. cls_2d/..., cls_2d/images/...).
_GMAI_ROOT = '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI'


def _parse_options(options, source_file_path):
    """Map option letters to option texts for entries shaped like "A. text".

    Entries that are too short or lack a '.' in second position are reported
    (together with *source_file_path*) and left out of the result.
    """
    option_dict = {}
    for opt in options:
        if len(opt) > 2 and opt[1] == '.':
            # Slice right after the dot and strip, so both "A. text" and
            # "A.text" yield "text" (a fixed [3:] slice would drop the first
            # character of the text when the space is missing).
            option_dict[opt[0]] = opt[2:].strip()
        else:
            print(f"选项格式错误,文件:{source_file_path},选项:{opt}")
    return option_dict


def _copy_into_department(source_path, destination_base, dir_label, file_label):
    """Copy *source_path* below *destination_base*, keeping its GMAI-relative path.

    *dir_label* and *file_label* are the message prefixes printed when a
    directory is created and when the file has been copied.
    """
    destination_path = os.path.join(
        destination_base, os.path.relpath(source_path, _GMAI_ROOT)
    )
    destination_dir = os.path.dirname(destination_path)
    if not os.path.exists(destination_dir):
        # exist_ok avoids failing if the directory appears between the check
        # above and the creation.
        os.makedirs(destination_dir, exist_ok=True)
        print(f"{dir_label}:{destination_dir}")
    shutil.copy2(source_path, destination_path)
    print(f"{file_label}:{destination_path}")


# Walk every source directory, and for each JSON file whose answer keyword
# belongs to one of the selected departments, copy the JSON file and the
# images it references into that department's directory.
for source_dir in source_dirs:
    print(f"正在遍历目录:{source_dir}")
    for root, _dirs, files in os.walk(source_dir):
        for file in files:
            if not file.endswith('.json'):
                continue
            total_files_processed += 1
            source_file_path = os.path.join(root, file)
            # Per-file boundary: any failure is reported and the walk goes on
            # with the next file.
            try:
                with open(source_file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                answer_letter = data.get('answer', '').strip()
                options = data.get('options', [])
                if not answer_letter or not options:
                    print(f"文件缺少 'answer' 或 'options' 字段,跳过:{source_file_path}")
                    continue

                # The keyword is the text of the option selected by the answer letter.
                keyword = _parse_options(options, source_file_path).get(answer_letter)
                if not keyword:
                    print(f"答案字母 '{answer_letter}' 在选项中未找到,文件:{source_file_path}")
                    continue
                print(f"处理文件:{source_file_path}")
                print(f"关键词:'{keyword}'")

                if keyword not in keyword_dict:
                    print(f"关键词 '{keyword}' 不在关键词列表中。")
                    continue
                department = keyword_dict[keyword]['department']
                print(f"关键词 '{keyword}' 的科室为:'{department}'")
                if department not in departments_set:
                    print(f"科室 '{department}' 不在处理列表中,不复制文件。")
                    continue

                destination_base = os.path.join(
                    destination_root, get_department_dir_name(department)
                )
                _copy_into_department(
                    source_file_path, destination_base, "创建目录", "已复制文件到"
                )
                # Count only once the JSON copy has succeeded, and update both
                # counters together so the overall total always equals the
                # sum of the per-department totals.
                files_matched += 1
                department_file_counts[department] += 1

                # Copy every image the JSON file references.
                for image_key in image_keys:
                    if image_key not in data:
                        continue
                    image_path = data[image_key]
                    if not isinstance(image_path, str) or not image_path:
                        # A null/non-string/empty value cannot name an image;
                        # skip it instead of aborting the remaining images.
                        print(f"图片路径无效,跳过:{image_key}={image_path!r}")
                        continue
                    # Image paths are relative to source_dir + '/images'.
                    source_image_path = os.path.join(source_dir, 'images', image_path)
                    if not os.path.exists(source_image_path):
                        print(f"源图片不存在,跳过:{source_image_path}")
                        continue
                    _copy_into_department(
                        source_image_path, destination_base, "创建图片目录", "已复制图片到"
                    )
                    images_copied += 1
            except Exception as e:
                print(f"处理文件 {source_file_path} 时发生错误:{e}")

# Overall totals for the run.
for summary_line in (
    f"总共处理了 {total_files_processed} 个 JSON 文件。",
    f"总共匹配并复制了 {files_matched} 个 JSON 文件。",
    f"总共复制了 {images_copied} 张图片。",
):
    print(summary_line)

# Per-department totals, listed in the order of the department list and
# labelled with the directory name used for each department.
print("每个科室匹配并复制的文件数量:")
for dept_name in departments:
    print(f"{get_department_dir_name(dept_name)}: {department_file_counts[dept_name]} 个文件")