#!/usr/bin/env python

# Copyright (c) 2022, National Diet Library, Japan
#
# This software is released under the CC BY 4.0.
# https://creativecommons.org/licenses/by/4.0/

"""Parse NDL layout-annotation XML files and export them as COCO-style dicts/JSON."""

from enum import IntEnum, auto
from typing import List

from .utils import auto_run


class Category(IntEnum):
    """Integer ids of the annotation categories listed in ``categories``."""

    LINE_MAIN = 0
    LINE_INOTE = auto()
    LINE_HNOTE = auto()
    LINE_CAPTION = auto()
    BLOCK_FIG = auto()
    BLOCK_TABLE = auto()
    BLOCK_PILLAR = auto()
    BLOCK_FOLIO = auto()
    BLOCK_RUBI = auto()
    BLOCK_CHART = auto()
    BLOCK_EQN = auto()
    BLOCK_CFM = auto()
    BLOCK_ENG = auto()
    CHAR = auto()
    # Ids start at 0 and increase by one, so NUM equals the number of members
    # declared above it. It is listed as 'void' in ``categories``.
    NUM = auto()


# TYPE values of LINE elements (``org_name`` of the line_* entries):
#   main text | inline note | head note | caption
# TYPE values of BLOCK elements (``org_name`` of the block_* entries):
#   figure | table | pillar | folio | ruby | organization chart | equation |
#   chemical formula | European text
categories = [
    {'id': int(Category.LINE_MAIN), 'name': 'line_main', 'org_name': '本文'},
    {'id': int(Category.LINE_INOTE), 'name': 'line_inote', 'org_name': '割注'},
    {'id': int(Category.LINE_HNOTE), 'name': 'line_hnote', 'org_name': '頭注'},
    {'id': int(Category.LINE_CAPTION), 'name': 'line_caption', 'org_name': 'キャプション'},
    {'id': int(Category.BLOCK_FIG), 'name': 'block_fig', 'org_name': '図版'},
    {'id': int(Category.BLOCK_TABLE), 'name': 'block_table', 'org_name': '表組'},
    {'id': int(Category.BLOCK_PILLAR), 'name': 'block_pillar', 'org_name': '柱'},
    {'id': int(Category.BLOCK_FOLIO), 'name': 'block_folio', 'org_name': 'ノンブル'},
    {'id': int(Category.BLOCK_RUBI), 'name': 'block_rubi', 'org_name': 'ルビ'},
    {'id': int(Category.BLOCK_CHART), 'name': 'block_chart', 'org_name': '組織図'},
    {'id': int(Category.BLOCK_EQN), 'name': 'block_eqn', 'org_name': '数式'},
    {'id': int(Category.BLOCK_CFM), 'name': 'block_cfm', 'org_name': '化学式'},
    {'id': int(Category.BLOCK_ENG), 'name': 'block_eng', 'org_name': '欧文'},
    {'id': int(Category.CHAR), 'name': 'char', 'org_name': 'char'},
    {'id': int(Category.NUM), 'name': 'void', 'org_name': 'void'}]

# Lookup tables over ``categories``, keyed by 'org_name' and by 'name'.
categories_org_name_index = {elem['org_name']: elem for elem in categories}
categories_name_index = {elem['name']: elem for elem in categories}


def org_name_to_id(s: str) -> int:
    """Return the category id whose ``org_name`` is *s*.

    Raises:
        KeyError: if *s* is not the ``org_name`` of any entry in ``categories``.
    """
    return categories_org_name_index[s]['id']


def name_to_org_name(s: str) -> str:
    """Return the ``org_name`` of the category whose ``name`` is *s*.

    Raises:
        KeyError: if *s* is not the ``name`` of any entry in ``categories``.
    """
    return categories_name_index[s]['org_name']


class NDLObject:
    """A rectangle (x, y, width, height) with a category id.

    Base class of the block, character and line objects below.
    """

    def __init__(self, x, y, width, height, category_id=-1):
        self.x, self.y = x, y
        self.width, self.height = width, height
        self.category_id = category_id

    def __repr__(self):
        return f'NDLObject({self.x}, {self.y}, {self.width}, {self.height}, category_id={self.category_id})'


class NDLBlock(NDLObject):
    """Rectangle built from a BLOCK element; *type* is its TYPE attribute.

    The category id is derived from *type* via ``org_name_to_id`` and
    overrides any ``category_id`` passed through to the base class.

    Raises:
        KeyError: if *type* is not a known ``org_name``.
    """

    def __init__(self, type, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.category_id = org_name_to_id(type)
        self.type = type

    def __repr__(self):
        return f'NDLBlock({self.type}, {self.x}, {self.y}, {self.width}, {self.height}, category_id={self.category_id})'


class NDLChar(NDLObject):
    """Rectangle of a single character; *moji* is the character text."""

    def __init__(self, moji: str, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.moji = moji
        self.category_id = Category.CHAR

    def __repr__(self):
        return f'NDLChar(\'{self.moji}\', {self.x}, {self.y}, {self.width}, {self.height}, category_id={self.category_id})'


class NDLLine(NDLObject):
    """Rectangle of a text line together with its characters.

    *opt* is the line's type string; the category id is derived from it via
    ``org_name_to_id``.

    Raises:
        KeyError: if *opt* is not a known ``org_name`` (this includes the
            empty string that ``NDLDataset.parse`` passes for a LINE without
            a TYPE attribute).
    """

    def __init__(self, chars: List[NDLChar], opt: str, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.chars = chars
        self.category_id = org_name_to_id(opt)
        self.opt = opt

    def __repr__(self):
        return f'NDLLine({self.chars}, {self.opt}, {self.x}, {self.y}, {self.width}, {self.height}, category_id={self.category_id})'


class NDLPage:
    """One page: image path, the objects on it, and a label of its source XML."""

    def __init__(self, img_path: str, objects: List[NDLObject], source_xml: str):
        self.img_path = img_path
        self.objects = objects
        self.source_xml = source_xml

    def __repr__(self):
        return f'NDLPage({self.img_path}, {self.objects}, {self.source_xml})'


class NDLDataset:
    """A list of ``NDLPage`` objects with parsing, statistics and COCO export."""

    def __init__(self, pages=None):
        self.pages = [] if pages is None else pages

    def parse(self, xml_path: str, img_dir: str):
        """Parse *xml_path* and append its pages to ``self.pages``.

        Every child of the XML root is treated as a page whose ``IMAGENAME``
        attribute is joined onto *img_dir*. Within a page, BLOCK elements
        become ``NDLBlock`` and LINE elements become ``NDLLine``; all other
        elements are ignored.

        Args:
            xml_path: Path of the XML file to read.
            img_dir: Directory containing the page images.

        Raises:
            KeyError: if a page has no ``IMAGENAME``, a used element lacks
                ``X``/``Y``/``WIDTH``/``HEIGHT``, a BLOCK lacks ``TYPE``, or
                a TYPE value is not a known category.
        """
        import xml.etree.ElementTree as ET
        from pathlib import Path

        print(f'loading from {xml_path} ... ', end='')

        root = ET.parse(xml_path).getroot()
        source_xml = Path(xml_path).stem

        def parse_bbox(elem):
            attrib = elem.attrib
            return (float(attrib['X']), float(attrib['Y']),
                    float(attrib['WIDTH']), float(attrib['HEIGHT']))

        pages = []
        for page in root:
            img_path = str(Path(img_dir) / page.attrib['IMAGENAME'])
            objects = []
            for elem in page:
                # Strip an '{namespace}' prefix from the tag, if present.
                _, has_namespace, local_name = elem.tag.partition('}')
                tag = local_name if has_namespace else elem.tag

                # The bbox is parsed only for elements that are used, so that
                # ignored elements without coordinates do not raise KeyError.
                if tag == 'BLOCK':
                    objects.append(
                        NDLBlock(elem.attrib['TYPE'], *parse_bbox(elem)))
                elif tag == 'LINE':
                    chars = []
                    for char in elem:
                        moji = char.get('MOJI')
                        if moji is None:
                            # Children without a MOJI attribute are skipped.
                            continue
                        chars.append(NDLChar(moji, *parse_bbox(char)))
                    # The line type is read from TYPE (formerly OPT).
                    objects.append(
                        NDLLine(chars, elem.attrib.get('TYPE', ''),
                                *parse_bbox(elem)))
            pages.append(NDLPage(img_path, objects, source_xml))

        print(f'done! {len(pages)} loaded')
        self.pages.extend(pages)

    def summary(self, output_dir: str = "./generated/"):
        """Print type counts and save histograms of the object statistics.

        Writes ``hist_width.png``, ``hist_height.png``, ``hist_aspect.png``
        (height / width) and ``hist_bbox_num.png`` (per-page count of blocks
        plus line characters) into *output_dir*, creating it if necessary.

        Args:
            output_dir: Destination directory; a trailing slash is optional.
        """
        import numpy as np
        import matplotlib.pyplot as plt
        from collections import defaultdict
        from pathlib import Path

        out_dir = Path(output_dir)
        out_dir.mkdir(parents=True, exist_ok=True)

        sizes = []
        bbox_nums = []
        opts = defaultdict(int)
        types = defaultdict(int)

        for page in self.pages:
            cnt = 0
            for obj in page.objects:
                sizes.append(
                    np.array([obj.width, obj.height], dtype=np.float32))
                if isinstance(obj, NDLBlock):
                    types[obj.type] += 1
                    cnt += 1
                if isinstance(obj, NDLLine):
                    cnt += len(obj.chars)
                    opts[obj.opt] += 1
            bbox_nums.append(cnt)

        print(opts)
        print(types)

        # reshape keeps the array 2-D even when there are no objects at all,
        # so the column indexing below cannot fail.
        sizes = np.array(sizes, dtype=np.float32).reshape(-1, 2)
        bbox_nums = np.array(bbox_nums)

        def savefig(data, file_name):
            fig = plt.figure()
            plt.hist(data)
            plt.savefig(str(out_dir / file_name))
            plt.close(fig)  # release the figure instead of accumulating them

        widths, heights = sizes[:, 0], sizes[:, 1]
        # Zero-width boxes would yield inf/nan ratios, which hist() rejects.
        nonzero = widths > 0

        savefig(widths, "hist_width.png")
        savefig(heights, "hist_height.png")
        savefig(heights[nonzero] / widths[nonzero], "hist_aspect.png")
        savefig(bbox_nums, "hist_bbox_num.png")

    def to_coco_fmt(self, fx=1.0, fy=1.0, add_char: bool = True, add_block: bool = True,
                    add_prefix: bool = False, suffix: str = ".jpg"):
        """Build a COCO-style dict from the pages of this dataset.

        Each image is opened with ``cv2.imread`` to obtain its size; pages
        whose image cannot be loaded are reported and skipped.

        Args:
            fx: Horizontal scale applied to coordinates and image width.
            fy: Vertical scale applied to coordinates and image height.
            add_char: Currently unused; kept for interface compatibility.
            add_block: When false, no annotations are emitted (images only).
            add_prefix: Prefix each file name with ``<source_xml>_``.
            suffix: Extension forced onto every emitted file name.

        Returns:
            dict with the keys ``images``, ``annotations``, ``categories``,
            ``info`` and ``licenses``.
        """
        import cv2
        from pathlib import Path
        from tqdm import tqdm
        from collections import defaultdict

        output = {'images': [], 'annotations': []}
        image_id = 0
        annotation_id = 0
        instance_num = defaultdict(int)

        print("start to_coco_fmt")

        def make_bbox(obj):
            # Scaled [x, y, w, h], the 4-corner polygon, and the area.
            x1, y1 = fx * obj.x, fy * obj.y
            width, height = fx * obj.width, fy * obj.height
            x2, y2 = x1 + width, y1 + height
            bbox = [x1, y1, width, height]
            contour = [x1, y1, x2, y1, x2, y2, x1, y2]
            return bbox, contour, width * height

        def make_annotation(obj, img_id, ann_id):
            bbox, contour, area = make_bbox(obj)
            if isinstance(obj, NDLLine):
                # A line is segmented by its characters; its area is the sum
                # of the character areas, falling back to the line rectangle
                # when that sum is zero (e.g. a line without characters).
                char_boxes = [make_bbox(char) for char in obj.chars]
                segmentation = [char_contour for _, char_contour, _ in char_boxes]
                char_area = sum(char_area_ for _, _, char_area_ in char_boxes)
                if char_area != 0:
                    area = char_area
            else:
                segmentation = [contour]
            return {'image_id': img_id, 'id': ann_id,
                    'bbox': bbox, 'area': area, 'iscrowd': 0,
                    'category_id': int(obj.category_id),
                    'segmentation': segmentation}

        for page in tqdm(self.pages):
            img = cv2.imread(page.img_path)
            if img is None:
                print(f"Cannot load {page.img_path}")
                continue

            prefix = page.source_xml + "_" if add_prefix else ""
            file_name = prefix + str(Path(page.img_path).name)
            if Path(file_name).suffix != suffix:
                # Honour the requested suffix rather than a hard-coded '.jpg'.
                file_name = str(Path(file_name).with_suffix(suffix))

            output['images'].append(
                {'file_name': file_name,
                 'width': int(fx * img.shape[1]),
                 'height': int(fy * img.shape[0]),
                 "id": image_id})

            if add_block:
                for obj in page.objects:
                    output['annotations'].append(
                        make_annotation(obj, image_id, annotation_id))
                    instance_num[int(obj.category_id)] += 1
                    annotation_id += 1
            image_id += 1

        print(instance_num)

        output['categories'] = categories
        output['info'] = {
            "description": "NDL",
            "url": "",
            "version": "0.1a",
            "year": 2021,
            "contributor": "morpho",
            "date_created": "2021/09/01"
        }
        output['licenses'] = []
        return output

    def train_test_split(self, ratio: float = 0.9, seed: int = None):
        """Shuffle a deep copy of the pages and split it in two datasets.

        Args:
            ratio: Fraction of pages placed in the first (train) dataset.
            seed: Optional seed for a private random generator, making the
                split reproducible. With ``None`` the module-level ``random``
                state is used.

        Returns:
            Tuple ``(train, test)`` of ``NDLDataset`` objects.
        """
        import random
        from copy import deepcopy

        print("start train_test_split")
        pages = deepcopy(self.pages)
        shuffle = random.shuffle if seed is None else random.Random(seed).shuffle
        shuffle(pages)
        split = int(ratio * len(pages))
        return NDLDataset(pages[:split]), NDLDataset(pages[split:])


def json_to_file(data, output_path: str):
    """Write *data* as indented JSON to *output_path*.

    The parent directory of *output_path* is created when it is missing.
    """
    import json
    from pathlib import Path

    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with open(target, 'w') as f:
        json.dump(data, f, indent=4)


def _read_path_list(list_path: str) -> List[str]:
    """Return the stripped, non-empty lines of the text file *list_path*."""
    with open(list_path) as f:
        stripped = (line.strip() for line in f)
        return [entry for entry in stripped if entry]


# Command-line entry point: parse every (xml, image dir) pair into a single
# dataset, then write the train split, the test split and the whole dataset
# ('data.json' next to the train file) as COCO-style JSON.
# Returns -1 when the XML paths or the image directories are not specified.
# NOTE: the signature and annotations are left untouched and the description
# is kept in comments rather than a docstring, because this function is handed
# to auto_run(), whose use of the signature is not visible from this file.
def main(xml_paths: List[str] = None, xml_list_path: str = None,
         img_dirs: List[str] = None, img_list_path: str = None,
         show_summary: bool = False, fx: float = 1.0, fy: float = 1.0,
         train_json_path: str = "generated/train.json", test_json_path: str = "generated/test.json",
         add_prefix: bool = False):
    import os

    if xml_list_path is not None:
        xml_paths = _read_path_list(xml_list_path)
    if xml_paths is None:
        print('Please specify --xml_paths or --xml_list_path')
        return -1

    if img_list_path is not None:
        img_dirs = _read_path_list(img_list_path)
    if img_dirs is None:
        print('Please specify --img_dirs or --img_list_path')
        return -1

    if len(xml_paths) != len(img_dirs):
        # zip() below stops at the shorter list; make the data loss visible.
        print(f'Warning: {len(xml_paths)} xml paths but {len(img_dirs)} image dirs; '
              'unpaired entries are ignored')

    dataset = NDLDataset()
    for xml_path, img_dir in zip(xml_paths, img_dirs):
        dataset.parse(xml_path, img_dir)
    if show_summary:
        dataset.summary()

    train_dataset, test_dataset = dataset.train_test_split()
    train_json = train_dataset.to_coco_fmt(fx=fx, fy=fy, add_prefix=add_prefix)
    json_to_file(train_json, train_json_path)
    test_json = test_dataset.to_coco_fmt(fx=fx, fy=fy, add_prefix=add_prefix)
    json_to_file(test_json, test_json_path)

    # whole data annotation
    data_json_path = os.path.join(
        os.path.dirname(train_json_path), 'data.json')
    data_json = dataset.to_coco_fmt(fx=fx, fy=fy, add_prefix=add_prefix)
    json_to_file(data_json, data_json_path)


if __name__ == '__main__':
    auto_run(main)