#!/usr/bin/env python

# Copyright (c) 2022, National Diet Library, Japan
#
# This software is released under the CC BY 4.0.
# https://creativecommons.org/licenses/by/4.0/

import argparse
import xml.etree.ElementTree as ET


def parse_args(argv=None):
    """Parse the command-line options of this script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the
            default) argparse reads ``sys.argv[1:]``.

    Returns:
        The ``argparse.Namespace`` holding the parsed options.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('xml', help='input xml file path')
    parser.add_argument('-o', '--out', default=None,
                        help='output xml file path')
    parser.add_argument('-ov', '--vh_overlap_th', type=int, default=2,
                        help='How many intersecting vertical and horizontal boxes should be removed')
    # type=float is required: without it a value given on the command line
    # stays a str and the margin arithmetic in include() breaks.
    parser.add_argument('-im', '--inclusion_margin', type=float, default=0.05,
                        help='inclusion margin ratio. default 0.05')
    parser.add_argument('-co', '--category_option', default='SAME',
                        help='SAME(default) : investigate whether inclusion is only for the same category\n'
                        'SIM           : investigate inclusion for similar categories(line/block).\n'
                        'ALL           : investigate category-independent inclusions.\n')
    parser.add_argument('--rm_vh_confusion_only', action='store_true')
    parser.add_argument('--rm_inclusion_only', action='store_true')
    return parser.parse_args(argv)


def get_points(elm):
    """Return the box of *elm* as ``(x1, y1, x2, y2)`` integers.

    The corners are computed from the element's ``X``, ``Y``, ``WIDTH``
    and ``HEIGHT`` attributes; a missing attribute raises ``KeyError``.
    """
    x1 = int(elm.attrib['X'])
    y1 = int(elm.attrib['Y'])
    x2 = x1 + int(elm.attrib['WIDTH'])
    y2 = y1 + int(elm.attrib['HEIGHT'])
    return x1, y1, x2, y2


def vh_comp(elm_a, elm_b):
    """Return True when both boxes have the same orientation.

    The sign of ``WIDTH - HEIGHT`` is compared for the two elements.
    The result is False when the signs differ and also when either box
    is square, because the product is then zero.
    """
    v1 = int(elm_a.attrib['WIDTH']) - int(elm_a.attrib['HEIGHT'])
    v2 = int(elm_b.attrib['WIDTH']) - int(elm_b.attrib['HEIGHT'])
    return v1 * v2 > 0


def vh_overlapping(elm_a, elm_b):
    """Return True when two differently oriented boxes intersect.

    Boxes for which ``vh_comp`` is True (vert/vert or hori/hori) never
    count. Boxes that merely touch along an edge count as intersecting.
    """
    if vh_comp(elm_a, elm_b):  # vert vert or hori hori
        return False

    a_x1, a_y1, a_x2, a_y2 = get_points(elm_a)
    b_x1, b_y1, b_x2, b_y2 = get_points(elm_b)
    # c ... intersection
    c_x1 = max(a_x1, b_x1)
    c_y1 = max(a_y1, b_y1)
    c_x2 = min(a_x2, b_x2)
    c_y2 = min(a_y2, b_y2)

    # An inverted rectangle means there is no intersection.
    return not ((c_x1 > c_x2) or (c_y1 > c_y2))


def refine_vh_confusion(root, overlap_th):
    """Remove LINE elements that cross too many opposite-oriented lines.

    For every page under *root*, each ``LINE`` element is compared with
    the other ``LINE`` elements of the same ``TYPE``. When the number of
    vertical/horizontal intersections reaches *overlap_th*, the element
    is removed from its page.

    Args:
        root: Root element whose children are pages carrying an
            ``IMAGENAME`` attribute.
        overlap_th: Number of intersections that triggers removal.

    Returns:
        The same *root*, modified in place.
    """
    print('Refine VH Confusion')
    for page in root:
        print(page.attrib['IMAGENAME'])
        # Walk a reversed snapshot so that removing from the live page
        # cannot disturb the outer iteration.
        for elm in reversed(list(page)):
            if elm.tag != 'LINE':
                continue
            # vh overlap count
            vh_overlap_count = 0
            for elm_ref in page:
                if elm_ref is elm:
                    # A box must not be counted against itself (a square
                    # box would otherwise "overlap" with itself).
                    continue
                if elm_ref.tag != elm.tag:
                    continue
                if elm.attrib['TYPE'] != elm_ref.attrib['TYPE']:
                    continue
                if vh_overlapping(elm, elm_ref):
                    vh_overlap_count += 1
                    if vh_overlap_count >= overlap_th:
                        page.remove(elm)
                        break
    return root


def include(parent, child, margin=0.05):
    """Return True when *child* lies inside *parent* within a tolerance.

    The tolerance on each side is *margin* times the child's width
    (horizontally) or height (vertically). Two boxes with identical
    coordinates are never reported as included, which also prevents an
    element from being treated as its own parent.
    """
    p_x1, p_y1, p_x2, p_y2 = get_points(parent)
    c_x1, c_y1, c_x2, c_y2 = get_points(child)
    if (p_x1, p_y1, p_x2, p_y2) == (c_x1, c_y1, c_x2, c_y2):
        return False

    w_m = int(child.attrib['WIDTH']) * margin
    h_m = int(child.attrib['HEIGHT']) * margin
    # All four edges use inclusive comparisons so that a child flush with
    # any parent edge is treated the same way.
    return ((p_x1 - w_m <= c_x1) and (p_y1 - h_m <= c_y1)
            and (p_x2 + w_m >= c_x2) and (p_y2 + h_m >= c_y2))


def refine_inclusion(root, margin=0.05, category_option='SAME'):
    """Remove elements that are contained in another element of the page.

    Args:
        root: Root element whose children are pages carrying an
            ``IMAGENAME`` attribute.
        margin: Tolerance ratio forwarded to ``include``.
        category_option: ``'SAME'`` only compares elements with equal
            ``TYPE``; ``'SIM'`` only compares elements with equal tag;
            any other value compares every pair.

    Returns:
        The same *root*, modified in place.
    """
    print('Refine inclusion')
    for page in root:
        print(page.attrib['IMAGENAME'])
        # Reversed snapshot: removal from the live page is safe.
        for elm in reversed(list(page)):  # child
            for elm_ref in page:  # parent
                if category_option == 'SAME':
                    if elm.attrib['TYPE'] != elm_ref.attrib['TYPE']:
                        continue
                elif category_option == 'SIM':
                    if elm.tag != elm_ref.tag:
                        continue
                if include(parent=elm_ref, child=elm, margin=margin):
                    page.remove(elm)
                    break
    return root


def refine(xml, out_xml, vh_overlap_th=2, margin=0.05,
           category_option='SAME', vh=True, inc=True):
    """Refine the XML file *xml* and write the result to *out_xml*.

    Args:
        xml: Path of the input XML file.
        out_xml: Path of the output XML file (written as UTF-8).
        vh_overlap_th: Threshold forwarded to ``refine_vh_confusion``.
        margin: Margin ratio forwarded to ``refine_inclusion``.
        category_option: Category mode forwarded to ``refine_inclusion``.
        vh: Run the vertical/horizontal confusion pass when True.
        inc: Run the inclusion pass when True.
    """
    tree = ET.parse(xml)
    root = tree.getroot()
    # Both passes mutate the tree in place, so the tree can be written
    # directly afterwards.
    if vh:
        root = refine_vh_confusion(root, vh_overlap_th)
    if inc:
        root = refine_inclusion(root, margin, category_option)
    tree.write(out_xml, encoding='UTF-8')
    return


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: Optional list of argument strings; ``None`` means
            ``sys.argv[1:]``.
    """
    args = parse_args(argv)
    out_xml_path = 'out.xml' if args.out is None else args.out
    refine(xml=args.xml,
           out_xml=out_xml_path,
           vh_overlap_th=args.vh_overlap_th,
           margin=args.inclusion_margin,
           category_option=args.category_option,
           vh=not args.rm_inclusion_only,
           inc=not args.rm_vh_confusion_only)
    print('Export: {}'.format(out_xml_path))


if __name__ == '__main__':
    main()