NDLOCR / src /ndl_layout /tools /xml_refiner.py
3v324v23's picture
Add files
c9019cd
raw
history blame
4.9 kB
#!/usr/bin/env python
# Copyright (c) 2022, National Diet Library, Japan
#
# This software is released under the CC BY 4.0.
# https://creativecommons.org/licenses/by/4.0/
import argparse
import xml.etree.ElementTree as ET
def parse_args():
parser = argparse.ArgumentParser(
formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('xml', help='input xml file path')
parser.add_argument('-o', '--out', default=None,
help='output xml file path')
parser.add_argument('-ov', '--vh_overlap_th', type=int, default=2,
help='How many intersecting vertical and horizontal boxes should be removed')
parser.add_argument('-im', '--inclusion_margin', default=0.05,
help='inclusion margin ratio. default 0.05')
parser.add_argument('-co', '--category_option', default='SAME',
help='SAME(default) : investigate whether inclusion is only for the same category\n'
'SIM : investigate inclusion for similar categories(line/block).\n'
'ALL : investigate category-independent inclusions.\n')
parser.add_argument('--rm_vh_confusion_only', action='store_true')
parser.add_argument('--rm_inclusion_only', action='store_true')
return parser.parse_args()
def get_points(elm):
x1 = int(elm.attrib['X'])
y1 = int(elm.attrib['Y'])
x2 = x1 + int(elm.attrib['WIDTH'])
y2 = y1 + int(elm.attrib['HEIGHT'])
return x1, y1, x2, y2
def vh_comp(elm_a, elm_b):
v1 = int(elm_a.attrib['WIDTH'])-int(elm_a.attrib['HEIGHT'])
v2 = int(elm_b.attrib['WIDTH'])-int(elm_b.attrib['HEIGHT'])
return v1*v2 > 0
def vh_overlapping(elm_a, elm_b):
if vh_comp(elm_a, elm_b):
# vert vert or hori hori
return False
else:
a_x1, a_y1, a_x2, a_y2 = get_points(elm_a)
b_x1, b_y1, b_x2, b_y2 = get_points(elm_b)
# c ... intersection
c_x1 = max(a_x1, b_x1)
c_y1 = max(a_y1, b_y1)
c_x2 = min(a_x2, b_x2)
c_y2 = min(a_y2, b_y2)
if (c_x1 > c_x2) or (c_y1 > c_y2):
return False # No intersection
else:
return True
def refine_vh_confusion(root, overlap_th):
print('Refine VH Confusion')
for page in root:
print(page.attrib['IMAGENAME'])
for elm in reversed(page):
# vh overlap count
vh_overlap_count = 0
for elm_ref in page:
if elm.tag == 'LINE' and elm.tag == elm_ref.tag and elm.attrib['TYPE'] == elm_ref.attrib['TYPE']:
if vh_overlapping(elm, elm_ref):
vh_overlap_count += 1
if vh_overlap_count >= overlap_th:
page.remove(elm)
break
return root
def include(parent, child, margin=0.05):
p_x1, p_y1, p_x2, p_y2 = get_points(parent)
c_x1, c_y1, c_x2, c_y2 = get_points(child)
if p_x1 == c_x1 and p_y1 == c_y1 and p_x2 == c_x2 and p_y2 == c_y2:
return False
w_m = int(child.attrib['WIDTH']) * margin
h_m = int(child.attrib['HEIGHT']) * margin
if (p_x1-w_m <= c_x1) and (p_y1-h_m <= c_y1) and (p_x2+w_m >= c_x2) and (p_y2+h_m > c_y2):
return True
else:
return False
def refine_inclusion(root, margin=0.05, category_option='SAME'):
print('Refine inclusion')
for page in root:
print(page.attrib['IMAGENAME'])
for elm in reversed(page): # child
include_flag = False
for elm_ref in page: # parent
if category_option == 'SAME':
if elm.attrib['TYPE'] != elm_ref.attrib['TYPE']:
continue
elif category_option == 'SIM':
if elm.tag != elm_ref.tag:
continue
include_flag = include(parent=elm_ref, child=elm, margin=margin)
if include_flag:
page.remove(elm)
break
return root
def refine(xml, out_xml, vh_overlap_th=2, margin=0.05, category_option='SAME', vh=True, inc=True):
tree = ET.parse(xml)
root = tree.getroot()
if vh:
root = refine_vh_confusion(root, vh_overlap_th)
if inc:
root = refine_inclusion(root, margin, category_option)
tree.write(out_xml, encoding='UTF-8')
return
def main():
args = parse_args()
out_xml_path = 'out.xml'
if args.out is not None:
out_xml_path = args.out
refine(xml=args.xml,
out_xml=out_xml_path,
vh_overlap_th=args.vh_overlap_th,
margin=args.inclusion_margin,
category_option=args.category_option,
vh=not args.rm_inclusion_only,
inc=not args.rm_vh_confusion_only)
print('Export: {}'.format(out_xml_path))
if __name__ == '__main__':
main()