Spaces:

tomofi
/

NDLOCR

Build error

App Files Files Community

NDLOCR / src /ndl_layout /tools /xml_refiner.py

3v324v23

Add files

c9019cd over 2 years ago

raw

history blame

4.9 kB

	#!/usr/bin/env python

	# Copyright (c) 2022, National Diet Library, Japan
	#
	# This software is released under the CC BY 4.0.
	# https://creativecommons.org/licenses/by/4.0/

	import argparse
	import xml.etree.ElementTree as ET


	def parse_args():
	    """Parse the command-line arguments of the XML refiner.

	    Arguments are read from ``sys.argv``.

	    Returns:
	        argparse.Namespace: with the attributes ``xml``, ``out``,
	        ``vh_overlap_th`` (int), ``inclusion_margin`` (float),
	        ``category_option``, ``rm_vh_confusion_only`` and
	        ``rm_inclusion_only``.
	    """
	    parser = argparse.ArgumentParser(
	        formatter_class=argparse.RawTextHelpFormatter)
	    parser.add_argument('xml', help='input xml file path')
	    parser.add_argument('-o', '--out', default=None,
	                        help='output xml file path')
	    parser.add_argument('-ov', '--vh_overlap_th', type=int, default=2,
	                        help='How many intersecting vertical and horizontal boxes should be removed')
	    # type=float is required: without it a margin given on the command line
	    # stays a str and the margin arithmetic in include() fails.
	    parser.add_argument('-im', '--inclusion_margin', type=float, default=0.05,
	                        help='inclusion margin ratio. default 0.05')
	    parser.add_argument('-co', '--category_option', default='SAME',
	                        help='SAME(default) : investigate whether inclusion is only for the same category\n'
	                        'SIM : investigate inclusion for similar categories(line/block).\n'
	                        'ALL : investigate category-independent inclusions.\n')
	    parser.add_argument('--rm_vh_confusion_only', action='store_true')
	    parser.add_argument('--rm_inclusion_only', action='store_true')
	    return parser.parse_args()


	def get_points(elm):
	    """Return the bounding box of ``elm`` as ``(x1, y1, x2, y2)``.

	    The ``X``, ``Y``, ``WIDTH`` and ``HEIGHT`` attributes of the element are
	    converted with ``int``; ``(x1, y1)`` is the ``X``/``Y`` corner and
	    ``(x2, y2)`` is that corner shifted by ``WIDTH``/``HEIGHT``.
	    """
	    attrs = elm.attrib
	    left, top = int(attrs['X']), int(attrs['Y'])
	    right = left + int(attrs['WIDTH'])
	    bottom = top + int(attrs['HEIGHT'])
	    return left, top, right, bottom


	def vh_comp(elm_a, elm_b):
	    """Tell whether two boxes have the same orientation.

	    The orientation of a box is the sign of ``WIDTH - HEIGHT``. Returns True
	    only when both differences are non-zero and share the same sign; a square
	    box (difference 0) therefore never compares as "same orientation".
	    """
	    def elongation(elm):
	        # Positive: wider than tall, negative: taller than wide.
	        return int(elm.attrib['WIDTH']) - int(elm.attrib['HEIGHT'])

	    return elongation(elm_a) * elongation(elm_b) > 0


	def vh_overlapping(elm_a, elm_b):
	    """Tell whether two boxes of differing orientation intersect.

	    Returns False straight away when ``vh_comp`` reports the same orientation
	    for both boxes. Otherwise returns True when their bounding boxes
	    intersect; boxes that merely touch along an edge count as intersecting.
	    """
	    if vh_comp(elm_a, elm_b):
	        # vertical/vertical or horizontal/horizontal: not a confusion case
	        return False

	    a_left, a_top, a_right, a_bottom = get_points(elm_a)
	    b_left, b_top, b_right, b_bottom = get_points(elm_b)

	    # Corners of the candidate intersection rectangle.
	    left = max(a_left, b_left)
	    top = max(a_top, b_top)
	    right = min(a_right, b_right)
	    bottom = min(a_bottom, b_bottom)

	    # The rectangle is empty as soon as one extent is inverted.
	    return left <= right and top <= bottom


	def refine_vh_confusion(root, overlap_th):
	    """Remove LINE elements that cross too many lines of the other orientation.

	    For every page (child of ``root``), each ``LINE`` element is compared
	    with every ``LINE`` element of the same ``TYPE`` on that page. Once
	    ``vh_overlapping`` has reported ``overlap_th`` hits for an element, the
	    element is removed from the page.

	    Args:
	        root: root element whose children are pages carrying an
	            ``IMAGENAME`` attribute.
	        overlap_th: number of overlaps at which an element is removed.

	    Returns:
	        The same ``root`` element, modified in place.
	    """
	    print('Refine VH Confusion')
	    for page in root:
	        print(page.attrib['IMAGENAME'])

	        # Walk backwards so removing the current element does not disturb
	        # the positions of the elements still to be visited.
	        for candidate in reversed(page):
	            if candidate.tag != 'LINE':
	                continue

	            hits = 0
	            for other in page:
	                if other.tag != candidate.tag:
	                    continue
	                if candidate.attrib['TYPE'] != other.attrib['TYPE']:
	                    continue
	                if not vh_overlapping(candidate, other):
	                    continue

	                hits += 1
	                if hits >= overlap_th:
	                    page.remove(candidate)
	                    break
	    return root


	def include(parent, child, margin=0.05):
	    """Tell whether ``child`` lies inside ``parent``, allowing a margin.

	    Args:
	        parent: element expected to enclose ``child``.
	        child: element tested for being enclosed.
	        margin: tolerance as a ratio of the child's width (horizontally) and
	            height (vertically). Numeric strings are accepted and converted.

	    Returns:
	        False when both boxes are identical; otherwise True when every edge
	        of ``child`` lies within ``parent`` enlarged by the margin (edges that
	        coincide exactly count as inside), else False.
	    """
	    # Guard against a margin that arrives as a string (e.g. from argparse).
	    margin = float(margin)

	    p_x1, p_y1, p_x2, p_y2 = get_points(parent)
	    c_x1, c_y1, c_x2, c_y2 = get_points(child)

	    # Identical boxes are not treated as an inclusion, so an element is
	    # never reported as included in itself.
	    if (p_x1, p_y1, p_x2, p_y2) == (c_x1, c_y1, c_x2, c_y2):
	        return False

	    w_m = (c_x2 - c_x1) * margin
	    h_m = (c_y2 - c_y1) * margin

	    # All four edges use an inclusive comparison; the bottom edge used to be
	    # strict, which rejected children sharing the parent's bottom edge when
	    # the margin is zero.
	    return (p_x1 - w_m <= c_x1 and p_y1 - h_m <= c_y1
	            and p_x2 + w_m >= c_x2 and p_y2 + h_m >= c_y2)


	def refine_inclusion(root, margin=0.05, category_option='SAME'):
	    """Remove elements that are contained in another element of their page.

	    Args:
	        root: root element whose children are pages carrying an
	            ``IMAGENAME`` attribute.
	        margin: margin ratio forwarded to ``include``.
	        category_option: ``'SAME'`` only compares elements with an equal
	            ``TYPE`` attribute, ``'SIM'`` only compares elements with an
	            equal tag, any other value compares every pair.

	    Returns:
	        The same ``root`` element, modified in place.
	    """
	    print('Refine inclusion')
	    same_type_only = category_option == 'SAME'
	    same_tag_only = category_option == 'SIM'

	    for page in root:
	        print(page.attrib['IMAGENAME'])
	        # Walk backwards so removing the current element does not disturb
	        # the positions of the elements still to be visited.
	        for inner in reversed(page):
	            for outer in page:
	                if same_type_only and inner.attrib['TYPE'] != outer.attrib['TYPE']:
	                    continue
	                if same_tag_only and inner.tag != outer.tag:
	                    continue

	                if include(parent=outer, child=inner, margin=margin):
	                    page.remove(inner)
	                    break
	    return root


	def refine(xml, out_xml, vh_overlap_th=2, margin=0.05, category_option='SAME', vh=True, inc=True):
	    """Refine a layout XML file and write the result to ``out_xml``.

	    Args:
	        xml: path of the XML file to parse.
	        out_xml: path the refined tree is written to (UTF-8).
	        vh_overlap_th: threshold forwarded to ``refine_vh_confusion``.
	        margin: margin ratio forwarded to ``refine_inclusion``.
	        category_option: category mode forwarded to ``refine_inclusion``.
	        vh: run the vertical/horizontal confusion pass when true.
	        inc: run the inclusion pass when true.
	    """
	    document = ET.parse(xml)
	    pages = document.getroot()

	    # The confusion pass runs first, the inclusion pass second.
	    if vh:
	        pages = refine_vh_confusion(pages, vh_overlap_th)
	    if inc:
	        pages = refine_inclusion(pages, margin, category_option)

	    document.write(out_xml, encoding='UTF-8')


	def main():
	    """Command-line entry point: parse the arguments, refine, report."""
	    cli = parse_args()

	    # Fall back to 'out.xml' when no output path was given.
	    destination = cli.out if cli.out is not None else 'out.xml'

	    refine(
	        xml=cli.xml,
	        out_xml=destination,
	        vh_overlap_th=cli.vh_overlap_th,
	        margin=cli.inclusion_margin,
	        category_option=cli.category_option,
	        # Each "*_only" flag switches the other pass off.
	        vh=not cli.rm_inclusion_only,
	        inc=not cli.rm_vh_confusion_only,
	    )

	    print('Export: {}'.format(destination))


	# Run the command-line entry point only when executed as a script.
	if __name__ == '__main__':
	main()