# Source: NDLOCR / src/text_recognition/create_xmldataset.py
# (Repository page residue: uploaded by 3v324v23, commit c9019cd "Add files", 2.32 kB.)
# Copyright (c) 2022, National Diet Library, Japan
#
# This software is released under the CC BY 4.0.
# https://creativecommons.org/licenses/by/4.0/
import pathlib
import json
import lmdb
from xmlparser import XMLRawDataset, ListRawDataset
import argparse
class Env:
    """Buffered writer around an LMDB environment.

    Key/value pairs are staged in ``cache`` and committed to LMDB in one
    write transaction by :meth:`writeCache`. :meth:`finish_line` counts
    completed records and commits the staged pairs every ``interval``
    records. Keys and values are handed to ``txn.put`` unchanged, so callers
    are expected to stage ``bytes``.
    """

    def __init__(self, output_path, interval_writeCache=1000):
        """Open the LMDB environment at *output_path*.

        Args:
            output_path: Location of the LMDB environment; converted with
                ``str()`` before being passed to ``lmdb.open``.
            interval_writeCache: Number of finished lines between automatic
                flushes of the cache. Must be at least 1.

        Raises:
            ValueError: If *interval_writeCache* is smaller than 1.
        """
        if interval_writeCache < 1:
            raise ValueError(
                f'interval_writeCache must be >= 1, got {interval_writeCache!r}'
            )
        self.output_path = output_path
        # map_size = 2**40 bytes (1 TiB).
        self.env = lmdb.open(str(output_path), map_size=1099511627776)
        self.cache = dict()
        self.n = 0
        self.interval = interval_writeCache

    def finish_line(self):
        """Count one completed record and flush when the interval is reached."""
        self.n += 1
        # Honour the configured interval instead of a hard-coded 1000.
        if self.n % self.interval == 0:
            self.writeCache()

    def writeCache(self):
        """Commit every staged pair in one write transaction and reset the cache."""
        with self.env.begin(write=True) as txn:
            for k, v in self.cache.items():
                txn.put(k, v)
        self.cache = {}
        print(f'Written {self.n} lines @ {self.output_path}')
def createDataset(input_path, output_path, db_type='xml', dry_run=False):
    """Build an LMDB dataset from line records produced by ``xmlparser``.

    For every ``(image, line)`` pair yielded by the raw dataset, four entries
    are stored under a zero-padded, nine-digit record index ``NNNNNNNNN``:
    ``NNNNNNNNN-direction``, ``NNNNNNNNN-label``, ``NNNNNNNNN-cattrs``
    (JSON-encoded) and ``NNNNNNNNN-image``. Two extra keys are written:
    ``dbtype`` and ``n_line`` (the total number of records, as a decimal
    string).

    Args:
        input_path: Input specification forwarded to
            ``XMLRawDataset.from_list`` or ``ListRawDataset``.
        output_path: Directory of the LMDB environment; created if missing.
        db_type: ``'xml'`` or ``'list'``; selects the raw dataset class.
        dry_run: If true, only create the output directory and construct
            the raw dataset, then return without writing anything.

    Raises:
        ValueError: If *db_type* is neither ``'xml'`` nor ``'list'``.
    """
    # Fail early instead of hitting an unbound ``generator`` further down.
    if db_type not in ('xml', 'list'):
        raise ValueError(f"Unknown db_type {db_type!r}; expected 'xml' or 'list'")

    p = pathlib.Path(output_path)
    p.mkdir(parents=True, exist_ok=True)

    if db_type == 'xml':
        generator = XMLRawDataset.from_list(input_path, image_type=XMLRawDataset.IMAGE_TYPE_ENCODED)
    else:
        generator = ListRawDataset(input_path, image_type=XMLRawDataset.IMAGE_TYPE_ENCODED)

    if dry_run:
        return

    # generate database
    # Open the environment at the full output path; indexing the path with
    # [0] would open it at the first character of the string instead.
    env = Env(output_path)
    # NOTE(review): 'xml' is stored even when db_type == 'list'; kept as-is
    # because readers may rely on it -- confirm this is intended.
    env.cache['dbtype'.encode()] = 'xml'.encode()
    for g, line in generator:
        prefix = f'{env.n:09d}'
        env.cache[f'{prefix}-direction'.encode()] = line.get('direction').encode()
        env.cache[f'{prefix}-label'.encode()] = line.get('label').encode()
        env.cache[f'{prefix}-cattrs'.encode()] = json.dumps(line.get('cattrs')).encode()
        # Stored unchanged; presumably encoded image bytes given
        # IMAGE_TYPE_ENCODED -- verify against xmlparser.
        env.cache[f'{prefix}-image'.encode()] = g
        env.finish_line()

    env.cache['n_line'.encode()] = str(env.n).encode()
    env.writeCache()
if __name__ == '__main__':
    # Command-line entry point: collect the options and forward them to
    # createDataset. Flags are registered in a fixed order from a table.
    cli = argparse.ArgumentParser()
    flag_table = (
        ('--input_path', {'nargs': '+', 'required': True}),
        ('--output_path', {'required': True}),
        ('--db_type', {'default': 'xml', 'choices': ['xml', 'list']}),
        ('--dry-run', {'action': 'store_true'}),
    )
    for flag, options in flag_table:
        cli.add_argument(flag, **options)

    args = cli.parse_args()
    createDataset(
        args.input_path,
        args.output_path,
        args.db_type,
        dry_run=args.dry_run,
    )