# Copyright (c) OpenMMLab. All rights reserved.
import shutil
import sys
import time
from pathlib import Path

import lmdb

from mmocr.utils import list_from_file


def lmdb_converter(img_list_file,
                   output,
                   batch_size=1000,
                   coding='utf-8',
                   lmdb_map_size=109951162776):
    """Write the lines of a list file into an LMDB database.

    Every line is stored under its zero-based index (as a decimal string)
    and the number of lines is stored under the key ``total_number``. Keys
    and values are encoded with ``coding``.

    If ``output`` already exists as a directory, the user is asked on stdin
    whether to delete it; answering ``N``/``n`` returns without writing
    anything, any other answer than ``Y``/``y`` repeats the question.

    Args:
        img_list_file (str): File read with ``mmocr.utils.list_from_file``;
            each returned line becomes one record.
        output (str): Directory of the LMDB database to create.
        batch_size (int): Number of records written per write transaction.
        coding (str): Encoding applied to keys and values.
        lmdb_map_size (int): Passed to ``lmdb.open`` as ``map_size``.
    """
    # read img_list_file
    lines = list_from_file(img_list_file)
    total = len(lines)

    # create lmdb database
    output_dir = Path(output)
    if output_dir.is_dir():
        while True:
            print('%s already exist, delete or not? [Y/n]' % output)
            answer = input().strip()
            if answer in ['Y', 'y']:
                shutil.rmtree(output)
                break
            if answer in ['N', 'n']:
                return
    print('create database %s' % output)
    output_dir.mkdir(parents=True, exist_ok=False)
    env = lmdb.open(output, map_size=lmdb_map_size)

    # Close the environment even if a write fails, so the file handle and
    # memory map do not leak.
    try:
        # build lmdb
        beg_time = time.strftime('%H:%M:%S')
        for beg_index in range(0, total, batch_size):
            end_index = min(beg_index + batch_size, total)
            sys.stdout.write('\r[%s-%s], processing [%d-%d] / %d' %
                             (beg_time, time.strftime('%H:%M:%S'), beg_index,
                              end_index, total))
            sys.stdout.flush()
            batch = [(str(index).encode(coding), lines[index].encode(coding))
                     for index in range(beg_index, end_index)]
            # One write transaction per batch.
            with env.begin(write=True) as txn:
                cursor = txn.cursor()
                cursor.putmulti(batch, dupdata=False, overwrite=True)
        sys.stdout.write('\n')

        # Store the record count alongside the records.
        with env.begin(write=True) as txn:
            key = 'total_number'.encode(coding)
            value = str(total).encode(coding)
            txn.put(key, value)
    finally:
        env.close()
    print('done', flush=True)