|
|
|
|
|
|
|
|
|
|
|
|
|
import pathlib |
|
import json |
|
import lmdb |
|
|
|
from xmlparser import XMLRawDataset, ListRawDataset |
|
|
|
import argparse |
|
|
|
|
|
class Env:
    """Buffered writer around an LMDB environment.

    Key/value pairs are staged in ``self.cache`` (bytes -> bytes) and written
    to LMDB in one write transaction each time ``interval`` lines have been
    completed, or whenever ``writeCache`` is called explicitly.
    """

    def __init__(self, output_path, interval_writeCache=1000):
        """Open (or create) the LMDB environment at ``output_path``.

        Args:
            output_path: Location of the LMDB environment; converted with
                ``str()`` before being handed to ``lmdb.open``.
            interval_writeCache: Number of finished lines between automatic
                flushes of the cache. Must be at least 1.

        Raises:
            ValueError: If ``interval_writeCache`` is smaller than 1.
        """
        if interval_writeCache < 1:
            raise ValueError(
                f'interval_writeCache must be >= 1, got {interval_writeCache!r}'
            )
        self.output_path = output_path
        # map_size is 2**40 bytes (1 TiB), same value as the original literal.
        self.env = lmdb.open(str(output_path), map_size=1 << 40)
        self.cache = {}
        self.n = 0  # number of lines finished so far
        self.interval = interval_writeCache

    def finish_line(self):
        """Count one finished line and flush the cache every ``interval`` lines."""
        self.n += 1
        # Honour the configured interval instead of a hard-coded 1000.
        if self.n % self.interval == 0:
            self.writeCache()

    def writeCache(self):
        """Write all staged pairs in a single transaction, then clear the cache."""
        with self.env.begin(write=True) as txn:
            for key, value in self.cache.items():
                txn.put(key, value)
        # Drop the staged entries only after the transaction block has exited.
        self.cache = {}
        print(f'Written {self.n} lines @ {self.output_path}')
|
|
|
|
|
def createDataset(input_path, output_path, db_type='xml', dry_run=False):
    """Convert a raw line dataset into an LMDB database.

    For every ``(image, line)`` pair yielded by the raw dataset, four entries
    are stored under keys prefixed with the zero-padded, 9-digit line index:
    ``-direction``, ``-label``, ``-cattrs`` (JSON-encoded) and ``-image``.
    Two metadata entries, ``dbtype`` and ``n_line``, are stored as well.

    Args:
        input_path: Passed unchanged to the raw dataset constructor.
        output_path: Directory of the LMDB environment; created (including
            parents) when it does not exist.
        db_type: ``'xml'`` builds the source with ``XMLRawDataset.from_list``,
            ``'list'`` with ``ListRawDataset``.
        dry_run: When true, only construct the raw dataset and return without
            opening or writing the database.

    Raises:
        ValueError: If ``db_type`` is neither ``'xml'`` nor ``'list'``.
    """
    # Reject unknown types up front instead of failing later on an unbound name.
    if db_type == 'xml':
        make_dataset = XMLRawDataset.from_list
    elif db_type == 'list':
        make_dataset = ListRawDataset
    else:
        raise ValueError(
            f"Unsupported db_type {db_type!r}; expected 'xml' or 'list'"
        )

    out_dir = pathlib.Path(output_path)
    out_dir.mkdir(parents=True, exist_ok=True)

    generator = make_dataset(
        input_path, image_type=XMLRawDataset.IMAGE_TYPE_ENCODED
    )
    if dry_run:
        return

    # Open the database at the full output path. The previous code indexed
    # the path string with [0] and so opened it at its first character.
    env = Env(out_dir)
    try:
        # NOTE(review): the stored value is 'xml' even when db_type == 'list';
        # kept unchanged because readers may depend on it -- confirm intent.
        env.cache['dbtype'.encode()] = 'xml'.encode()

        for image, line in generator:
            prefix = f'{env.n:09d}'
            env.cache[f'{prefix}-direction'.encode()] = line.get('direction').encode()
            env.cache[f'{prefix}-label'.encode()] = line.get('label').encode()
            env.cache[f'{prefix}-cattrs'.encode()] = json.dumps(line.get('cattrs')).encode()
            # Stored as yielded; presumably already encoded bytes given
            # IMAGE_TYPE_ENCODED -- TODO confirm against the raw dataset.
            env.cache[f'{prefix}-image'.encode()] = image
            env.finish_line()

        env.cache['n_line'.encode()] = str(env.n).encode()
        # Final flush for the lines left over since the last automatic one.
        env.writeCache()
    finally:
        # Release the LMDB environment even if conversion fails midway.
        env.env.close()
|
|
|
|
|
if __name__ == '__main__':
    # Command-line entry point: collect the options and run the conversion.
    cli = argparse.ArgumentParser()
    option_table = (
        ('--input_path', {'nargs': '+', 'required': True}),
        ('--output_path', {'required': True}),
        ('--db_type', {'default': 'xml', 'choices': ['xml', 'list']}),
        ('--dry-run', {'action': 'store_true'}),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    args = cli.parse_args()
    createDataset(
        args.input_path,
        args.output_path,
        args.db_type,
        dry_run=args.dry_run,
    )
|
|