|
import sys |
|
import traceback |
|
import pickle |
|
import os |
|
import concurrent.futures |
|
from tqdm import tqdm |
|
import time |
|
from font_dataset.font import load_fonts |
|
from font_dataset.layout import generate_font_image |
|
from font_dataset.text import CorpusGeneratorManager, UnqualifiedFontException |
|
from font_dataset.background import background_image_generator |
|
|
|
|
|
# Shard selection: which slice of the font list this process handles, read
# from the first two command-line arguments.
global_script_index, global_script_index_total = int(sys.argv[1]), int(sys.argv[2])

print(f"Mission {global_script_index} / {global_script_index_total}")

# Worker count setting (not referenced by the code visible in this section).
num_workers = 32

# Fonts whose language is "CJK" get this many times more images per font.
cjk_ratio = 3

# Images generated per non-CJK font for each split.
train_cnt, val_cnt, test_cnt = 100, 5, 30

# Per-font counts for CJK fonts, scaled by cjk_ratio and truncated to int.
train_cnt_cjk, val_cnt_cjk, test_cnt_cjk = (
    int(base_cnt * cjk_ratio) for base_cnt in (train_cnt, val_cnt, test_cnt)
)

# Root output directory; each split gets its own sub-directory below it.
dataset_path = "./dataset/font_img"
os.makedirs(dataset_path, exist_ok=True)

# Fonts rejected at generation time are appended to a per-run log file (named
# with the start timestamp) and remembered in memory so they are skipped.
unqualified_log_file_name = f"unqualified_font_{time.time()}.txt"
runtime_exclusion_list = []

# Shared generation resources: the font list with its exclusion predicate,
# the corpus manager, and the background image iterator.
fonts, exclusion_rule = load_fonts()
corpus_manager = CorpusGeneratorManager()
images = background_image_generator()
|
|
|
|
|
def generate_dataset(dataset_type: str, cnt: int):
    """Generate this shard's images and pickled labels for one dataset split.

    The font list is divided into ``global_script_index_total`` contiguous
    shards and only shard ``global_script_index`` (1-based) is processed.
    Files are written to ``<dataset_path>/<dataset_type>`` as
    ``font_<i>_img_<j>.jpg`` plus a pickled label ``font_<i>_img_<j>.bin``.
    Samples whose image and label both already exist are skipped, so an
    interrupted run can be resumed.

    Args:
        dataset_type: Split name, used as the output sub-directory name.
        cnt: Number of images per font. Fonts whose ``language`` is "CJK"
            get ``int(cnt * cjk_ratio)`` images instead.
    """
    dataset_bath_dir = os.path.join(dataset_path, dataset_type)
    os.makedirs(dataset_bath_dir, exist_ok=True)

    def _generate_single(args):
        """Generate and store one (font index, image index, font) sample."""
        i, j, font = args
        print(
            f"Generating {dataset_type} font: {font.path} {i} / {len(fonts)}, image {j}"
        )

        if exclusion_rule(font):
            print(f"Excluded font: {font.path}")
            return
        if font.path in runtime_exclusion_list:
            print(f"Excluded font: {font.path}")
            return

        # The target paths do not change between retries, so build them once.
        image_file_path = os.path.join(dataset_bath_dir, f"font_{i}_img_{j}.jpg")
        label_file_path = os.path.join(dataset_bath_dir, f"font_{i}_img_{j}.bin")
        label_tmp_path = label_file_path + ".tmp"

        # NOTE(review): any error other than UnqualifiedFontException is
        # retried indefinitely; a failure that repeats on every attempt will
        # keep this loop spinning.
        while True:
            try:
                if os.path.exists(image_file_path) and os.path.exists(label_file_path):
                    return

                im = next(images)
                im, label = generate_font_image(im, font, corpus_manager)

                im.save(image_file_path)
                # Write the label to a temporary file and publish it with an
                # atomic rename. A failed or interrupted dump therefore never
                # leaves a truncated ".bin" that the existence check above
                # would accept as a finished sample.
                with open(label_tmp_path, "wb") as label_file:
                    pickle.dump(label, label_file)
                os.replace(label_tmp_path, label_file_path)
                return
            except UnqualifiedFontException as e:
                print(f"SKIPPING Unqualified font: {e.font.path}")
                # Remember the font so its remaining samples are skipped, and
                # record it in the per-run log file.
                runtime_exclusion_list.append(e.font.path)
                with open(unqualified_log_file_name, "a+") as f:
                    f.write(f"{e.font.path}\n")
                return
            except Exception:
                traceback.print_exc()
                continue

    # Contiguous slice of the font list owned by this shard.
    shard_start = (global_script_index - 1) * len(fonts) // global_script_index_total
    shard_stop = global_script_index * len(fonts) // global_script_index_total

    work_list = []
    for i in range(shard_start, shard_stop):
        font = fonts[i]
        if font.language == "CJK":
            # int() matches how the module-level *_cnt_cjk values are derived
            # and keeps range() valid when cjk_ratio is not an integer.
            true_cnt = int(cnt * cjk_ratio)
        else:
            true_cnt = cnt
        work_list.extend((i, j, font) for j in range(true_cnt))

    for work_item in tqdm(work_list):
        _generate_single(work_item)
|
|
|
|
|
# Build every split in order, each with its own per-font image count.
for split_name, per_font_cnt in (
    ("train", train_cnt),
    ("val", val_cnt),
    ("test", test_cnt),
):
    generate_dataset(split_name, per_font_cnt)
|
|