openocr demo (Hugging Face Space)

Note: this diff view is limited to 50 files because the commit contains too many changes; see the raw diff for the complete change set.
- app.py +127 -0
- configs/det/dbnet/repvit_db.yml +173 -0
- configs/rec/abinet/resnet45_trans_abinet_lang.yml +94 -0
- configs/rec/abinet/resnet45_trans_abinet_wo_lang.yml +93 -0
- configs/rec/abinet/svtrv2_abinet_lang.yml +130 -0
- configs/rec/abinet/svtrv2_abinet_wo_lang.yml +128 -0
- configs/rec/aster/resnet31_lstm_aster_tps_on.yml +93 -0
- configs/rec/aster/svtrv2_aster.yml +127 -0
- configs/rec/aster/svtrv2_aster_tps_on.yml +102 -0
- configs/rec/autostr/autostr_lstm_aster_tps_on.yml +95 -0
- configs/rec/busnet/svtrv2_busnet.yml +135 -0
- configs/rec/busnet/svtrv2_busnet_pretraining.yml +134 -0
- configs/rec/busnet/vit_busnet.yml +104 -0
- configs/rec/busnet/vit_busnet_pretraining.yml +104 -0
- configs/rec/cam/convnextv2_cam_tps_on.yml +118 -0
- configs/rec/cam/convnextv2_tiny_cam_tps_on.yml +118 -0
- configs/rec/cam/svtrv2_cam_tps_on.yml +123 -0
- configs/rec/cdistnet/resnet45_trans_cdistnet.yml +93 -0
- configs/rec/cdistnet/svtrv2_cdistnet.yml +139 -0
- configs/rec/cppd/svtr_base_cppd.yml +123 -0
- configs/rec/cppd/svtr_base_cppd_ch.yml +126 -0
- configs/rec/cppd/svtr_base_cppd_h8.yml +123 -0
- configs/rec/cppd/svtr_base_cppd_syn.yml +124 -0
- configs/rec/cppd/svtrv2_cppd.yml +150 -0
- configs/rec/dan/resnet45_fpn_dan.yml +98 -0
- configs/rec/dan/svtrv2_dan.yml +130 -0
- configs/rec/focalsvtr/focalsvtr_ctc.yml +137 -0
- configs/rec/gtc/svtrv2_lnconv_nrtr_gtc.yml +168 -0
- configs/rec/gtc/svtrv2_lnconv_smtr_gtc_long_infer.yml +151 -0
- configs/rec/gtc/svtrv2_lnconv_smtr_gtc_smtr_long.yml +150 -0
- configs/rec/gtc/svtrv2_lnconv_smtr_gtc_stream.yml +152 -0
- configs/rec/igtr/readme.md +189 -0
- configs/rec/igtr/svtr_base_ds_igtr.yml +157 -0
- configs/rec/lister/focalsvtr_lister_wo_fem_maxratio12.yml +133 -0
- configs/rec/lister/svtrv2_lister_wo_fem_maxratio12.yml +138 -0
- configs/rec/lpv/svtr_base_lpv.yml +124 -0
- configs/rec/lpv/svtr_base_lpv_wo_glrm.yml +123 -0
- configs/rec/lpv/svtrv2_lpv.yml +147 -0
- configs/rec/lpv/svtrv2_lpv_wo_glrm.yml +146 -0
- configs/rec/maerec/vit_nrtr.yml +116 -0
- configs/rec/matrn/resnet45_trans_matrn.yml +95 -0
- configs/rec/matrn/svtrv2_matrn.yml +130 -0
- configs/rec/mgpstr/svtrv2_mgpstr_only_char.yml +140 -0
- configs/rec/mgpstr/vit_base_mgpstr_only_char.yml +111 -0
- configs/rec/mgpstr/vit_large_mgpstr_only_char.yml +110 -0
- configs/rec/mgpstr/vit_mgpstr.yml +110 -0
- configs/rec/mgpstr/vit_mgpstr_only_char.yml +110 -0
- configs/rec/moran/resnet31_lstm_moran.yml +92 -0
- configs/rec/nrtr/focalsvtr_nrtr_maxraio12.yml +145 -0
- configs/rec/nrtr/nrtr.yml +107 -0
app.py
ADDED
@@ -0,0 +1,127 @@
import os

import gradio as gr  # gradio==4.20.0

os.environ['FLAGS_allocator_strategy'] = 'auto_growth'
import cv2
import numpy as np
import json
import time
from PIL import Image
from tools.infer_e2e import OpenOCR, check_and_download_font, draw_ocr_box_txt

drop_score = 0.01
text_sys = OpenOCR(drop_score=drop_score)
# warm up 5 times
if True:
    img = np.random.uniform(0, 255, [640, 640, 3]).astype(np.uint8)
    for i in range(5):
        res = text_sys(img_numpy=img)
font_path = './simfang.ttf'
check_and_download_font(font_path)


def main(input_image):
    img = input_image[:, :, ::-1]
    starttime = time.time()
    results, time_dict, mask = text_sys(img_numpy=img, return_mask=True)
    elapse = time.time() - starttime
    save_pred = json.dumps(results[0], ensure_ascii=False)
    image = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    boxes = [res['points'] for res in results[0]]
    txts = [res['transcription'] for res in results[0]]
    scores = [res['score'] for res in results[0]]
    draw_img = draw_ocr_box_txt(
        image,
        boxes,
        txts,
        scores,
        drop_score=drop_score,
        font_path=font_path,
    )
    mask = mask[0, 0, :, :] > 0.3
    return save_pred, elapse, draw_img, mask.astype('uint8') * 255


def get_all_file_names_including_subdirs(dir_path):
    all_file_names = []

    for root, dirs, files in os.walk(dir_path):
        for file_name in files:
            all_file_names.append(os.path.join(root, file_name))

    file_names_only = [os.path.basename(file) for file in all_file_names]
    return file_names_only


def list_image_paths(directory):
    image_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff')

    image_paths = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.lower().endswith(image_extensions):
                relative_path = os.path.relpath(os.path.join(root, file),
                                                directory)
                full_path = os.path.join(directory, relative_path)
                image_paths.append(full_path)
    image_paths = sorted(image_paths)
    return image_paths


def find_file_in_current_dir_and_subdirs(file_name):
    for root, dirs, files in os.walk('.'):
        if file_name in files:
            relative_path = os.path.join(root, file_name)
            return relative_path


def predict1(input_image, Model_type, OCR_type):
    if OCR_type == 'E2E':
        return 11111, 'E2E', input_image
    elif OCR_type == 'STR':
        return 11111, 'STR', input_image
    else:
        return 11111, 'STD', input_image


e2e_img_example = list_image_paths('./OCR_e2e_img')

if __name__ == '__main__':
    css = '.image-container img { width: 100%; max-height: 320px;}'

    with gr.Blocks(css=css) as demo:
        gr.HTML("""
            <h1 style='text-align: center;'>OpenOCR</h1>""")
        with gr.Row():
            with gr.Column(scale=1):
                input_image = gr.Image(label='Input image',
                                       elem_classes=['image-container'])

                examples = gr.Examples(examples=e2e_img_example,
                                       inputs=input_image,
                                       label='Examples')
                downstream = gr.Button('Run')

            with gr.Column(scale=1):
                img_mask = gr.Image(label='mask',
                                    interactive=False,
                                    elem_classes=['image-container'])
                img_output = gr.Image(label=' ',
                                      interactive=False,
                                      elem_classes=['image-container'])

                output = gr.Textbox(label='Result')
                confidence = gr.Textbox(label='Latency')

        downstream.click(fn=main,
                         inputs=[
                             input_image,
                         ],
                         outputs=[
                             output,
                             confidence,
                             img_output,
                             img_mask,
                         ])

    demo.launch(share=True)
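The script above wires the OpenOCR end-to-end pipeline into a Gradio UI. As a minimal sketch, the same pipeline can be driven without the UI using only the calls that app.py itself makes (OpenOCR, check_and_download_font, and text_sys(img_numpy=..., return_mask=True)); the input path 'example.jpg' below is a placeholder, not a file shipped with the Space.

# Minimal sketch (assumptions: an image at ./example.jpg and the repository
# layout used above); reuses only calls that appear in app.py.
import json

import cv2

from tools.infer_e2e import OpenOCR, check_and_download_font

text_sys = OpenOCR(drop_score=0.01)
check_and_download_font('./simfang.ttf')

img = cv2.imread('example.jpg')  # BGR array, matching what main() feeds the pipeline
results, time_dict, mask = text_sys(img_numpy=img, return_mask=True)
print(json.dumps(results[0], ensure_ascii=False))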
configs/det/dbnet/repvit_db.yml
ADDED
@@ -0,0 +1,173 @@
Global:
  device: gpu
  epoch_num: &epoch_num 500
  log_smooth_window: 20
  print_batch_step: 100
  save_model_dir: ./output/det_repsvtr_db
  save_epoch_step: 10
  eval_batch_step:
    - 0
    - 1000
  cal_metric_during_train: false
  checkpoints:
  pretrained_model: openocr_det_repvit_ch.pth
  save_inference_dir: null
  use_visualdl: false
  infer_img: ./testA
  save_res_path: ./checkpoints/det_db/predicts_db.txt
  distributed: true
  model_type: det

Architecture:
  algorithm: DB
  Backbone:
    name: RepSVTR_det
  Neck:
    name: RSEFPN
    out_channels: 96
    shortcut: True
  Head:
    name: DBHead
    k: 50

# Loss:
#   name: DBLoss
#   balance_loss: true
#   main_loss_type: DiceLoss
#   alpha: 5
#   beta: 10
#   ohem_ratio: 3

# Optimizer:
#   name: Adam
#   beta1: 0.9
#   beta2: 0.999
#   lr:
#     name: Cosine
#     learning_rate: 0.001 #(8*8c)
#     warmup_epoch: 2
#   regularizer:
#     name: L2
#     factor: 5.0e-05

PostProcess:
  name: DBPostProcess
  thresh: 0.3
  box_thresh: 0.4
  max_candidates: 1000
  unclip_ratio: 1.5
  score_mode: 'slow'

# Metric:
#   name: DetMetric
#   main_indicator: hmean

# Train:
#   dataset:
#     name: SimpleDataSet
#     data_dir: ./train_data/icdar2015/text_localization/
#     label_file_list:
#       - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt
#     ratio_list: [1.0]
#     transforms:
#       - DecodeImage:
#           img_mode: BGR
#           channel_first: false
#       - DetLabelEncode: null
#       - CopyPaste: null
#       - IaaAugment:
#           augmenter_args:
#             - type: Fliplr
#               args:
#                 p: 0.5
#             - type: Affine
#               args:
#                 rotate:
#                   - -10
#                   - 10
#             - type: Resize
#               args:
#                 size:
#                   - 0.5
#                   - 3
#       - EastRandomCropData:
#           size:
#             - 640
#             - 640
#           max_tries: 50
#           keep_ratio: true
#       - MakeBorderMap:
#           shrink_ratio: 0.4
#           thresh_min: 0.3
#           thresh_max: 0.7
#           total_epoch: *epoch_num
#       - MakeShrinkMap:
#           shrink_ratio: 0.4
#           min_text_size: 8
#           total_epoch: *epoch_num
#       - NormalizeImage:
#           scale: 1./255.
#           mean:
#             - 0.485
#             - 0.456
#             - 0.406
#           std:
#             - 0.229
#             - 0.224
#             - 0.225
#           order: hwc
#       - ToCHWImage: null
#       - KeepKeys:
#           keep_keys:
#             - image
#             - threshold_map
#             - threshold_mask
#             - shrink_map
#             - shrink_mask
#   loader:
#     shuffle: true
#     drop_last: false
#     batch_size_per_card: 8
#     num_workers: 8

Eval:
  dataset:
    name: SimpleDataSet
    data_dir: ./train_data/icdar2015/text_localization/
    label_file_list:
      - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt
    transforms:
      - DecodeImage:
          img_mode: BGR
          channel_first: false
      - DetLabelEncode: null
      - DetResizeForTest:
          # image_shape: [1280, 1280]
          # keep_ratio: True
          # padding: True
          limit_side_len: 960
          limit_type: max
      - NormalizeImage:
          scale: 1./255.
          mean:
            - 0.485
            - 0.456
            - 0.406
          std:
            - 0.229
            - 0.224
            - 0.225
          order: hwc
      - ToCHWImage: null
      - KeepKeys:
          keep_keys:
            - image
            - shape
            - polys
            - ignore_tags
  loader:
    shuffle: false
    drop_last: false
    batch_size_per_card: 1
    num_workers: 2
profiler_options: null
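Since these configs are plain YAML, a quick way to sanity-check one is to load it with PyYAML and read sections directly. A small sketch, assuming the file path shown in this diff and that PyYAML is installed:

# Sketch: inspect the DBPostProcess thresholds defined in the config above.
import yaml

with open('configs/det/dbnet/repvit_db.yml') as f:
    cfg = yaml.safe_load(f)

post = cfg['PostProcess']
print(post['name'], post['thresh'], post['box_thresh'], post['unclip_ratio'])
# -> DBPostProcess 0.3 0.4 1.5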
configs/rec/abinet/resnet45_trans_abinet_lang.yml
ADDED
@@ -0,0 +1,94 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/resnet45_trans_abinet_lang/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  # ./openocr_nolang_abinet_lang.pth
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_resnet45_trans_abinet_lang.txt
  grad_clip_val: 20
  use_amp: True

Optimizer:
  name: Adam
  lr: 0.000267
  weight_decay: 0.0
  filter_bias_and_bn: False

LRScheduler:
  name: MultiStepLR
  milestones: [12]
  gamma: 0.1

Architecture:
  model_type: rec
  algorithm: ABINet
  Transform:
  Encoder:
    name: ResNet45
    in_channels: 3
    strides: [2, 1, 2, 1, 1]
  Decoder:
    name: ABINetDecoder
    iter_size: 3

Loss:
  name: ABINetLoss

PostProcess:
  name: ABINetLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ABINetLabelEncode:
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ABINetLabelEncode:
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
configs/rec/abinet/resnet45_trans_abinet_wo_lang.yml
ADDED
@@ -0,0 +1,93 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/resnet45_trans_abinet_wo_lang/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_resnet45_trans_abinet_wo_lang.txt
  grad_clip_val: 20
  use_amp: True

Optimizer:
  name: Adam
  lr: 0.000267
  weight_decay: 0.0
  filter_bias_and_bn: False

LRScheduler:
  name: MultiStepLR
  milestones: [12]
  gamma: 0.1

Architecture:
  model_type: rec
  algorithm: ABINet
  Transform:
  Encoder:
    name: ResNet45
    in_channels: 3
    strides: [2, 1, 2, 1, 1]
  Decoder:
    name: ABINetDecoder
    iter_size: 0

Loss:
  name: ABINetLoss

PostProcess:
  name: ABINetLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ABINetLabelEncode:
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ABINetLabelEncode:
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
configs/rec/abinet/svtrv2_abinet_lang.yml
ADDED
@@ -0,0 +1,130 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtrv2_abinet_lang/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  # ./openocr_svtrv2_nolang_abinet_lang.pth
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_abinet_lang.txt
  use_amp: True
  grad_clip_val: 20

Optimizer:
  name: AdamW
  lr: 0.00065 # for 4gpus bs256/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: ABINet
  Transform:
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: True
  Decoder:
    name: ABINetDecoder
    iter_size: 3
    num_layers: 0

Loss:
  name: ABINetLoss

PostProcess:
  name: ABINetLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: false
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_train_easy',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ABINetLabelEncode:
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: &bs 256
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 4
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: False
    data_dir_list: [
                    '../evaluation/CUTE80',
                    '../evaluation/IC13_857',
                    '../evaluation/IC15_1811',
                    '../evaluation/IIIT5k',
                    '../evaluation/SVT',
                    '../evaluation/SVTP',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ABINetLabelEncode:
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: *bs
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: *bs
    max_ratio: *max_ratio
    num_workers: 4
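The SVTRv2-based recognition configs above rely on YAML anchors and aliases (first_bs: &bs 256 with batch_size_per_card: *bs, and max_ratio: &max_ratio 4 with *max_ratio) to keep the sampler and loader batch settings in sync. A short sketch of how a standard YAML loader resolves them, assuming the file path shown in this diff:

# Sketch: anchors and aliases resolve to the same value on load, which is what
# keeps the sampler and loader settings of these configs consistent.
import yaml

with open('configs/rec/abinet/svtrv2_abinet_lang.yml') as f:
    cfg = yaml.safe_load(f)

train = cfg['Train']
assert train['sampler']['first_bs'] == train['loader']['batch_size_per_card'] == 256
assert train['loader']['max_ratio'] == cfg['Eval']['loader']['max_ratio'] == 4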
configs/rec/abinet/svtrv2_abinet_wo_lang.yml
ADDED
@@ -0,0 +1,128 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtrv2_abinet_wo_lang/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_abinet_wo_lang.txt
  use_amp: True
  grad_clip_val: 20

Optimizer:
  name: AdamW
  lr: 0.00065 # for 4gpus bs256/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: ABINet
  Transform:
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: True
  Decoder:
    name: ABINetDecoder
    iter_size: 0
    num_layers: 0
Loss:
  name: ABINetLoss

PostProcess:
  name: ABINetLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: false
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_train_easy',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ABINetLabelEncode:
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: &bs 256
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 4
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: False
    data_dir_list: [
                    '../evaluation/CUTE80',
                    '../evaluation/IC13_857',
                    '../evaluation/IC15_1811',
                    '../evaluation/IIIT5k',
                    '../evaluation/SVT',
                    '../evaluation/SVTP',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ABINetLabelEncode:
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: *bs
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: *bs
    max_ratio: *max_ratio
    num_workers: 4
configs/rec/aster/resnet31_lstm_aster_tps_on.yml
ADDED
@@ -0,0 +1,93 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/resnet31_lstm_aster_tps_on
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: 25
  use_space_char: False
  save_res_path: ./output/rec/predicts_aster_tps.txt
  use_amp: True
  grad_clip_val: 1.0

Optimizer:
  name: Adam
  lr: 0.002 # for 1gpus bs1024/gpu
  weight_decay: 0.0
  filter_bias_and_bn: False

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: aster
  Transform:
    name: Aster_TPS
    tps_inputsize: [32, 64]
    tps_outputsize: [32, 128]
  Encoder:
    name: ResNet_ASTER
  Decoder:
    name: ASTERDecoder

Loss:
  name: ARLoss

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

PostProcess:
  name: ARLabelDecode

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ARLabelEncode: # Class handling label
      - RecTVResize:
          image_shape: [64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 1024
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ARLabelEncode: # Class handling label
      - RecTVResize:
          image_shape: [64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
configs/rec/aster/svtrv2_aster.yml
ADDED
@@ -0,0 +1,127 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtrv2_aster
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_aster.txt
  use_amp: True

Optimizer:
  name: AdamW
  lr: 0.00065 # for 4gpus bs256/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: aster
  Transform:
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    out_channels: 256
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: False
  Decoder:
    name: ASTERDecoder

Loss:
  name: ARLoss

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

PostProcess:
  name: ARLabelDecode

Train:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: false
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_train_easy',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ARLabelEncode: # Class handling label
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: &bs 256
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 4
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: False
    data_dir_list: [
                    '../evaluation/CUTE80',
                    '../evaluation/IC13_857',
                    '../evaluation/IC15_1811',
                    '../evaluation/IIIT5k',
                    '../evaluation/SVT',
                    '../evaluation/SVTP',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ARLabelEncode: # Class handling label
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: *bs
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: *bs
    max_ratio: *max_ratio
    num_workers: 4
configs/rec/aster/svtrv2_aster_tps_on.yml
ADDED
@@ -0,0 +1,102 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtrv2_aster_tps_on
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_aster_tps_on.txt
  use_amp: True

Optimizer:
  name: AdamW
  lr: 0.00065 # for 4gpus bs256/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: aster
  Transform:
    name: Aster_TPS
    tps_inputsize: [32, 64]
    tps_outputsize: [32, 128]
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    out_channels: 256
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: False
  Decoder:
    name: ASTERDecoder

Loss:
  name: ARLoss

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

PostProcess:
  name: ARLabelDecode

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ARLabelEncode: # Class handling label
      - RecTVResize:
          image_shape: [64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ARLabelEncode: # Class handling label
      - RecTVResize:
          image_shape: [64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
configs/rec/autostr/autostr_lstm_aster_tps_on.yml
ADDED
@@ -0,0 +1,95 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/autostr_lstm_aster_tps_on
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_autostr_lstm_aster_tps_on.txt
  use_amp: True
  grad_clip_val: 1.0

Optimizer:
  name: Adam
  lr: 0.002 # for 4gpus bs256/gpu
  weight_decay: 0.0
  filter_bias_and_bn: False

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: autostr
  Transform:
    name: Aster_TPS
    tps_inputsize: [32, 64]
    tps_outputsize: [32, 128]
  Encoder:
    name: AutoSTREncoder
    stride_stages: '[(2, 2), (2, 1), (2, 2), (2, 1), (2, 1)]'
    conv_op_ids: [2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 4, 1, 1, 6, 6]
  Decoder:
    name: ASTERDecoder

Loss:
  name: ARLoss

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

PostProcess:
  name: ARLabelDecode

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ARLabelEncode: # Class handling label
      - RecTVResize:
          image_shape: [64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ARLabelEncode: # Class handling label
      - RecTVResize:
          image_shape: [64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
configs/rec/busnet/svtrv2_busnet.yml
ADDED
@@ -0,0 +1,135 @@
Global:
  device: gpu
  epoch_num: 10
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtrv2_busnet/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  # ./output/rec/u14m_filter/svtrv2_busnet_pretraining/best.pth
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_busnet.txt
  use_amp: True

Optimizer:
  name: AdamW
  lr: 0.00065 # 4gpus bs256/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: BUSBet
  Transform:
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: False
  Decoder:
    name: BUSDecoder
    nhead: 6
    num_layers: 6
    dim_feedforward: 1536
    ignore_index: &ignore_index 100
    pretraining: False
    # return_id: 2
Loss:
  name: ABINetLoss
  ignore_index: *ignore_index

PostProcess:
  name: ABINetLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: false
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_train_easy',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ABINetLabelEncode:
          ignore_index: *ignore_index
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: &bs 256
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 4
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: False
    data_dir_list: [
                    '../evaluation/CUTE80',
                    '../evaluation/IC13_857',
                    '../evaluation/IC15_1811',
                    '../evaluation/IIIT5k',
                    '../evaluation/SVT',
                    '../evaluation/SVTP',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ABINetLabelEncode:
          ignore_index: *ignore_index
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: *bs
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: *bs
    max_ratio: *max_ratio
    num_workers: 4
configs/rec/busnet/svtrv2_busnet_pretraining.yml
ADDED
@@ -0,0 +1,134 @@
Global:
  device: gpu
  epoch_num: 10
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtrv2_busnet_pretraining/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_busnet_pretraining.txt
  use_amp: True

Optimizer:
  name: AdamW
  lr: 0.00065 # 4gpus bs256/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: BUSBet
  Transform:
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: False
  Decoder:
    name: BUSDecoder
    nhead: 6
    num_layers: 6
    dim_feedforward: 1536
    ignore_index: &ignore_index 100
    pretraining: True
    # return_id: 0
Loss:
  name: ABINetLoss
  ignore_index: *ignore_index

PostProcess:
  name: ABINetLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: false
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_train_easy',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ABINetLabelEncode:
          ignore_index: *ignore_index
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: &bs 256
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 4
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: False
    data_dir_list: [
                    '../evaluation/CUTE80',
                    '../evaluation/IC13_857',
                    '../evaluation/IC15_1811',
                    '../evaluation/IIIT5k',
                    '../evaluation/SVT',
                    '../evaluation/SVTP',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ABINetLabelEncode:
          ignore_index: *ignore_index
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: *bs
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: *bs
    max_ratio: *max_ratio
    num_workers: 4
configs/rec/busnet/vit_busnet.yml
ADDED
@@ -0,0 +1,104 @@
Global:
  device: gpu
  epoch_num: 10
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/vit_busnet/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_vit_busnet.txt
  grad_clip_val: 20
  use_amp: True

Optimizer:
  name: Adam
  lr: 0.00053 # 4gpus bs256/gpu
  weight_decay: 0.0
  filter_bias_and_bn: False

LRScheduler:
  name: MultiStepLR
  milestones: [6]
  gamma: 0.1

Architecture:
  model_type: rec
  algorithm: BUSBet
  Transform:
  Encoder:
    name: ViT
    img_size: [32,128]
    patch_size: [4, 8]
    embed_dim: 384
    depth: 12
    num_heads: 6
    mlp_ratio: 4
    qkv_bias: True
  Decoder:
    name: BUSDecoder
    nhead: 6
    num_layers: 6
    dim_feedforward: 1536
    ignore_index: &ignore_index 100
    pretraining: False
Loss:
  name: ABINetLoss
  ignore_index: *ignore_index

PostProcess:
  name: ABINetLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ABINetLabelEncode:
          ignore_index: *ignore_index
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ABINetLabelEncode:
          ignore_index: *ignore_index
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
configs/rec/busnet/vit_busnet_pretraining.yml
ADDED
@@ -0,0 +1,104 @@
Global:
  device: gpu
  epoch_num: 10
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/vit_busnet_pretraining/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_vit_busnet_pretraining.txt
  grad_clip_val: 20
  use_amp: True

Optimizer:
  name: Adam
  lr: 0.00053 # 4gpus bs256/gpu
  weight_decay: 0.0
  filter_bias_and_bn: False

LRScheduler:
  name: MultiStepLR
  milestones: [6]
  gamma: 0.1

Architecture:
  model_type: rec
  algorithm: BUSBet
  Transform:
  Encoder:
    name: ViT
    img_size: [32,128]
    patch_size: [4, 8]
    embed_dim: 384
    depth: 12
    num_heads: 6
    mlp_ratio: 4
    qkv_bias: True
  Decoder:
    name: BUSDecoder
    nhead: 6
    num_layers: 6
    dim_feedforward: 1536
    ignore_index: &ignore_index 100
    pretraining: True
Loss:
  name: ABINetLoss
  ignore_index: *ignore_index

PostProcess:
  name: ABINetLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ABINetLabelEncode:
          ignore_index: *ignore_index
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ABINetLabelEncode:
          ignore_index: *ignore_index
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
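The lr comments throughout these configs (e.g. "lr: 0.00053 # 4gpus bs256/gpu") record the global batch size the value was tuned for. Below is a hedged sketch of the linear scaling rule one might apply when training with a different number of GPUs; the rule itself is an assumption on my part, the configs do not rescale anything automatically.

def scale_lr(base_lr: float, base_global_bs: int, new_global_bs: int) -> float:
    # linear scaling rule (an assumption, not something these configs enforce)
    return base_lr * new_global_bs / base_global_bs

# lr: 0.00053 was noted for 4 GPUs x 256 images/GPU = 1024 images per step
print(scale_lr(0.00053, 4 * 256, 2 * 256))  # e.g. 2 GPUs -> 0.000265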
configs/rec/cam/convnextv2_cam_tps_on.yml
ADDED
@@ -0,0 +1,118 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/convnextv2_cam_tps_on
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: False
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: &max_text_length 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_convnextv2_cam_tps_on.txt
  use_amp: True

Optimizer:
  name: AdamW
  lr: 0.0008 # for 4gpus bs256/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True
  eps: 1.e-8

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 : 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: CAM
  Transform:
    name: Aster_TPS
    tps_inputsize: [32, 64]
    tps_outputsize: &img_shape [32, 128]
  Encoder:
    name: CAMEncoder
    encoder_config:
      name: ConvNeXtV2
      depths: [2, 2, 8, 2]
      dims: [80, 160, 320, 640]
      strides: [[4,4], [2,1], [2,1], [1,1]]
      drop_path_rate: 0.2
      feat2d: True
    nb_classes: 97
    strides: [[4,4], [2,1], [2,1], [1,1]]
    deform_stride: 2
    stage_idx: 2
    use_depthwise_unet: True
    use_more_unet: False
    binary_loss_type: BanlanceMultiClassCrossEntropyLoss
    mid_size: True
    d_embedding: 384
  Decoder:
    name: CAMDecoder
    num_encoder_layers: -1
    beam_size: 0
    num_decoder_layers: 2
    nhead: 8
    max_len: *max_text_length

Loss:
  name: CAMLoss
  loss_weight_binary: 1.5
  label_smoothing: 0.

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

PostProcess:
  name: ARLabelDecode

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - CAMLabelEncode: # Class handling label
          font_path: ./arial.ttf
          image_shape: *img_shape
      - RecTVResize:
          image_shape: [64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length', 'binary_mask'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ARLabelEncode: # Class handling label
      - RecTVResize:
          image_shape: [64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
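The LRScheduler comment above ("warmup_epoch: 1.5 # pct_start 0.075*20 : 1.5ep") maps warmup epochs onto OneCycleLR's pct_start as warmup_epoch / epoch_num. A short sketch with torch.optim.lr_scheduler.OneCycleLR; how OpenOCR's trainer wires this internally is assumed, not shown.

import torch

model = torch.nn.Linear(10, 10)
opt = torch.optim.AdamW(model.parameters(), lr=0.0008, weight_decay=0.05)

epochs, steps_per_epoch, warmup_epoch = 20, 1000, 1.5
sched = torch.optim.lr_scheduler.OneCycleLR(
    opt,
    max_lr=0.0008,
    total_steps=epochs * steps_per_epoch,
    pct_start=warmup_epoch / epochs,   # 1.5 / 20 = 0.075
    cycle_momentum=False,              # matches cycle_momentum: False
)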
configs/rec/cam/convnextv2_tiny_cam_tps_on.yml
ADDED
@@ -0,0 +1,118 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/convnextv2_tiny_cam_tps_on
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: False
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: &max_text_length 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_convnextv2_cam_tps_on.txt
  use_amp: True

Optimizer:
  name: AdamW
  lr: 0.0008 # for 4gpus bs256/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True
  eps: 1.e-8

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 : 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: CAM
  Transform:
    name: Aster_TPS
    tps_inputsize: [32, 64]
    tps_outputsize: &img_shape [32, 128]
  Encoder:
    name: CAMEncoder
    encoder_config:
      name: ConvNeXtV2
      depths: [3, 3, 9, 3]
      dims: [96, 192, 384, 768]
      strides: [[4,4], [2,1], [2,1], [1,1]]
      drop_path_rate: 0.2
      feat2d: True
    nb_classes: 97
    strides: [[4,4], [2,1], [2,1], [1,1]]
    deform_stride: 2
    stage_idx: 2
    use_depthwise_unet: True
    use_more_unet: False
    binary_loss_type: BanlanceMultiClassCrossEntropyLoss
    mid_size: False
    d_embedding: 512
  Decoder:
    name: CAMDecoder
    num_encoder_layers: -1
    beam_size: 0
    num_decoder_layers: 2
    nhead: 8
    max_len: *max_text_length

Loss:
  name: CAMLoss
  loss_weight_binary: 1.5
  label_smoothing: 0.

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

PostProcess:
  name: ARLabelDecode

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - CAMLabelEncode: # Class handling label
          font_path: ./arial.ttf
          image_shape: *img_shape
      - RecTVResize:
          image_shape: [64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length', 'binary_mask'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ARLabelEncode: # Class handling label
      - RecTVResize:
          image_shape: [64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
configs/rec/cam/svtrv2_cam_tps_on.yml
ADDED
@@ -0,0 +1,123 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtrv2_cam_tps_on
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: False
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: &max_text_length 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_cam_tps_on.txt
  use_amp: True

Optimizer:
  name: AdamW
  lr: 0.00065 # for 4gpus bs256/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 : 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: CAM
  Transform:
    name: Aster_TPS
    tps_inputsize: [32, 64]
    tps_outputsize: &img_shape [32, 128]
  Encoder:
    name: CAMEncoder
    encoder_config:
      name: SVTRv2LNConvTwo33
      use_pos_embed: False
      dims: [128, 256, 384]
      depths: [6, 6, 6]
      num_heads: [4, 8, 12]
      mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
      local_k: [[5, 5], [5, 5], [-1, -1]]
      sub_k: [[1, 1], [2, 1], [-1, -1]]
      last_stage: false
      feat2d: True
    nb_classes: 97
    strides: [[4, 4], [1, 1], [2, 1], [1, 1]]
    k_size: [[2, 2], [1, 1], [2, 1], [1, 1]]
    q_size: [4, 32]
    deform_stride: 2
    stage_idx: 2
    use_depthwise_unet: True
    use_more_unet: False
    binary_loss_type: BanlanceMultiClassCrossEntropyLoss
    mid_size: True
    d_embedding: 384
  Decoder:
    name: CAMDecoder
    num_encoder_layers: -1
    beam_size: 0
    num_decoder_layers: 2
    nhead: 8
    max_len: *max_text_length

Loss:
  name: CAMLoss
  loss_weight_binary: 1.5
  label_smoothing: 0.

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

PostProcess:
  name: ARLabelDecode

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - CAMLabelEncode: # Class handling label
          font_path: ./arial.ttf
          image_shape: *img_shape
      - RecTVResize:
          image_shape: [64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length', 'binary_mask'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ARLabelEncode: # Class handling label
      - RecTVResize:
          image_shape: [64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
configs/rec/cdistnet/resnet45_trans_cdistnet.yml
ADDED
@@ -0,0 +1,93 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/resnet45_trans_cdistnet
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_resnet45_trans_cdistnet.txt
  use_amp: True
  grad_clip_val: 5

Optimizer:
  name: Adam
  lr: 0.002 # for 4gpus bs256/gpu
  weight_decay: 0.0
  filter_bias_and_bn: False

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: CDistNet
  Transform:
  Encoder:
    name: ResNet45
    in_channels: 3
    strides: [2, 1, 2, 1, 1]
  Decoder:
    name: CDistNetDecoder
    add_conv: True

Loss:
  name: ARLoss

PostProcess:
  name: ARLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ARLabelEncode: # Class handling label
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ARLabelEncode: # Class handling label
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
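use_amp: True and grad_clip_val: 5 in the config above correspond to mixed-precision training with gradient-norm clipping. A generic PyTorch sketch of that combination, assumed to approximate (not reproduce) the OpenOCR training step:

import torch

def train_step(model, images, labels, loss_fn, optimizer, scaler, grad_clip_val=5.0):
    optimizer.zero_grad(set_to_none=True)
    with torch.cuda.amp.autocast():                        # use_amp: True
        loss = loss_fn(model(images), labels)
    scaler.scale(loss).backward()
    scaler.unscale_(optimizer)                             # unscale before clipping
    torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip_val)  # grad_clip_val: 5
    scaler.step(optimizer)
    scaler.update()
    return loss.item()

# usage: scaler = torch.cuda.amp.GradScaler(); call train_step(...) once per batch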
configs/rec/cdistnet/svtrv2_cdistnet.yml
ADDED
@@ -0,0 +1,139 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtrv2_cdistnet/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
  # ./tools/utils/ppocr_keys_v1.txt # ch
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_cdistnet.txt
  use_amp: True

Optimizer:
  name: AdamW
  lr: 0.00065 #4gpus bs256/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: CDistNet
  in_channels: 3
  Transform:
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    out_channels: 256
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: True
  Decoder:
    name: CDistNetDecoder
    add_conv: False
    num_encoder_blocks: 0

Loss:
  name: ARLoss

PostProcess:
  name: ARLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: false
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_train_easy',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ARLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be divided by downsampling multiple
    first_bs: &bs 256
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 4
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: False
    data_dir_list: [
      '../evaluation/CUTE80',
      '../evaluation/IC13_857',
      '../evaluation/IC15_1811',
      '../evaluation/IIIT5k',
      '../evaluation/SVT',
      '../evaluation/SVTP',
      ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ARLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be divided by downsampling multiple
    first_bs: *bs
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: *bs
    max_ratio: *max_ratio
    num_workers: 4
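The RatioSampler block above buckets samples by aspect ratio, caps the ratio at max_ratio, and keeps the resized width divisible by divided_factor so the backbone's downsampling divides evenly. The sketch below only illustrates that sizing rule; the function name and exact rounding are my assumptions, not the RatioSampler implementation.

import math

def target_size(img_w, img_h, base_h=32, divided_factor_w=4, max_ratio=4):
    ratio = min(max(img_w / img_h, 1.0), max_ratio)        # clamp aspect ratio at max_ratio
    w = int(math.ceil(base_h * ratio / divided_factor_w) * divided_factor_w)
    return w, base_h                                        # width snapped to a multiple of 4

print(target_size(120, 40))   # -> (96, 32)
print(target_size(800, 40))   # -> (128, 32), capped at max_ratio 4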
configs/rec/cppd/svtr_base_cppd.yml
ADDED
@@ -0,0 +1,123 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtr_base_cppd/
  save_epoch_step: 1
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 500]
  eval_epoch_step: [0, 1]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path
  # ./tools/utils/EN_symbol_dict.txt # 96en
  # ./tools/utils/ppocr_keys_v1.txt # ch
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/u14m_filter/predicts_svtr_base_cppd.txt
  use_amp: True

Optimizer:
  name: AdamW
  lr: 0.00065 # for 4gpus bs256/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: CPPD
  in_channels: 3
  Transform:
  Encoder:
    name: SVTRNet
    img_size: [32, 128]
    out_char_num: 25
    out_channels: 256
    patch_merging: 'Conv'
    embed_dim: [128, 256, 384]
    depth: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
    local_mixer: [[5, 5], [5, 5], [5, 5]]
    last_stage: False
    prenorm: True
  Decoder:
    name: CPPDDecoder
    vis_seq: 64
    num_layer: 2
    pos_len: False
    rec_layer: 1


Loss:
  name: CPPDLoss
  ignore_index: 100
  smoothing: True
  pos_len: False
  sideloss_weight: 1.0

PostProcess:
  name: CPPDLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char

Metric:
  name: RecMetric
  main_indicator: acc

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - CPPDLabelEncode: # Class handling label
          pos_len: False
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation/
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - CPPDLabelEncode: # Class handling label
          pos_len: False
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 128
    num_workers: 4
configs/rec/cppd/svtr_base_cppd_ch.yml
ADDED
@@ -0,0 +1,126 @@
Global:
  device: gpu
  epoch_num: 100
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/ch/svtr_base_cppd/
  save_epoch_step: 1
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 2000]
  eval_epoch_step: [0, 1]
  cal_metric_during_train: False
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/ppocr_keys_v1.txt
  # ./tools/utils/EN_symbol_dict.txt # 96en
  # ./tools/utils/ppocr_keys_v1.txt # ch
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/ch/predicts_svtr_base_cppd.txt
  use_amp: True

Optimizer:
  name: AdamW
  lr: 0.0005 # for 4gpus bs128/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: CosineAnnealingLR
  warmup_epoch: 5

Architecture:
  model_type: rec
  algorithm: CPPD
  in_channels: 3
  Transform:
  Encoder:
    name: SVTRNet
    img_size: [32, 256]
    patch_merging: 'Conv'
    embed_dim: [128, 256, 384]
    depth: [6, 6, 4]
    num_heads: [4, 8, 12]
    mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
    local_mixer: [[5, 5], [5, 5], [5, 5]]
    last_stage: False
    prenorm: True
  Decoder:
    name: CPPDDecoder
    vis_seq: 128
    num_layer: 3
    pos_len: False
    rec_layer: 1
    ch: True


Loss:
  name: CPPDLoss
  ignore_index: 7000
  smoothing: True
  pos_len: False
  sideloss_weight: 1.0

PostProcess:
  name: CPPDLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char

Metric:
  name: RecMetric
  main_indicator: acc

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../benchmark_bctr/benchmark_bctr_train
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CPPDLabelEncode: # Class handling label
          pos_len: False
          ch: True
          ignore_index: 7000
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - SVTRResize:
          image_shape: [3, 32, 256]
          padding: True
      - KeepKeys:
          keep_keys: ['image', 'label', 'label_node', 'label_index', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 128
    drop_last: True
    num_workers: 8

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../benchmark_bctr/benchmark_bctr_test/scene_test
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CPPDLabelEncode: # Class handling label
          pos_len: False
          ch: True
          ignore_index: 7000
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - SVTRResize:
          image_shape: [3, 32, 256]
          padding: True
      - KeepKeys:
          keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 4
configs/rec/cppd/svtr_base_cppd_h8.yml
ADDED
@@ -0,0 +1,123 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtr_base_h8_cppd/
  save_epoch_step: 1
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 500]
  eval_epoch_step: [0, 1]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
  # ./tools/utils/ppocr_keys_v1.txt # ch
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/u14m_filter/predicts_svtr_base_cppd.txt
  use_amp: True

Optimizer:
  name: AdamW
  lr: 0.00065 # for 4gpus bs256/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: CPPD
  in_channels: 3
  Transform:
  Encoder:
    name: SVTRNet
    img_size: [32, 128]
    out_char_num: 25
    out_channels: 256
    patch_merging: 'Conv'
    embed_dim: [128, 256, 384]
    depth: [6, 6, 6]
    num_heads: [4, 8, 12]
    sub_k: [[1, 1], [2, 1]]
    mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
    local_mixer: [[5, 5], [5, 5], [5, 5]]
    last_stage: False
    prenorm: True
  Decoder:
    name: CPPDDecoder
    vis_seq: 128
    num_layer: 2
    pos_len: False
    rec_layer: 1

Loss:
  name: CPPDLoss
  ignore_index: 100
  smoothing: True
  pos_len: False
  sideloss_weight: 1.0

PostProcess:
  name: CPPDLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - CPPDLabelEncode: # Class handling label
          pos_len: False
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation/
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - CPPDLabelEncode: # Class handling label
          pos_len: False
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 128
    num_workers: 4
configs/rec/cppd/svtr_base_cppd_syn.yml
ADDED
@@ -0,0 +1,124 @@
Global:
  device: gpu
  epoch_num: 60
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/syn/svtr_base_cppd/
  save_epoch_step: 1
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 500]
  eval_epoch_step: [0, 1]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path
  # ./tools/utils/EN_symbol_dict.txt # 96en
  # ./tools/utils/ppocr_keys_v1.txt # ch
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/syn/predicts_svtr_base_cppd.txt
  use_amp: True

Optimizer:
  name: AdamW
  lr: 0.0005 # for 4gpus bs256/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: CosineAnnealingLR
  warmup_epoch: 6

Architecture:
  model_type: rec
  algorithm: CPPD
  in_channels: 3
  Transform:
  Encoder:
    name: SVTRNet
    img_size: [32, 100]
    out_char_num: 25
    out_channels: 256
    patch_merging: 'Conv'
    embed_dim: [128, 256, 384]
    depth: [6, 6, 4]
    num_heads: [4, 8, 12]
    mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
    local_mixer: [[5, 5], [5, 5], [5, 5]]
    last_stage: False
    prenorm: True
  Decoder:
    name: CPPDDecoder
    vis_seq: 50
    num_layer: 3
    pos_len: False
    rec_layer: 1


Loss:
  name: CPPDLoss
  ignore_index: 100
  smoothing: True
  pos_len: False
  sideloss_weight: 1.0

PostProcess:
  name: CPPDLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char

Metric:
  name: RecMetric
  main_indicator: acc

Train:
  dataset:
    name: STRLMDBDataSet
    data_dir: ./
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      # - SVTRRAug:
      - CPPDLabelEncode: # Class handling label
          pos_len: False
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - SVTRResize:
          image_shape: [3, 32, 100]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 8

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation/
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CPPDLabelEncode: # Class handling label
          pos_len: False
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - SVTRResize:
          image_shape: [3, 32, 100]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 4
configs/rec/cppd/svtrv2_cppd.yml
ADDED
@@ -0,0 +1,150 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtrv2_cppd/
  save_epoch_step: 1
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 500]
  eval_epoch_step: [0, 1]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
  # ./tools/utils/ppocr_keys_v1.txt # ch
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_cppd.txt
  use_amp: True

Optimizer:
  name: AdamW
  lr: 0.00065 # for 4gpus bs256/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: CPPD
  in_channels: 3
  Transform:
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    out_channels: 256
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: False
  Decoder:
    name: CPPDDecoder
    ds: True
    num_layer: 2
    pos_len: False
    rec_layer: 1


Loss:
  name: CPPDLoss
  ignore_index: 100
  smoothing: True
  pos_len: False
  sideloss_weight: 1.0

PostProcess:
  name: CPPDLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: false
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_train_easy',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - CPPDLabelEncode: # Class handling label
          pos_len: False
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be divided by downsampling multiple
    first_bs: &bs 256
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 4
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: False
    data_dir_list: [
      '../evaluation/CUTE80',
      '../evaluation/IC13_857',
      '../evaluation/IC15_1811',
      '../evaluation/IIIT5k',
      '../evaluation/SVT',
      '../evaluation/SVTP',
      ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - CPPDLabelEncode: # Class handling label
          pos_len: False
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be divided by downsampling multiple
    first_bs: *bs
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: *bs
    max_ratio: *max_ratio
    num_workers: 4
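character_dict_path points at a plain text charset (one symbol per line) that the *LabelEncode / *LabelDecode steps above turn into index tables. A small sketch of loading such a dict; the special-token handling inside CPPDLabelEncode is assumed, not reproduced.

def load_charset(dict_path: str):
    # one symbol per line; the position in this list is the label id used by the encoders
    with open(dict_path, 'r', encoding='utf-8') as f:
        chars = [line.rstrip('\r\n') for line in f]
    char_to_id = {c: i for i, c in enumerate(chars)}
    return chars, char_to_id

# chars, char_to_id = load_charset('./tools/utils/EN_symbol_dict.txt')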
configs/rec/dan/resnet45_fpn_dan.yml
ADDED
@@ -0,0 +1,98 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/resnet45_fpn_dan/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_resnet45_fpn_dan.txt
  use_amp: True
  grad_clip_val: 20

Optimizer:
  name: Adam
  lr: 0.00065 # for 4gpus bs256/gpu
  weight_decay: 0.0
  filter_bias_and_bn: False

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: DAN
  Transform:
  Encoder:
    name: ResNet45
    in_channels: 3
    strides: [2, 1, 2, 1, 1]
    return_list: True
  Decoder:
    name: DANDecoder
    max_len: 25
    channels_list: [64, 128, 256, 512]
    strides_list: [[2, 2], [1, 1], [1, 1]]
    in_shape: [8, 32]
    depth: 4

Loss:
  name: ARLoss

PostProcess:
  name: ARLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ARLabelEncode:
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ARLabelEncode:
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
configs/rec/dan/svtrv2_dan.yml
ADDED
@@ -0,0 +1,130 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtrv2_dan
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_dan.txt
  use_amp: True
  grad_clip_val: 20

Optimizer:
  name: AdamW
  lr: 0.00065 # 4gpus 256bs/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: DAN
  Transform:
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    out_channels: 256
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: True
  Decoder:
    name: DANDecoder
    use_cam: False
    max_len: 25

Loss:
  name: ARLoss

PostProcess:
  name: ARLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: false
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_filter_train_easy',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ARLabelEncode:
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be divided by downsampling multiple
    first_bs: &bs 256
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 4
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: False
    data_dir_list: [
      '../evaluation/CUTE80',
      '../evaluation/IC13_857',
      '../evaluation/IC15_1811',
      '../evaluation/IIIT5k',
      '../evaluation/SVT',
      '../evaluation/SVTP',
      ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ARLabelEncode:
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be divided by downsampling multiple
    first_bs: *bs
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: *bs
    max_ratio: *max_ratio
    num_workers: 4
configs/rec/focalsvtr/focalsvtr_ctc.yml
ADDED
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/focalsvtr_ctc/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path
  # ./tools/utils/EN_symbol_dict.txt
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/u14m_filter/predicts_focalsvtr_ctc.txt

Optimizer:
  name: AdamW
  lr: 0.00065 # for 4gpus bs256/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: SVTR
  Transform:
  Encoder:
    name: FocalSVTR
    img_size: [32, 128]
    depths: [6, 6, 6]
    embed_dim: 96
    sub_k: [[1, 1], [2, 1], [1, 1]]
    focal_levels: [3, 3, 3]
    out_channels: 256
    last_stage: True
  Decoder:
    name: CTCDecoder

Loss:
  name: CTCLoss
  zero_infinity: True

PostProcess:
  name: CTCLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSet
    ds_width: True
    padding: &padding False
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_train_easy',
                    ]
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - PARSeqAug:
      - CTCLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length']
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: &bs 256
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: 12
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSet
    ds_width: True
    padding: True
    data_dir_list: ['../evaluation/CUTE80',
                    '../evaluation/IC13_857',
                    '../evaluation/IC15_1811',
                    '../evaluation/IIIT5k',
                    '../evaluation/SVT',
                    '../evaluation/SVTP',
                    ]
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CTCLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length']
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: 128
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 128
    max_ratio: 12
    num_workers: 4
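This config pairs the FocalSVTR encoder with a plain CTC head (CTCDecoder, CTCLoss, CTCLabelDecode). As a reminder of what the CTC post-process conceptually does, here is a minimal sketch of greedy CTC decoding; it shows the standard collapse-repeats-then-drop-blank rule only, not OpenOCR's actual CTCLabelDecode, and the `charset` is a made-up example alphabet.

```python
# Minimal sketch of greedy CTC decoding: collapse repeats, then drop the blank.
# Standard CTC rule only, not OpenOCR's CTCLabelDecode; `charset` is hypothetical.
import numpy as np

def ctc_greedy_decode(logits: np.ndarray, charset: str, blank: int = 0) -> str:
    """logits: (T, C) per-timestep class scores; index `blank` is the CTC blank."""
    best = logits.argmax(axis=1)                  # best class per timestep
    collapsed = [k for i, k in enumerate(best)    # remove consecutive duplicates
                 if i == 0 or k != best[i - 1]]
    return ''.join(charset[k - 1] for k in collapsed if k != blank)

if __name__ == '__main__':
    charset = 'abc'
    T, C = 6, len(charset) + 1                    # +1 for the blank at index 0
    logits = np.zeros((T, C))
    for t, idx in enumerate([1, 1, 0, 2, 2, 3]):  # "aa_bbc" collapses to "abc"
        logits[t, idx] = 1.0
    print(ctc_greedy_decode(logits, charset))     # -> "abc"
```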
configs/rec/gtc/svtrv2_lnconv_nrtr_gtc.yml
ADDED
@@ -0,0 +1,168 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/svtrv2_lnconv_nrtr_gtc
  save_epoch_step: 1
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 500]
  eval_epoch_step: [0, 1]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img: ../ltb/img
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
  # ./tools/utils/ppocr_keys_v1.txt # ch
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/predicts_smtr.txt
  use_amp: True
  distributed: true

Optimizer:
  name: AdamW
  lr: 0.00065
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: BGPD
  in_channels: 3
  Transform:
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    out_channels: 256
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: True
  Decoder:
    name: GTCDecoder
    infer_gtc: True
    detach: False
    gtc_decoder:
      name: NRTRDecoder
      num_encoder_layers: -1
      beam_size: 0
      num_decoder_layers: 2
      nhead: 12
      max_len: *max_text_length
    ctc_decoder:
      name: RCTCDecoder

Loss:
  name: GTCLoss
  gtc_loss:
    name: ARLoss

PostProcess:
  name: GTCLabelDecode
  gtc_label_decode:
    name: ARLabelDecode
    character_dict_path: *character_dict_path
    use_space_char: *use_space_char

Metric:
  name: RecGTCMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSet
    ds_width: True
    # max_ratio: &max_ratio 4
    # min_ratio: 1
    # base_shape: &base_shape [[64, 64], [96, 48], [112, 40], [128, 32]]
    # base_h: &base_h 32
    # padding: &padding False
    padding: false
    # padding_rand: true
    # padding_doub: true
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_train_easy',
                    ]
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - PARSeqAug:
      - GTCLabelEncode: # Class handling label
          gtc_label_encode:
            name: ARLabelEncode
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length', 'ctc_label', 'ctc_length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: &bs 256
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 4
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSet
    ds_width: True
    padding: False
    data_dir_list: [
        '../evaluation/CUTE80',
        '../evaluation/IC13_857',
        '../evaluation/IC15_1811',
        '../evaluation/IIIT5k',
        '../evaluation/SVT',
        '../evaluation/SVTP',
        ]
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - GTCLabelEncode: # Class handling label
          gtc_label_encode:
            name: ARLabelEncode
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length', 'ctc_label', 'ctc_length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: *bs
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: *bs
    max_ratio: *max_ratio
    num_workers: 4
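These configs lean heavily on YAML anchors and aliases (`&bs`/`*bs`, `&max_ratio`/`*max_ratio`, `&character_dict_path`/`*character_dict_path`) so that one value feeds both the Train and Eval sections. A small sketch of how the aliases resolve at load time, using a trimmed, hypothetical excerpt rather than the full config:

```python
# Sketch of how the `&anchor` / `*alias` pairs in these configs resolve.
# PyYAML expands aliases at load time, so Train and Eval see the same value.
# The snippet below is a trimmed, hypothetical excerpt, not the full config.
import yaml

snippet = """
Train:
  sampler:
    first_bs: &bs 256
  loader:
    batch_size_per_card: *bs
    max_ratio: &max_ratio 4
Eval:
  loader:
    batch_size_per_card: *bs
    max_ratio: *max_ratio
"""

cfg = yaml.safe_load(snippet)
assert cfg['Eval']['loader']['batch_size_per_card'] == 256
assert cfg['Eval']['loader']['max_ratio'] == 4
print(cfg['Eval']['loader'])
```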
configs/rec/gtc/svtrv2_lnconv_smtr_gtc_long_infer.yml
ADDED
@@ -0,0 +1,151 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/svtrv2_lnconv_smtr_gtc_long_infer
  save_epoch_step: 1
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 1000]
  eval_epoch_step: [0, 1]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img: ../ltb/img
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
  # ./tools/utils/ppocr_keys_v1.txt # ch
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/predicts_smtr.txt
  use_amp: True
  distributed: true

Optimizer:
  name: AdamW
  lr: 0.000325
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: BGPD
  in_channels: 3
  Transform:
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    out_channels: 256
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: True
  Decoder:
    name: GTCDecoder
    infer_gtc: False
    detach: False
    gtc_decoder:
      name: SMTRDecoder
      num_layer: 1
      ds: True
      max_len: *max_text_length
      next_mode: &next True
      sub_str_len: &subsl 5
    ctc_decoder:
      name: RCTCDecoder

Loss:
  name: CTCLoss

PostProcess:
  name: CTCLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: false
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_train_easy',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - CTCLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: &bs 128
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 12
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: False
    data_dir_list: [
        '../evaluation/CUTE80',
        '../evaluation/IC13_857',
        '../evaluation/IC15_1811',
        '../evaluation/IIIT5k',
        '../evaluation/SVT',
        '../evaluation/SVTP',
        ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - CTCLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: *bs
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: *bs
    max_ratio: *max_ratio
    num_workers: 4
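The `warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep` comments in these configs suggest the warmup is expressed as a fraction of the total epochs when the scheduler is built. A hedged sketch of wiring that into PyTorch's OneCycleLR, assuming `pct_start = warmup_epoch / epoch_num` as the comment implies; this mirrors the config values, not OpenOCR's actual trainer code, and `steps_per_epoch` is a placeholder.

```python
# Sketch: how `warmup_epoch` plausibly maps onto OneCycleLR's pct_start
# (pct_start = warmup_epoch / epoch_num, per the "0.075*20 = 1.5ep" comment).
# Illustrative only; not a copy of OpenOCR's trainer.
import torch

epoch_num, warmup_epoch = 20, 1.5
steps_per_epoch = 100                     # placeholder; depends on the dataset

model = torch.nn.Linear(8, 8)
opt = torch.optim.AdamW(model.parameters(), lr=0.000325, weight_decay=0.05)
sched = torch.optim.lr_scheduler.OneCycleLR(
    opt,
    max_lr=0.000325,
    total_steps=epoch_num * steps_per_epoch,
    pct_start=warmup_epoch / epoch_num,   # 0.075 -> 1.5 warmup epochs
    cycle_momentum=False,                 # as in the config
)
for _ in range(5):
    opt.step()
    sched.step()
print(sched.get_last_lr())
```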
configs/rec/gtc/svtrv2_lnconv_smtr_gtc_smtr_long.yml
ADDED
@@ -0,0 +1,150 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/svtrv2_lnconv_smtr_gtc_nodetach_smtr_long_infer
  save_epoch_step: 1
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 1000]
  eval_epoch_step: [0, 1]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
  # ./tools/utils/ppocr_keys_v1.txt # ch
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/predicts_smtr.txt
  use_amp: True
  distributed: true

Optimizer:
  name: AdamW
  lr: 0.000325
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False
Architecture:
  model_type: rec
  algorithm: BGPD
  in_channels: 3
  Transform:
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    out_channels: 256
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: True
  Decoder:
    name: GTCDecoder
    infer_gtc: True
    detach: False
    gtc_decoder:
      name: SMTRDecoder
      num_layer: 1
      ds: True
      max_len: *max_text_length
      next_mode: &next True
      sub_str_len: &subsl 5
      infer_aug: True
    ctc_decoder:
      name: RCTCDecoder

Loss:
  name: GTCLoss
  ctc_weight: 0.1
  gtc_loss:
    name: SMTRLoss

PostProcess:
  name: GTCLabelDecode
  gtc_label_decode:
    name: SMTRLabelDecode
    next_mode: *next
    character_dict_path: *character_dict_path
    use_space_char: *use_space_char
  only_gtc: True

Metric:
  name: RecGTCMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: false
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_train_easy',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - SMTRLabelEncode: # Class handling label
          sub_str_len: *subsl
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'label_subs', 'label_next', 'length_subs',
                      'label_subs_pre', 'label_next_pre', 'length_subs_pre', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: &bs 256
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 12
    num_workers: 4

Eval:
  dataset:
    name: SimpleDataSet
    data_dir: ../ltb/
    label_file_list: ['../ltb/ultra_long_70_list.txt']
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - GTCLabelEncode: # Class handling label
          gtc_label_encode:
            name: ARLabelEncode
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: 200
      - SliceResize:
          image_shape: [3, 32, 128]
          padding: False
          max_ratio: 12
      - KeepKeys:
          keep_keys: ['image', 'label', 'length', 'ctc_label', 'ctc_length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 1
    num_workers: 2
configs/rec/gtc/svtrv2_lnconv_smtr_gtc_stream.yml
ADDED
@@ -0,0 +1,152 @@
Global:
  device: gpu
  epoch_num: 60
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/svtrv2_lnconv_smtr_gtc_stream
  save_epoch_step: 1
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 500]
  eval_epoch_step: [0, 1]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
  # ./tools/utils/ppocr_keys_v1.txt # ch
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/predicts_smtr.txt
  use_amp: True
  distributed: true
  grad_clip_val: 20

Optimizer:
  name: AdamW
  lr: 0.00065
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: BGPD
  in_channels: 3
  Transform:
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    out_channels: 256
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: True
  Decoder:
    name: GTCDecoder
    infer_gtc: True
    detach: False
    gtc_decoder:
      name: SMTRDecoder
      num_layer: 1
      ds: True
      max_len: *max_text_length
      next_mode: &next True
      sub_str_len: &subsl 5
      infer_aug: False
    ctc_decoder:
      name: RCTCDecoder

Loss:
  name: GTCLoss
  ctc_weight: 0.25
  gtc_loss:
    name: SMTRLoss

PostProcess:
  name: GTCLabelDecode
  gtc_label_decode:
    name: SMTRLabelDecode
    next_mode: *next
    character_dict_path: *character_dict_path
    use_space_char: *use_space_char
  only_gtc: True

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True
  stream: True

Train:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: false
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_train_easy',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - SMTRLabelEncode: # Class handling label
          sub_str_len: *subsl
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'label_subs', 'label_next', 'length_subs',
                      'label_subs_pre', 'label_next_pre', 'length_subs_pre', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: &bs 256
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 12
    num_workers: 4

Eval:
  dataset:
    name: SimpleDataSet
    data_dir: ../ltb/
    label_file_list: ['../ltb/ultra_long_70_list.txt']
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - GTCLabelEncode: # Class handling label
          gtc_label_encode:
            name: ARLabelEncode
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - SliceTVResize:
          image_shape: [32, 128]
          padding: False
          max_ratio: 4
      - KeepKeys:
          keep_keys: ['image', 'label', 'length', 'ctc_label', 'ctc_length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 1
    num_workers: 2
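The two long-text evaluation pipelines above feed 70-character "ultra long" samples through `SliceResize`/`SliceTVResize` with a `max_ratio` cap, i.e. a very wide line is cut into ratio-limited crops instead of being squashed into a single 128-pixel-wide image. A rough illustration of that slicing idea, written as my own PIL sketch rather than the OpenOCR transform:

```python
# Rough illustration of ratio-capped slicing for very wide text lines:
# resize to a fixed height, then cut the width into windows of at most
# max_ratio * height pixels. My own sketch of the idea behind
# SliceResize/SliceTVResize, not the OpenOCR implementation.
from PIL import Image

def slice_wide_line(img: Image.Image, height: int = 32, max_ratio: int = 4):
    w, h = img.size
    new_w = max(1, round(w * height / h))
    img = img.resize((new_w, height))
    win = max_ratio * height                      # widest allowed crop
    return [img.crop((x, 0, min(x + win, new_w), height))
            for x in range(0, new_w, win)]

if __name__ == '__main__':
    wide = Image.new('RGB', (1000, 50))           # synthetic 20:1 text line
    pieces = slice_wide_line(wide, height=32, max_ratio=4)
    print([p.size for p in pieces])               # five 128x32 crops
```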
configs/rec/igtr/readme.md
ADDED
@@ -0,0 +1,189 @@
# IGTR

- [IGTR](#igtr)
  - [1. Introduction](#1-introduction)
  - [2. Environment](#2-environment)
    - [Dataset Preparation](#dataset-preparation)
  - [3. Model Training / Evaluation](#3-model-training--evaluation)
  - [Citation](#citation)

<a name="1"></a>

## 1. Introduction

Paper:

> [Instruction-Guided Scene Text Recognition](https://arxiv.org/abs/2401.17851)
> Yongkun Du, Zhineng Chen, Yuchen Su, Caiyan Jia, Yu-Gang Jiang

<a name="model"></a>
Multi-modal models have recently shown appealing performance in visual recognition tasks, as free-form text-guided training evokes the ability to understand fine-grained visual content. However, current models are either inefficient or cannot be trivially upgraded to scene text recognition (STR) due to the composition difference between natural and text images. We propose a novel instruction-guided scene text recognition (IGTR) paradigm that formulates STR as an instruction learning problem and understands text images by predicting character attributes, e.g., character frequency, position, etc. IGTR first devises $\left\langle condition, question, answer \right\rangle$ instruction triplets, providing rich and diverse descriptions of character attributes. To effectively learn these attributes through question-answering, IGTR develops a lightweight instruction encoder, a cross-modal feature fusion module and a multi-task answer head, which guides nuanced text image understanding. Furthermore, IGTR realizes different recognition pipelines simply by using different instructions, enabling a character-understanding-based text reasoning paradigm that considerably differs from current methods. Experiments on English and Chinese benchmarks show that IGTR outperforms existing models by significant margins, while maintaining a small model size and efficient inference speed. Moreover, by adjusting the sampling of instructions, IGTR offers an elegant way to tackle the recognition of both rarely appearing and morphologically similar characters, which were previous challenges.

<a name="model"></a>
The accuracy (%) and model files of IGTR on the public scene text recognition datasets are as follows:

- Trained on the synthetic datasets (MJ+ST) and tested on the common benchmarks; training and test datasets are both from [PARSeq](https://github.com/baudm/parseq).

| Model | IC13<br/>857 | SVT | IIIT5k<br/>3000 | IC15<br/>1811 | SVTP | CUTE80 | Avg | Config&Model&Log |
| :-----: | :----------: | :--: | :-------------: | :-----------: | :--: | :----: | :---: | :---: |
| IGTR-PD | 97.6 | 95.2 | 97.6 | 88.4 | 91.6 | 95.5 | 94.30 | [link](https://drive.google.com/drive/folders/1Pv0CW2hiWC_dIyaB74W1fsXqiX3z5yXA?usp=drive_link) |
| IGTR-AR | 98.6 | 95.7 | 98.2 | 88.4 | 92.4 | 95.5 | 94.78 | as above |

- Tested on the Union14M-L benchmark, from [Union14M](https://github.com/Mountchicken/Union14M/).

| Model | Curve | Multi-<br/>Oriented | Artistic | Contextless | Salient | Multi-<br/>word | General | Avg | Config&Model&Log |
| :-----: | :---: | :-----------------: | :------: | :---------: | :-----: | :-------------: | :-----: | :---: | :---: |
| IGTR-PD | 76.9 | 30.6 | 59.1 | 63.3 | 77.8 | 62.5 | 66.7 | 62.40 | Same as the above table |
| IGTR-AR | 78.4 | 31.9 | 61.3 | 66.5 | 80.2 | 69.3 | 67.9 | 65.07 | as above |

- Trained on the Union14M-L training dataset.

| Model | IC13<br/>857 | SVT | IIIT5k<br/>3000 | IC15<br/>1811 | SVTP | CUTE80 | Avg | Config&Model&Log |
| :----------: | :----------: | :--: | :-------------: | :-----------: | :--: | :----: | :---: | :---: |
| IGTR-PD | 97.7 | 97.7 | 98.3 | 89.8 | 93.7 | 97.9 | 95.86 | [link](https://drive.google.com/drive/folders/1ZGlzDqEzjrBg8qG2wBkbOm3bLRzFbTzo?usp=drive_link) |
| IGTR-AR | 98.1 | 98.4 | 98.7 | 90.5 | 94.9 | 98.3 | 96.48 | as above |
| IGTR-PD-60ep | 97.9 | 98.3 | 99.2 | 90.8 | 93.7 | 97.6 | 96.24 | [link](https://drive.google.com/drive/folders/1ik4hxZDRsjU1RbCA19nwE45Kg1bCnMoa?usp=drive_link) |
| IGTR-AR-60ep | 98.4 | 98.1 | 99.3 | 91.5 | 94.3 | 97.6 | 96.54 | as above |
| IGTR-PD-PT | 98.6 | 98.0 | 99.1 | 91.7 | 96.8 | 99.0 | 97.20 | [link](https://drive.google.com/drive/folders/1QM0EWV66IfYI1G0Xm066V2zJA62hH6-1?usp=drive_link) |
| IGTR-AR-PT | 98.8 | 98.3 | 99.2 | 92.0 | 96.8 | 99.0 | 97.34 | as above |

| Model | Curve | Multi-<br/>Oriented | Artistic | Contextless | Salient | Multi-<br/>word | General | Avg | Config&Model&Log |
| :----------: | :---: | :-----------------: | :------: | :---------: | :-----: | :-------------: | :-----: | :---: | :---: |
| IGTR-PD | 88.1 | 89.9 | 74.2 | 80.3 | 82.8 | 79.2 | 83.0 | 82.51 | Same as the above table |
| IGTR-AR | 90.4 | 91.2 | 77.0 | 82.4 | 84.7 | 84.0 | 84.4 | 84.86 | as above |
| IGTR-PD-60ep | 90.0 | 92.1 | 77.5 | 82.8 | 86.0 | 83.0 | 84.8 | 85.18 | Same as the above table |
| IGTR-AR-60ep | 91.0 | 93.0 | 78.7 | 84.6 | 87.3 | 84.8 | 85.6 | 86.43 | as above |
| IGTR-PD-PT | 92.4 | 92.1 | 80.7 | 83.6 | 87.7 | 86.9 | 85.0 | 86.92 | Same as the above table |
| IGTR-AR-PT | 93.0 | 92.9 | 81.3 | 83.4 | 88.6 | 88.7 | 85.6 | 87.65 | as above |

- Trained and tested on the Chinese dataset, from the [Chinese Benchmark](https://github.com/FudanVI/benchmarking-chinese-text-recognition).

| Model | Scene | Web | Document | Handwriting | Avg | Config&Model&Log |
| :---------: | :---: | :--: | :------: | :---------: | :---: | :---: |
| IGTR-PD | 73.1 | 74.8 | 98.6 | 52.5 | 74.75 | |
| IGTR-AR | 75.1 | 76.4 | 98.7 | 55.3 | 76.37 | |
| IGTR-PD-TS | 73.5 | 75.9 | 98.7 | 54.5 | 75.65 | [link](https://drive.google.com/drive/folders/1H3VRdGHjhawd6fkSC-qlBzVzvYYTpHRg?usp=drive_link) |
| IGTR-AR-TS | 75.6 | 77.0 | 98.8 | 57.3 | 77.17 | as above |
| IGTR-PD-Aug | 79.5 | 80.0 | 99.4 | 58.9 | 79.45 | [link](https://drive.google.com/drive/folders/1XFQkCILwcFwA7iYyQY9crnrouaI5sqcZ?usp=drive_link) |
| IGTR-AR-Aug | 82.0 | 81.7 | 99.5 | 63.8 | 81.74 | as above |

Download all Configs, Models, and Logs from [Google Drive](https://drive.google.com/drive/folders/1mSRDg9Mj5R6PspAdFGXZHDHTCQmjkd8d?usp=drive_link).

<a name="2"></a>

## 2. Environment

- [PyTorch](http://pytorch.org/) version >= 1.13.0
- Python version >= 3.7

```shell
git clone -b develop https://github.com/Topdu/OpenOCR.git
cd OpenOCR
# A100 Ubuntu 20.04 Cuda 11.8
conda create -n openocr python==3.8
conda activate openocr
conda install pytorch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 pytorch-cuda=11.8 -c pytorch -c nvidia
pip install -r requirements.txt
```

#### Dataset Preparation

[English dataset download](https://github.com/baudm/parseq)

[Union14M-L download](https://github.com/Mountchicken/Union14M)

[Chinese dataset download](https://github.com/fudanvi/benchmarking-chinese-text-recognition#download)

The expected filesystem structure is as follows:

```
benchmark_bctr
├── benchmark_bctr_test
│   ├── document_test
│   ├── handwriting_test
│   ├── scene_test
│   └── web_test
└── benchmark_bctr_train
    ├── document_train
    ├── handwriting_train
    ├── scene_train
    └── web_train
evaluation
├── CUTE80
├── IC13_857
├── IC15_1811
├── IIIT5k
├── SVT
└── SVTP
OpenOCR
synth
├── MJ
│   ├── test
│   ├── train
│   └── val
└── ST
test # from PARSeq
├── ArT
├── COCOv1.4
├── CUTE80
├── IC13_1015
├── IC13_1095
├── IC13_857
├── IC15_1811
├── IC15_2077
├── IIIT5k
├── SVT
├── SVTP
└── Uber
u14m # lmdb format
├── artistic
├── contextless
├── curve
├── general
├── multi_oriented
├── multi_words
└── salient
Union14M-LMDB-L # lmdb format
├── train_challenging
├── train_easy
├── train_hard
├── train_medium
└── train_normal
```

<a name="3"></a>

## 3. Model Training / Evaluation

Training:

```shell
# The configuration file is available from the link provided in the table above.
# Multi GPU training
CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 tools/train_rec.py --c PATH/svtr_base_igtr_XXX.yml
```

Evaluation:

```shell
# The configuration file is available from the link provided in the table above.
# en
python tools/eval_rec_all_ratio.py --c PATH/svtr_base_igtr_syn.yml
# ch
python tools/eval_rec_all_ch.py --c PATH/svtr_base_igtr_ch_aug.yml
```

## Citation

```bibtex
@article{Du2024IGTR,
  title        = {Instruction-Guided Scene Text Recognition},
  author       = {Du, Yongkun and Chen, Zhineng and Su, Yuchen and Jia, Caiyan and Jiang, Yu-Gang},
  journal      = {CoRR},
  eprinttype   = {arXiv},
  primaryClass = {cs.CV},
  volume       = {abs/2401.17851},
  year         = {2024},
  url          = {https://arxiv.org/abs/2401.17851}
}
```
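Before launching the training and evaluation commands above, it can help to confirm that the evaluation and Union14M-L directories actually match the layout shown. The following optional helper is my own sketch (not part of OpenOCR) and assumes the same relative paths used in the readme tree and the configs.

```python
# Optional sanity check (not part of OpenOCR): verify the dataset layout
# described above before launching training/evaluation.
import os

EXPECTED = {
    '../evaluation': ['CUTE80', 'IC13_857', 'IC15_1811', 'IIIT5k', 'SVT', 'SVTP'],
    '../Union14M-LMDB-L': ['train_challenging', 'train_easy', 'train_hard',
                           'train_medium', 'train_normal'],
}

def check_layout(expected=EXPECTED) -> bool:
    ok = True
    for root, subdirs in expected.items():
        for d in subdirs:
            path = os.path.join(root, d)
            if not os.path.isdir(path):
                print(f'missing: {path}')
                ok = False
    return ok

if __name__ == '__main__':
    print('layout ok' if check_layout() else 'layout incomplete')
```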
configs/rec/igtr/svtr_base_ds_igtr.yml
ADDED
@@ -0,0 +1,157 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtr_base_igtr
  save_epoch_step: 1
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 500]
  eval_epoch_step: [0, 1]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path
  # ./tools/utils/EN_symbol_dict.txt # 96en
  # ./tools/utils/ppocr_keys_v1.txt # ch
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/u14m_filter/predicts_svtr_base_igtr.txt
  use_amp: True

Optimizer:
  name: AdamW
  lr: 0.0005 # 2gpus 384bs/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: IGTR
  in_channels: 3
  Transform:
  Encoder:
    name: SVTRNet2DPos
    img_size: [32, -1]
    out_char_num: 25
    out_channels: 256
    patch_merging: 'Conv'
    embed_dim: [128, 256, 384]
    depth: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: ['ConvB','ConvB','ConvB','ConvB','ConvB','ConvB', 'ConvB','ConvB', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
    local_mixer: [[5, 5], [5, 5], [5, 5]]
    last_stage: False
    prenorm: True
    use_first_sub: False
  Decoder:
    name: IGTRDecoder
    dim: 384
    num_layer: 1
    ar: False
    refine_iter: 0
    # next_pred: True
    next_pred: False
    pos2d: True
    ds: True
    # pos_len: False
    # rec_layer: 1

Loss:
  name: IGTRLoss

PostProcess:
  name: IGTRLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char

Metric:
  name: RecMetric
  main_indicator: acc

Train:
  dataset:
    name: RatioDataSet
    ds_width: True
    padding: &padding False
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_train_easy',
                    ]
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - PARSeqAug:
      - IGTRLabelEncode: # Class handling label
          k: 8
          prompt_error: False
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'prompt_pos_idx_list',
                      'prompt_char_idx_list', 'ques_pos_idx_list', 'ques1_answer_list',
                      'ques2_char_idx_list', 'ques2_answer_list', 'ques3_answer', 'ques4_char_num_list',
                      'ques_len_list', 'ques2_len_list', 'prompt_len_list', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: &bs 384
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 4
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSet
    ds_width: True
    padding: *padding
    data_dir_list: ['../evaluation/CUTE80',
                    '../evaluation/IC13_857',
                    '../evaluation/IC15_1811',
                    '../evaluation/IIIT5k',
                    '../evaluation/SVT',
                    '../evaluation/SVTP']
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - ARLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: 256
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    max_ratio: *max_ratio
    num_workers: 4
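The RatioSampler settings that recur in these configs (`scales: [[128, 32]]`, `divided_factor: [4, 16]`, a `max_ratio` cap) imply that each batch gets a width snapped to its aspect-ratio bucket while staying divisible by the encoder's downsampling factor. A small worked example of that arithmetic; this is my own reading of the "divide_factor" comment, with a hypothetical helper, not the sampler's actual code.

```python
# Worked example of the ratio-bucket arithmetic implied by RatioSampler's
# settings (base height 32, width factor 4 from divided_factor, max_ratio cap).
# Illustration only; not the RatioSampler implementation.
def bucket_width(ratio: float, base_h: int = 32, w_factor: int = 4,
                 max_ratio: int = 4) -> int:
    ratio = min(max(ratio, 1.0), float(max_ratio))     # clamp aspect ratio
    w = round(base_h * ratio)
    return max(w_factor, (w // w_factor) * w_factor)   # keep width divisible by 4

for r in (0.8, 1.7, 3.2, 9.0):
    print(r, '->', bucket_width(r), 'x', 32)
# 0.8 -> 32 x 32, 1.7 -> 52 x 32, 3.2 -> 100 x 32, 9.0 -> 128 x 32
```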
configs/rec/lister/focalsvtr_lister_wo_fem_maxratio12.yml
ADDED
@@ -0,0 +1,133 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/focalsvtr_lister_wo_fem_maxratio12/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/u14m_filter/predicts_focalsvtr_lister_wo_fem_maxratio12.txt
  use_amp: True
  grad_clip_val: 20

Optimizer:
  name: AdamW
  lr: 0.00065
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: LISTER
  Transform:
  Encoder:
    name: FocalSVTR
    img_size: [32, 128]
    depths: [6, 6, 9]
    embed_dim: 96
    sub_k: [[1, 1], [2, 1], [1, 1]]
    focal_levels: [3, 3, 3]
    last_stage: False
    feat2d: True
  Decoder:
    name: LISTERDecoder
    detach_grad: False
    attn_scaling: True
    use_fem: False

Loss:
  name: LISTERLoss

PostProcess:
  name: LISTERLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: False
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_filter_train_easy',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - EPLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length']
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: &bs 256
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: 12
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: False
    data_dir_list: ['../evaluation/CUTE80',
                    '../evaluation/IC13_857',
                    '../evaluation/IC15_1811',
                    '../evaluation/IIIT5k',
                    '../evaluation/SVT',
                    '../evaluation/SVTP',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - EPLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length']
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: 256
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: *bs
    max_ratio: 12
    num_workers: 4
configs/rec/lister/svtrv2_lister_wo_fem_maxratio12.yml
ADDED
@@ -0,0 +1,138 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtrv2_lister_wo_fem_maxratio12/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_lister_wo_fem_maxratio12.txt
  use_amp: True
  grad_clip_val: 20

Optimizer:
  name: AdamW
  lr: 0.000325
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: LISTER
  Transform:
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    out_channels: 256
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: True
  Decoder:
    name: LISTERDecoder
    detach_grad: False
    attn_scaling: True
    use_fem: False

Loss:
  name: LISTERLoss

PostProcess:
  name: LISTERLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: False
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_train_easy',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - EPLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length']
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: &bs 128
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: 12
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: False
    data_dir_list: ['../evaluation/CUTE80',
                    '../evaluation/IC13_857',
                    '../evaluation/IC15_1811',
                    '../evaluation/IIIT5k',
                    '../evaluation/SVT',
                    '../evaluation/SVTP',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - EPLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: 256
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: *bs
    max_ratio: 12
    num_workers: 4
configs/rec/lpv/svtr_base_lpv.yml
ADDED
@@ -0,0 +1,124 @@
1 |
+
Global:
|
2 |
+
device: gpu
|
3 |
+
epoch_num: 20
|
4 |
+
log_smooth_window: 20
|
5 |
+
print_batch_step: 10
|
6 |
+
output_dir: ./output/rec/u14m_filter/svtr_base_lpv/
|
7 |
+
save_epoch_step: 1
|
8 |
+
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 500]
  eval_epoch_step: [0, 1]
  cal_metric_during_train: True
  pretrained_model:
  # ./output/rec/u14m_filter/svtr_base_lpv_wo_glrm/best.pth
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
  # ./tools/utils/ppocr_keys_v1.txt # ch
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/u14m_filter/predicts_svtr_lpv.txt
  use_amp: True
  grad_clip_val: 20

Optimizer:
  name: Adam
  lr: 0.0001 # for 4gpus bs128/gpu
  weight_decay: 0.0
  filter_bias_and_bn: False
  betas: [0.9, 0.99]

LRScheduler:
  name: MultiStepLR
  milestones: [12]
  gamma: 0.1

Architecture:
  model_type: rec
  algorithm: LPV
  in_channels: 3
  Transform:
  Encoder:
    name: SVTRNet
    img_size: [32, 128]
    out_char_num: 25
    out_channels: 256
    patch_merging: 'Conv'
    embed_dim: [128, 256, 384]
    depth: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
    local_mixer: [[5, 5], [5, 5], [5, 5]]
    sub_k: [[1, 1], [1, 1]]
    feature2d: True
    last_stage: False
    prenorm: True
  Decoder:
    name: LPVDecoder
    num_layer: 3
    max_len: *max_text_length
    use_mask: True
    dim_feedforward: 1536
    nhead: 12
    dropout: 0.1
    trans_layer: 3

Loss:
  name: LPVLoss

PostProcess:
  name: ARLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ARLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 128
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation/
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ARLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 128
    num_workers: 4
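Note on the `&name` / `*name` syntax used throughout these configs: a value such as `&max_text_length 25` is a YAML anchor defined once under Global and reused elsewhere via the alias `*max_text_length`. A minimal sketch with PyYAML (this is not OpenOCR's own config loader; the path is assumed to exist in your checkout):

import yaml

# load the config added above and check that aliases resolve to the anchored values
with open('configs/rec/lpv/svtr_base_lpv.yml', 'r') as f:
    cfg = yaml.safe_load(f)

assert cfg['Global']['max_text_length'] == 25            # &max_text_length
assert cfg['Architecture']['Decoder']['max_len'] == 25   # *max_text_length
print(cfg['PostProcess']['character_dict_path'])         # resolved from *character_dict_path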
configs/rec/lpv/svtr_base_lpv_wo_glrm.yml
ADDED
@@ -0,0 +1,123 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtr_base_lpv_wo_glrm/
  save_epoch_step: 1
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 500]
  eval_epoch_step: [0, 1]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
  # ./tools/utils/ppocr_keys_v1.txt # ch
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/u14m_filter/predicts_svtr_base_lpv_wo_glrm.txt
  use_amp: True
  grad_clip_val: 20

Optimizer:
  name: Adam
  lr: 0.0001 # for 4gpus bs128/gpu
  weight_decay: 0.0
  filter_bias_and_bn: False
  betas: [0.9, 0.99]

LRScheduler:
  name: MultiStepLR
  milestones: [12]
  gamma: 0.1

Architecture:
  model_type: rec
  algorithm: LPV
  in_channels: 3
  Transform:
  Encoder:
    name: SVTRNet
    img_size: [32, 128]
    out_char_num: 25
    out_channels: 256
    patch_merging: 'Conv'
    embed_dim: [128, 256, 384]
    depth: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
    local_mixer: [[5, 5], [5, 5], [5, 5]]
    sub_k: [[1, 1], [1, 1]]
    feature2d: True
    last_stage: False
    prenorm: True
  Decoder:
    name: LPVDecoder
    num_layer: 3
    max_len: *max_text_length
    use_mask: False
    dim_feedforward: 1536
    nhead: 12
    dropout: 0.1
    trans_layer: 3

Loss:
  name: LPVLoss

PostProcess:
  name: ARLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ARLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 128
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation/
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ARLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 128
    num_workers: 4
configs/rec/lpv/svtrv2_lpv.yml
ADDED
@@ -0,0 +1,147 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtrv2_lpv/
  save_epoch_step: 1
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 500]
  eval_epoch_step: [0, 1]
  cal_metric_during_train: True
  pretrained_model:
  # ./output/rec/u14m_filter/svtrv2_lpv_wo_glrm/best.pth
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
  # ./tools/utils/ppocr_keys_v1.txt # ch
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_lpv.txt
  use_amp: True
  grad_clip_val: 20

Optimizer:
  name: AdamW
  lr: 0.000325 # for 4gpus bs128/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: LPV
  in_channels: 3
  Transform:
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: True
  Decoder:
    name: LPVDecoder
    num_layer: 3
    max_len: *max_text_length
    use_mask: True
    dim_feedforward: 1536
    nhead: 12
    dropout: 0.1
    trans_layer: 3

Loss:
  name: LPVLoss

PostProcess:
  name: ARLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: false
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_train_easy',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ARLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: &bs 128
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 4
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: False
    data_dir_list: [
      '../evaluation/CUTE80',
      '../evaluation/IC13_857',
      '../evaluation/IC15_1811',
      '../evaluation/IIIT5k',
      '../evaluation/SVT',
      '../evaluation/SVTP',
      ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ARLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: *bs
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: *bs
    max_ratio: *max_ratio
    num_workers: 4
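The "warmup_epoch" comments in these files hint at a mapping onto a one-cycle schedule where pct_start = warmup_epoch / epoch_num. An illustrative stand-alone sketch with torch.optim.lr_scheduler.OneCycleLR (not the repo's own scheduler wiring; steps_per_epoch is a placeholder):

import torch

model = torch.nn.Linear(10, 10)  # stand-in for the recognizer
optimizer = torch.optim.AdamW(model.parameters(), lr=0.000325, weight_decay=0.05)

epoch_num, steps_per_epoch, warmup_epoch = 20, 1000, 1
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=0.000325,
    total_steps=epoch_num * steps_per_epoch,
    pct_start=warmup_epoch / epoch_num,  # fraction of training spent warming up
    cycle_momentum=False,                # matches cycle_momentum: False above
)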
configs/rec/lpv/svtrv2_lpv_wo_glrm.yml
ADDED
@@ -0,0 +1,146 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtrv2_lpv_wo_glrm/
  save_epoch_step: 1
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 500]
  eval_epoch_step: [0, 1]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
  # ./tools/utils/ppocr_keys_v1.txt # ch
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_lpv_wo_glrm.txt
  use_amp: True
  grad_clip_val: 20

Optimizer:
  name: AdamW
  lr: 0.000325 # for 4gpus bs128/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: LPV
  in_channels: 3
  Transform:
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: True
  Decoder:
    name: LPVDecoder
    num_layer: 3
    max_len: *max_text_length
    use_mask: False
    dim_feedforward: 1536
    nhead: 12
    dropout: 0.1
    trans_layer: 3

Loss:
  name: LPVLoss

PostProcess:
  name: ARLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: false
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_train_easy',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ARLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: &bs 128
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 4
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: False
    data_dir_list: [
      '../evaluation/CUTE80',
      '../evaluation/IC13_857',
      '../evaluation/IC15_1811',
      '../evaluation/IIIT5k',
      '../evaluation/SVT',
      '../evaluation/SVTP',
      ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ARLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: *bs
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: *bs
    max_ratio: *max_ratio
    num_workers: 4
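The "lr: 0.000325 # for 4gpus bs128/gpu" comments tie each learning rate to a reference global batch size (here 4 * 128 = 512). A common convention when changing the GPU count or per-GPU batch size is linear scaling; this is a generic sketch, not something the repo states explicitly:

def scaled_lr(base_lr=0.000325, base_global_bs=4 * 128, gpus=1, bs_per_gpu=128):
    """Scale the reference LR in proportion to the new global batch size."""
    return base_lr * (gpus * bs_per_gpu) / base_global_bs

print(scaled_lr(gpus=1, bs_per_gpu=128))  # ~8.1e-05 for a single-GPU run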
configs/rec/maerec/vit_nrtr.yml
ADDED
@@ -0,0 +1,116 @@
Global:
  device: gpu
  epoch_num: 10
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/vit_nrtr_ft_mae/
  save_epoch_step: 1
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 500]
  eval_epoch_step: [0, 1]
  cal_metric_during_train: True
  pretrained_model:
  # ./open_ocr_vit_small_params.pth
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
  # ./tools/utils/ppocr_keys_v1.txt # ch
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/u14m_filter/predicts_vit_nrtr_ft_mae.txt
  use_amp: True
  project_name: maerec

Optimizer:
  name: AdamW
  lr: 0.00065 # for 4gpus bs256/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 : 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: BGPD
  in_channels: 3
  Transform:
  Encoder:
    name: ViT
    img_size: [32, 128]
    patch_size: [4, 4]
    embed_dim: 384
    depth: 12
    num_heads: 6
    mlp_ratio: 4
    qkv_bias: True
    use_cls_token: True
  Decoder:
    name: NRTRDecoder
    num_encoder_layers: -1
    beam_size: 0
    num_decoder_layers: 6
    nhead: 8
    max_len: *max_text_length

Loss:
  name: ARLoss

PostProcess:
  name: ARLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ARLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation/
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ARLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 4
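The commented pretrained_model entry above (./open_ocr_vit_small_params.pth) implies MAERec-style fine-tuning: the ViT encoder starts from pretrained weights while the decoder trains from scratch. A generic PyTorch sketch of partial checkpoint loading with strict=False, using a tiny stand-in module rather than the OpenOCR model or its real checkpoint:

import torch
from torch import nn

class TinyEncoderDecoder(nn.Module):  # stand-in, not the OpenOCR architecture
    def __init__(self):
        super().__init__()
        self.encoder = nn.Linear(384, 384)
        self.decoder = nn.Linear(384, 97)

model = TinyEncoderDecoder()
# fake a checkpoint containing only encoder weights so the sketch runs without the real file
torch.save({'encoder.weight': torch.randn(384, 384), 'encoder.bias': torch.zeros(384)}, '/tmp/demo_enc.pth')
state = torch.load('/tmp/demo_enc.pth', map_location='cpu')
missing, unexpected = model.load_state_dict(state, strict=False)
print('missing:', missing)  # decoder.* stays randomly initialized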
configs/rec/matrn/resnet45_trans_matrn.yml
ADDED
@@ -0,0 +1,95 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/resnet45_trans_matrn/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  # ./openocr_nolang_abinet_lang.pth
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_resnet45_trans_matrn.txt
  grad_clip_val: 20
  use_amp: True

Optimizer:
  name: Adam
  lr: 0.000133 # 4gpus 128bs/gpu
  weight_decay: 0.0
  filter_bias_and_bn: False

LRScheduler:
  name: MultiStepLR
  milestones: [12, 18]
  gamma: 0.1

Architecture:
  model_type: rec
  algorithm: MATRN
  Transform:
  Encoder:
    name: ResNet45
    in_channels: 3
    strides: [2, 1, 2, 1, 1]
  Decoder:
    name: MATRNDecoder
    iter_size: 3

Loss:
  name: ABINetLoss
  align_weight: 3.0

PostProcess:
  name: ABINetLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ABINetLabelEncode:
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 128
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ABINetLabelEncode:
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
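The MultiStepLR block above drops the learning rate by gamma at each milestone. Assuming the scheduler is stepped once per epoch, 0.000133 becomes 1.33e-05 after epoch 12 and 1.33e-06 after epoch 18; a minimal stand-alone check with the standard PyTorch scheduler (not the repo's training loop):

import torch

opt = torch.optim.Adam(torch.nn.Linear(4, 4).parameters(), lr=0.000133)
sched = torch.optim.lr_scheduler.MultiStepLR(opt, milestones=[12, 18], gamma=0.1)
for epoch in range(20):
    opt.step()    # placeholder for one epoch of training
    sched.step()
    if epoch + 1 in (12, 13, 18, 19):
        print(epoch + 1, sched.get_last_lr())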
configs/rec/matrn/svtrv2_matrn.yml
ADDED
@@ -0,0 +1,130 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtrv2_matrn/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  # ./openocr_svtrv2_nolang_abinet_lang.pth
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_matrn.txt
  use_amp: True
  grad_clip_val: 20

Optimizer:
  name: AdamW
  lr: 0.000325 # for 4gpus bs128/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: MATRN
  Transform:
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: True
  Decoder:
    name: MATRNDecoder
    iter_size: 3
    num_layers: 0

Loss:
  name: ABINetLoss

PostProcess:
  name: ABINetLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: false
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_train_easy',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ABINetLabelEncode:
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: &bs 128
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 4
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: False
    data_dir_list: [
      '../evaluation/CUTE80',
      '../evaluation/IC13_857',
      '../evaluation/IC15_1811',
      '../evaluation/IIIT5k',
      '../evaluation/SVT',
      '../evaluation/SVTP',
      ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ABINetLabelEncode:
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: *bs
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: *bs
    max_ratio: *max_ratio
    num_workers: 4
configs/rec/mgpstr/svtrv2_mgpstr_only_char.yml
ADDED
@@ -0,0 +1,140 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtrv2_mgpstr_only_char/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  use_amp: True
  save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_mgpstr_only_char.txt

Optimizer:
  name: AdamW
  lr: 0.00065 # 4gpus 256bs/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: MGPSTR
  Transform:
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    out_channels: 256
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: false
  Decoder:
    name: MGPDecoder
    only_char: &only_char True

Loss:
  name: MGPLoss
  only_char: *only_char

PostProcess:
  name: MPGLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char
  only_char: *only_char

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: false
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_filter_train_easy',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - MGPLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
          only_char: *only_char
      - KeepKeys:
          keep_keys: ['image', 'char_label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: &bs 256
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 4
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: False
    data_dir_list: [
      '../evaluation/CUTE80',
      '../evaluation/IC13_857',
      '../evaluation/IC15_1811',
      '../evaluation/IIIT5k',
      '../evaluation/SVT',
      '../evaluation/SVTP',
      ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - MGPLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
          only_char: *only_char
      - KeepKeys:
          keep_keys: ['image', 'char_label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: *bs
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: *bs
    max_ratio: *max_ratio
    num_workers: 4
configs/rec/mgpstr/vit_base_mgpstr_only_char.yml
ADDED
@@ -0,0 +1,111 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/vit_base_mgpstr/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: False
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  use_amp: True
  save_res_path: ./output/rec/u14m_filter/predicts_vit_mgpstr_only_char.txt
  grad_clip_val: 5
  project_name: mgpstr_base

Optimizer:
  name: Adam
  lr: 0.000325 # 4gpus 128bs/gpu
  weight_decay: 0.
  filter_bias_and_bn: False

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: MGPSTR
  Transform:
  Encoder:
    name: ViT
    img_size: [32,128]
    patch_size: [4, 4]
    embed_dim: 768
    depth: 12
    num_heads: 12
    mlp_ratio: 4
    qkv_bias: True
  Decoder:
    name: MGPDecoder
    only_char: &only_char True

Loss:
  name: MGPLoss
  only_char: *only_char

PostProcess:
  name: MPGLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char
  only_char: *only_char

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - MGPLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
          only_char: *only_char
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'char_label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 128
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation/
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - MGPLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
          only_char: *only_char
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'char_label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
configs/rec/mgpstr/vit_large_mgpstr_only_char.yml
ADDED
@@ -0,0 +1,110 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/vit_base_mgpstr_only_char/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: False
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  use_amp: True
  save_res_path: ./output/rec/u14m_filter/predicts_vit_mgpstr_only_char.txt
  grad_clip_val: 5

Optimizer:
  name: Adam
  lr: 0.000325 # 4gpus 128bs/gpu
  weight_decay: 0.
  filter_bias_and_bn: False

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: MGPSTR
  Transform:
  Encoder:
    name: ViT
    img_size: [32,128]
    patch_size: [4, 4]
    embed_dim: 1024
    depth: 24
    num_heads: 16
    mlp_ratio: 4
    qkv_bias: True
  Decoder:
    name: MGPDecoder
    only_char: &only_char True

Loss:
  name: MGPLoss
  only_char: *only_char

PostProcess:
  name: MPGLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char
  only_char: *only_char

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - MGPLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
          only_char: *only_char
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'char_label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 128
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation/
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - MGPLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
          only_char: *only_char
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'char_label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
configs/rec/mgpstr/vit_mgpstr.yml
ADDED
@@ -0,0 +1,110 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/vit_mgpstr/
  eval_epoch_step: [0, 1]
  eval_batch_step: [100000, 2000]
  cal_metric_during_train: False
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  use_amp: True
  save_res_path: ./output/rec/u14m_filter/predicts_vit_mgpstr.txt
  grad_clip_val: 5

Optimizer:
  name: Adam
  lr: 0.000325 # 4gpus 128bs/gpu
  weight_decay: 0.
  filter_bias_and_bn: False

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: MGPSTR
  Transform:
  Encoder:
    name: ViT
    img_size: [32,128]
    patch_size: [4, 4]
    embed_dim: 384
    depth: 12
    num_heads: 6
    mlp_ratio: 4
    qkv_bias: True
  Decoder:
    name: MGPDecoder
    only_char: &only_char False

Loss:
  name: MGPLoss
  only_char: *only_char

PostProcess:
  name: MPGLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char
  only_char: *only_char

Metric:
  name: RecMPGMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - MGPLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
          only_char: *only_char
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'char_label', 'bpe_label', 'wp_label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 128
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation/
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - MGPLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
          only_char: *only_char
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'char_label', 'bpe_label', 'wp_label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
configs/rec/mgpstr/vit_mgpstr_only_char.yml
ADDED
@@ -0,0 +1,110 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/vit_mgpstr_only_char/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: False
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  use_amp: True
  save_res_path: ./output/rec/u14m_filter/predicts_vit_mgpstr_only_char.txt
  grad_clip_val: 5

Optimizer:
  name: Adam
  lr: 0.000325 # 4gpus 128bs/gpu
  weight_decay: 0.
  filter_bias_and_bn: False

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: MGPSTR
  Transform:
  Encoder:
    name: ViT
    img_size: [32,128]
    patch_size: [4, 4]
    embed_dim: 384
    depth: 12
    num_heads: 6
    mlp_ratio: 4
    qkv_bias: True
  Decoder:
    name: MGPDecoder
    only_char: &only_char True

Loss:
  name: MGPLoss
  only_char: *only_char

PostProcess:
  name: MPGLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char
  only_char: *only_char

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - MGPLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
          only_char: *only_char
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'char_label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 128
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation/
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - MGPLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
          only_char: *only_char
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'char_label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
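The practical difference between vit_mgpstr.yml and the *_only_char variants above is the `only_char` anchor: with it set to False the label encoder also emits BPE and word-piece targets, so KeepKeys carries 'char_label', 'bpe_label' and 'wp_label'; with it set to True only 'char_label' remains. A quick PyYAML inspection (plain parsing, not the framework's config machinery; paths assume the configs/ tree added in this commit):

import yaml

def keep_keys(path):
    cfg = yaml.safe_load(open(path))
    for t in cfg['Train']['dataset']['transforms']:
        if 'KeepKeys' in t:
            return t['KeepKeys']['keep_keys']

print(keep_keys('configs/rec/mgpstr/vit_mgpstr.yml'))            # char, bpe and wp labels
print(keep_keys('configs/rec/mgpstr/vit_mgpstr_only_char.yml'))  # char label only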
configs/rec/moran/resnet31_lstm_moran.yml
ADDED
@@ -0,0 +1,92 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/resnet31_lstm_moran
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: 25
  use_space_char: False
  save_res_path: ./output/rec/predicts_moran.txt
  use_amp: True
  grad_clip_val: 1.0

Optimizer:
  name: Adam
  lr: 0.002 # for 1gpus bs1024/gpu
  weight_decay: 0.05
  filter_bias_and_bn: False

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: MORAN
  Transform:
    name: MORN
    target_shape: [32, 128]
  Encoder:
    name: ResNet_ASTER
  Decoder:
    name: ASTERDecoder

Loss:
  name: ARLoss

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

PostProcess:
  name: ARLabelDecode

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ARLabelEncode: # Class handling label
      - RecTVResize:
          image_shape: [64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 1024
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ARLabelEncode: # Class handling label
      - RecTVResize:
          image_shape: [64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
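Several of these configs pair `use_amp: True` with a `grad_clip_val` (1.0 here, 5 or 20 elsewhere). A generic PyTorch sketch of how those two settings typically interact (unscale before clipping so the threshold applies to real gradients); this is not the repo's training loop and needs a CUDA device to run:

import torch

model = torch.nn.Linear(8, 8).cuda()
opt = torch.optim.Adam(model.parameters(), lr=0.002)
scaler = torch.cuda.amp.GradScaler()

x = torch.randn(4, 8, device='cuda')
with torch.cuda.amp.autocast():
    loss = model(x).square().mean()
scaler.scale(loss).backward()
scaler.unscale_(opt)                                               # undo loss scaling first
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)   # grad_clip_val
scaler.step(opt)
scaler.update()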
configs/rec/nrtr/focalsvtr_nrtr_maxraio12.yml
ADDED
@@ -0,0 +1,145 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/focalsvtr_nrtr_maxrtio12
  save_epoch_step: 1
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 500]
  eval_epoch_step: [0, 1]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img: ../ltb/img
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
  # ./tools/utils/ppocr_keys_v1.txt # ch
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/u14m_filter/predicts_focalsvtr_nrtr_maxrtio12.txt
  use_amp: True

Optimizer:
  name: AdamW
  lr: 0.00065 # for 4gpus bs256/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: NRTR
  in_channels: 3
  Transform:
  Encoder:
    name: FocalSVTR
    img_size: [32, 128]
    depths: [6, 6, 6]
    embed_dim: 96
    sub_k: [[1, 1], [2, 1], [1, 1]]
    focal_levels: [3, 3, 3]
    last_stage: False
  Decoder:
    name: NRTRDecoder
    num_encoder_layers: -1
    beam_size: 0
    num_decoder_layers: 2
    nhead: 12
    max_len: *max_text_length

Loss:
  name: ARLoss

PostProcess:
  name: ARLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSet
    ds_width: True
    padding: &padding True
    padding_rand: True
    padding_doub: True
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_train_easy',
                    ]
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - PARSeqAug:
      - ARLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: &bs 256
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 12
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSet
    ds_width: True
    padding: False
    padding_rand: False
    data_dir_list: [
      '../evaluation/CUTE80',
      '../evaluation/IC13_857',
      '../evaluation/IC15_1811',
      '../evaluation/IIIT5k',
      '../evaluation/SVT',
      '../evaluation/SVTP',
      ]
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - ARLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: 128
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    max_ratio: *max_ratio
    batch_size_per_card: 128
    num_workers: 4
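The RatioSampler settings above (scales [[128, 32]], divided_factor, max_ratio 12 instead of the usual 4) describe aspect-ratio-aware batching: samples are grouped by width/height ratio, the ratio is capped at max_ratio, and target widths stay divisible by the downsampling factor. A toy illustration of that bucketing idea only; it is not the repo's RatioSampler implementation:

import math

def bucket(w, h, max_ratio=12, base_h=32, divided_factor_w=4):
    ratio = min(max(round(w / h), 1), max_ratio)            # cap very wide crops
    out_w = base_h * ratio
    out_w = math.ceil(out_w / divided_factor_w) * divided_factor_w  # keep divisible for downsampling
    return ratio, (out_w, base_h)

for w, h in [(100, 32), (400, 32), (900, 32)]:
    print(bucket(w, h))   # ratios 3, 12 (capped), 12 (capped)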
configs/rec/nrtr/nrtr.yml
ADDED
@@ -0,0 +1,107 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/nrtr/
  save_epoch_step: 1
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 500]
  eval_epoch_step: [0, 1]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
  # ./tools/utils/ppocr_keys_v1.txt # ch
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/u14m_filter/predicts_nrtr.txt
  use_amp: True

Optimizer:
  name: AdamW
  lr: 0.00065 # for 4gpus bs256/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: BGPD
  in_channels: 3
  Transform:
  Encoder:
    name: NRTREncoder
  Decoder:
    name: NRTRDecoder
    num_encoder_layers: 6
    beam_size: 0
    num_decoder_layers: 6
    nhead: 8
    max_len: *max_text_length


Loss:
  name: ARLoss

PostProcess:
  name: ARLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ARLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation/
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ARLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256