openocr demo (Hugging Face Space)

Note: this diff view is limited to 50 files because the commit contains too many changes; see the raw diff for the complete change set.
- app.py +127 -0
- configs/det/dbnet/repvit_db.yml +173 -0
- configs/rec/abinet/resnet45_trans_abinet_lang.yml +94 -0
- configs/rec/abinet/resnet45_trans_abinet_wo_lang.yml +93 -0
- configs/rec/abinet/svtrv2_abinet_lang.yml +130 -0
- configs/rec/abinet/svtrv2_abinet_wo_lang.yml +128 -0
- configs/rec/aster/resnet31_lstm_aster_tps_on.yml +93 -0
- configs/rec/aster/svtrv2_aster.yml +127 -0
- configs/rec/aster/svtrv2_aster_tps_on.yml +102 -0
- configs/rec/autostr/autostr_lstm_aster_tps_on.yml +95 -0
- configs/rec/busnet/svtrv2_busnet.yml +135 -0
- configs/rec/busnet/svtrv2_busnet_pretraining.yml +134 -0
- configs/rec/busnet/vit_busnet.yml +104 -0
- configs/rec/busnet/vit_busnet_pretraining.yml +104 -0
- configs/rec/cam/convnextv2_cam_tps_on.yml +118 -0
- configs/rec/cam/convnextv2_tiny_cam_tps_on.yml +118 -0
- configs/rec/cam/svtrv2_cam_tps_on.yml +123 -0
- configs/rec/cdistnet/resnet45_trans_cdistnet.yml +93 -0
- configs/rec/cdistnet/svtrv2_cdistnet.yml +139 -0
- configs/rec/cppd/svtr_base_cppd.yml +123 -0
- configs/rec/cppd/svtr_base_cppd_ch.yml +126 -0
- configs/rec/cppd/svtr_base_cppd_h8.yml +123 -0
- configs/rec/cppd/svtr_base_cppd_syn.yml +124 -0
- configs/rec/cppd/svtrv2_cppd.yml +150 -0
- configs/rec/dan/resnet45_fpn_dan.yml +98 -0
- configs/rec/dan/svtrv2_dan.yml +130 -0
- configs/rec/focalsvtr/focalsvtr_ctc.yml +137 -0
- configs/rec/gtc/svtrv2_lnconv_nrtr_gtc.yml +168 -0
- configs/rec/gtc/svtrv2_lnconv_smtr_gtc_long_infer.yml +151 -0
- configs/rec/gtc/svtrv2_lnconv_smtr_gtc_smtr_long.yml +150 -0
- configs/rec/gtc/svtrv2_lnconv_smtr_gtc_stream.yml +152 -0
- configs/rec/igtr/readme.md +189 -0
- configs/rec/igtr/svtr_base_ds_igtr.yml +157 -0
- configs/rec/lister/focalsvtr_lister_wo_fem_maxratio12.yml +133 -0
- configs/rec/lister/svtrv2_lister_wo_fem_maxratio12.yml +138 -0
- configs/rec/lpv/svtr_base_lpv.yml +124 -0
- configs/rec/lpv/svtr_base_lpv_wo_glrm.yml +123 -0
- configs/rec/lpv/svtrv2_lpv.yml +147 -0
- configs/rec/lpv/svtrv2_lpv_wo_glrm.yml +146 -0
- configs/rec/maerec/vit_nrtr.yml +116 -0
- configs/rec/matrn/resnet45_trans_matrn.yml +95 -0
- configs/rec/matrn/svtrv2_matrn.yml +130 -0
- configs/rec/mgpstr/svtrv2_mgpstr_only_char.yml +140 -0
- configs/rec/mgpstr/vit_base_mgpstr_only_char.yml +111 -0
- configs/rec/mgpstr/vit_large_mgpstr_only_char.yml +110 -0
- configs/rec/mgpstr/vit_mgpstr.yml +110 -0
- configs/rec/mgpstr/vit_mgpstr_only_char.yml +110 -0
- configs/rec/moran/resnet31_lstm_moran.yml +92 -0
- configs/rec/nrtr/focalsvtr_nrtr_maxraio12.yml +145 -0
- configs/rec/nrtr/nrtr.yml +107 -0
app.py
ADDED
@@ -0,0 +1,127 @@
import os

import gradio as gr  # gradio==4.20.0

os.environ['FLAGS_allocator_strategy'] = 'auto_growth'
import cv2
import numpy as np
import json
import time
from PIL import Image
from tools.infer_e2e import OpenOCR, check_and_download_font, draw_ocr_box_txt

drop_score = 0.01
text_sys = OpenOCR(drop_score=drop_score)
# warm up 5 times
if True:
    img = np.random.uniform(0, 255, [640, 640, 3]).astype(np.uint8)
    for i in range(5):
        res = text_sys(img_numpy=img)
font_path = './simfang.ttf'
check_and_download_font(font_path)


def main(input_image):
    img = input_image[:, :, ::-1]
    starttime = time.time()
    results, time_dict, mask = text_sys(img_numpy=img, return_mask=True)
    elapse = time.time() - starttime
    save_pred = json.dumps(results[0], ensure_ascii=False)
    image = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    boxes = [res['points'] for res in results[0]]
    txts = [res['transcription'] for res in results[0]]
    scores = [res['score'] for res in results[0]]
    draw_img = draw_ocr_box_txt(
        image,
        boxes,
        txts,
        scores,
        drop_score=drop_score,
        font_path=font_path,
    )
    mask = mask[0, 0, :, :] > 0.3
    return save_pred, elapse, draw_img, mask.astype('uint8') * 255


def get_all_file_names_including_subdirs(dir_path):
    all_file_names = []

    for root, dirs, files in os.walk(dir_path):
        for file_name in files:
            all_file_names.append(os.path.join(root, file_name))

    file_names_only = [os.path.basename(file) for file in all_file_names]
    return file_names_only


def list_image_paths(directory):
    image_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff')

    image_paths = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.lower().endswith(image_extensions):
                relative_path = os.path.relpath(os.path.join(root, file),
                                                directory)
                full_path = os.path.join(directory, relative_path)
                image_paths.append(full_path)
    image_paths = sorted(image_paths)
    return image_paths


def find_file_in_current_dir_and_subdirs(file_name):
    for root, dirs, files in os.walk('.'):
        if file_name in files:
            relative_path = os.path.join(root, file_name)
            return relative_path


def predict1(input_image, Model_type, OCR_type):
    if OCR_type == 'E2E':
        return 11111, 'E2E', input_image
    elif OCR_type == 'STR':
        return 11111, 'STR', input_image
    else:
        return 11111, 'STD', input_image


e2e_img_example = list_image_paths('./OCR_e2e_img')

if __name__ == '__main__':
    css = '.image-container img { width: 100%; max-height: 320px;}'

    with gr.Blocks(css=css) as demo:
        gr.HTML("""
            <h1 style='text-align: center;'>OpenOCR</h1>""")
        with gr.Row():
            with gr.Column(scale=1):
                input_image = gr.Image(label='Input image',
                                       elem_classes=['image-container'])

                examples = gr.Examples(examples=e2e_img_example,
                                       inputs=input_image,
                                       label='Examples')
                downstream = gr.Button('Run')

            with gr.Column(scale=1):
                img_mask = gr.Image(label='mask',
                                    interactive=False,
                                    elem_classes=['image-container'])
                img_output = gr.Image(label=' ',
                                      interactive=False,
                                      elem_classes=['image-container'])

                output = gr.Textbox(label='Result')
                confidence = gr.Textbox(label='Latency')

        downstream.click(fn=main,
                         inputs=[
                             input_image,
                         ],
                         outputs=[
                             output,
                             confidence,
                             img_output,
                             img_mask,
                         ])

    demo.launch(share=True)
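The script above wires the OpenOCR end-to-end pipeline into a Gradio UI. As a minimal sketch, the same pipeline can be driven without the UI using only the calls that app.py itself makes (OpenOCR, check_and_download_font, and text_sys(img_numpy=..., return_mask=True)); the input path 'example.jpg' below is a placeholder, not a file shipped with the Space.

# Minimal sketch (assumptions: an image at ./example.jpg and the repository
# layout used above); reuses only calls that appear in app.py.
import json

import cv2

from tools.infer_e2e import OpenOCR, check_and_download_font

text_sys = OpenOCR(drop_score=0.01)
check_and_download_font('./simfang.ttf')

img = cv2.imread('example.jpg')  # BGR array, matching what main() feeds the pipeline
results, time_dict, mask = text_sys(img_numpy=img, return_mask=True)
print(json.dumps(results[0], ensure_ascii=False))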
configs/det/dbnet/repvit_db.yml
ADDED
@@ -0,0 +1,173 @@
Global:
  device: gpu
  epoch_num: &epoch_num 500
  log_smooth_window: 20
  print_batch_step: 100
  save_model_dir: ./output/det_repsvtr_db
  save_epoch_step: 10
  eval_batch_step:
    - 0
    - 1000
  cal_metric_during_train: false
  checkpoints:
  pretrained_model: openocr_det_repvit_ch.pth
  save_inference_dir: null
  use_visualdl: false
  infer_img: ./testA
  save_res_path: ./checkpoints/det_db/predicts_db.txt
  distributed: true
  model_type: det

Architecture:
  algorithm: DB
  Backbone:
    name: RepSVTR_det
  Neck:
    name: RSEFPN
    out_channels: 96
    shortcut: True
  Head:
    name: DBHead
    k: 50

# Loss:
#   name: DBLoss
#   balance_loss: true
#   main_loss_type: DiceLoss
#   alpha: 5
#   beta: 10
#   ohem_ratio: 3

# Optimizer:
#   name: Adam
#   beta1: 0.9
#   beta2: 0.999
#   lr:
#     name: Cosine
#     learning_rate: 0.001 #(8*8c)
#     warmup_epoch: 2
#   regularizer:
#     name: L2
#     factor: 5.0e-05

PostProcess:
  name: DBPostProcess
  thresh: 0.3
  box_thresh: 0.4
  max_candidates: 1000
  unclip_ratio: 1.5
  score_mode: 'slow'

# Metric:
#   name: DetMetric
#   main_indicator: hmean

# Train:
#   dataset:
#     name: SimpleDataSet
#     data_dir: ./train_data/icdar2015/text_localization/
#     label_file_list:
#       - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt
#     ratio_list: [1.0]
#     transforms:
#       - DecodeImage:
#           img_mode: BGR
#           channel_first: false
#       - DetLabelEncode: null
#       - CopyPaste: null
#       - IaaAugment:
#           augmenter_args:
#             - type: Fliplr
#               args:
#                 p: 0.5
#             - type: Affine
#               args:
#                 rotate:
#                   - -10
#                   - 10
#             - type: Resize
#               args:
#                 size:
#                   - 0.5
#                   - 3
#       - EastRandomCropData:
#           size:
#             - 640
#             - 640
#           max_tries: 50
#           keep_ratio: true
#       - MakeBorderMap:
#           shrink_ratio: 0.4
#           thresh_min: 0.3
#           thresh_max: 0.7
#           total_epoch: *epoch_num
#       - MakeShrinkMap:
#           shrink_ratio: 0.4
#           min_text_size: 8
#           total_epoch: *epoch_num
#       - NormalizeImage:
#           scale: 1./255.
#           mean:
#             - 0.485
#             - 0.456
#             - 0.406
#           std:
#             - 0.229
#             - 0.224
#             - 0.225
#           order: hwc
#       - ToCHWImage: null
#       - KeepKeys:
#           keep_keys:
#             - image
#             - threshold_map
#             - threshold_mask
#             - shrink_map
#             - shrink_mask
#   loader:
#     shuffle: true
#     drop_last: false
#     batch_size_per_card: 8
#     num_workers: 8

Eval:
  dataset:
    name: SimpleDataSet
    data_dir: ./train_data/icdar2015/text_localization/
    label_file_list:
      - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt
    transforms:
      - DecodeImage:
          img_mode: BGR
          channel_first: false
      - DetLabelEncode: null
      - DetResizeForTest:
          # image_shape: [1280, 1280]
          # keep_ratio: True
          # padding: True
          limit_side_len: 960
          limit_type: max
      - NormalizeImage:
          scale: 1./255.
          mean:
            - 0.485
            - 0.456
            - 0.406
          std:
            - 0.229
            - 0.224
            - 0.225
          order: hwc
      - ToCHWImage: null
      - KeepKeys:
          keep_keys:
            - image
            - shape
            - polys
            - ignore_tags
  loader:
    shuffle: false
    drop_last: false
    batch_size_per_card: 1
    num_workers: 2
profiler_options: null
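Since these configs are plain YAML, a quick way to sanity-check one is to load it with PyYAML and read sections directly. A small sketch, assuming the file path shown in this diff and that PyYAML is installed:

# Sketch: inspect the DBPostProcess thresholds defined in the config above.
import yaml

with open('configs/det/dbnet/repvit_db.yml') as f:
    cfg = yaml.safe_load(f)

post = cfg['PostProcess']
print(post['name'], post['thresh'], post['box_thresh'], post['unclip_ratio'])
# -> DBPostProcess 0.3 0.4 1.5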
configs/rec/abinet/resnet45_trans_abinet_lang.yml
ADDED
@@ -0,0 +1,94 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/resnet45_trans_abinet_lang/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  # ./openocr_nolang_abinet_lang.pth
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_resnet45_trans_abinet_lang.txt
  grad_clip_val: 20
  use_amp: True

Optimizer:
  name: Adam
  lr: 0.000267
  weight_decay: 0.0
  filter_bias_and_bn: False

LRScheduler:
  name: MultiStepLR
  milestones: [12]
  gamma: 0.1

Architecture:
  model_type: rec
  algorithm: ABINet
  Transform:
  Encoder:
    name: ResNet45
    in_channels: 3
    strides: [2, 1, 2, 1, 1]
  Decoder:
    name: ABINetDecoder
    iter_size: 3

Loss:
  name: ABINetLoss

PostProcess:
  name: ABINetLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ABINetLabelEncode:
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ABINetLabelEncode:
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
configs/rec/abinet/resnet45_trans_abinet_wo_lang.yml
ADDED
@@ -0,0 +1,93 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/resnet45_trans_abinet_wo_lang/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_resnet45_trans_abinet_wo_lang.txt
  grad_clip_val: 20
  use_amp: True

Optimizer:
  name: Adam
  lr: 0.000267
  weight_decay: 0.0
  filter_bias_and_bn: False

LRScheduler:
  name: MultiStepLR
  milestones: [12]
  gamma: 0.1

Architecture:
  model_type: rec
  algorithm: ABINet
  Transform:
  Encoder:
    name: ResNet45
    in_channels: 3
    strides: [2, 1, 2, 1, 1]
  Decoder:
    name: ABINetDecoder
    iter_size: 0

Loss:
  name: ABINetLoss

PostProcess:
  name: ABINetLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ABINetLabelEncode:
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ABINetLabelEncode:
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
configs/rec/abinet/svtrv2_abinet_lang.yml
ADDED
@@ -0,0 +1,130 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtrv2_abinet_lang/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  # ./openocr_svtrv2_nolang_abinet_lang.pth
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_abinet_lang.txt
  use_amp: True
  grad_clip_val: 20

Optimizer:
  name: AdamW
  lr: 0.00065 # for 4gpus bs256/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: ABINet
  Transform:
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: True
  Decoder:
    name: ABINetDecoder
    iter_size: 3
    num_layers: 0

Loss:
  name: ABINetLoss

PostProcess:
  name: ABINetLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: false
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_train_easy',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ABINetLabelEncode:
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: &bs 256
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 4
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: False
    data_dir_list: [
                    '../evaluation/CUTE80',
                    '../evaluation/IC13_857',
                    '../evaluation/IC15_1811',
                    '../evaluation/IIIT5k',
                    '../evaluation/SVT',
                    '../evaluation/SVTP',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ABINetLabelEncode:
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: *bs
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: *bs
    max_ratio: *max_ratio
    num_workers: 4
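The SVTRv2-based recognition configs above rely on YAML anchors and aliases (first_bs: &bs 256 with batch_size_per_card: *bs, and max_ratio: &max_ratio 4 with *max_ratio) to keep the sampler and loader batch settings in sync. A short sketch of how a standard YAML loader resolves them, assuming the file path shown in this diff:

# Sketch: anchors and aliases resolve to the same value on load, which is what
# keeps the sampler and loader settings of these configs consistent.
import yaml

with open('configs/rec/abinet/svtrv2_abinet_lang.yml') as f:
    cfg = yaml.safe_load(f)

train = cfg['Train']
assert train['sampler']['first_bs'] == train['loader']['batch_size_per_card'] == 256
assert train['loader']['max_ratio'] == cfg['Eval']['loader']['max_ratio'] == 4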
configs/rec/abinet/svtrv2_abinet_wo_lang.yml
ADDED
@@ -0,0 +1,128 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtrv2_abinet_wo_lang/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_abinet_wo_lang.txt
  use_amp: True
  grad_clip_val: 20

Optimizer:
  name: AdamW
  lr: 0.00065 # for 4gpus bs256/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: ABINet
  Transform:
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: True
  Decoder:
    name: ABINetDecoder
    iter_size: 0
    num_layers: 0
Loss:
  name: ABINetLoss

PostProcess:
  name: ABINetLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: false
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_train_easy',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ABINetLabelEncode:
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: &bs 256
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 4
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: False
    data_dir_list: [
                    '../evaluation/CUTE80',
                    '../evaluation/IC13_857',
                    '../evaluation/IC15_1811',
                    '../evaluation/IIIT5k',
                    '../evaluation/SVT',
                    '../evaluation/SVTP',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ABINetLabelEncode:
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: *bs
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: *bs
    max_ratio: *max_ratio
    num_workers: 4
configs/rec/aster/resnet31_lstm_aster_tps_on.yml
ADDED
@@ -0,0 +1,93 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/resnet31_lstm_aster_tps_on
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: 25
  use_space_char: False
  save_res_path: ./output/rec/predicts_aster_tps.txt
  use_amp: True
  grad_clip_val: 1.0

Optimizer:
  name: Adam
  lr: 0.002 # for 1gpus bs1024/gpu
  weight_decay: 0.0
  filter_bias_and_bn: False

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: aster
  Transform:
    name: Aster_TPS
    tps_inputsize: [32, 64]
    tps_outputsize: [32, 128]
  Encoder:
    name: ResNet_ASTER
  Decoder:
    name: ASTERDecoder

Loss:
  name: ARLoss

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

PostProcess:
  name: ARLabelDecode

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ARLabelEncode: # Class handling label
      - RecTVResize:
          image_shape: [64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 1024
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ARLabelEncode: # Class handling label
      - RecTVResize:
          image_shape: [64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
configs/rec/aster/svtrv2_aster.yml
ADDED
@@ -0,0 +1,127 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtrv2_aster
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_aster.txt
  use_amp: True

Optimizer:
  name: AdamW
  lr: 0.00065 # for 4gpus bs256/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: aster
  Transform:
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    out_channels: 256
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: False
  Decoder:
    name: ASTERDecoder

Loss:
  name: ARLoss

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

PostProcess:
  name: ARLabelDecode

Train:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: false
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_train_easy',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ARLabelEncode: # Class handling label
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: &bs 256
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 4
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: False
    data_dir_list: [
                    '../evaluation/CUTE80',
                    '../evaluation/IC13_857',
                    '../evaluation/IC15_1811',
                    '../evaluation/IIIT5k',
                    '../evaluation/SVT',
                    '../evaluation/SVTP',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ARLabelEncode: # Class handling label
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: *bs
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: *bs
    max_ratio: *max_ratio
    num_workers: 4
configs/rec/aster/svtrv2_aster_tps_on.yml
ADDED
@@ -0,0 +1,102 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtrv2_aster_tps_on
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_aster_tps_on.txt
  use_amp: True

Optimizer:
  name: AdamW
  lr: 0.00065 # for 4gpus bs256/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: aster
  Transform:
    name: Aster_TPS
    tps_inputsize: [32, 64]
    tps_outputsize: [32, 128]
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    out_channels: 256
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: False
  Decoder:
    name: ASTERDecoder

Loss:
  name: ARLoss

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

PostProcess:
  name: ARLabelDecode

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ARLabelEncode: # Class handling label
      - RecTVResize:
          image_shape: [64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ARLabelEncode: # Class handling label
      - RecTVResize:
          image_shape: [64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
configs/rec/autostr/autostr_lstm_aster_tps_on.yml
ADDED
@@ -0,0 +1,95 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/autostr_lstm_aster_tps_on
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_autostr_lstm_aster_tps_on.txt
  use_amp: True
  grad_clip_val: 1.0

Optimizer:
  name: Adam
  lr: 0.002 # for 4gpus bs256/gpu
  weight_decay: 0.0
  filter_bias_and_bn: False

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: autostr
  Transform:
    name: Aster_TPS
    tps_inputsize: [32, 64]
    tps_outputsize: [32, 128]
  Encoder:
    name: AutoSTREncoder
    stride_stages: '[(2, 2), (2, 1), (2, 2), (2, 1), (2, 1)]'
    conv_op_ids: [2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 4, 1, 1, 6, 6]
  Decoder:
    name: ASTERDecoder

Loss:
  name: ARLoss

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

PostProcess:
  name: ARLabelDecode

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ARLabelEncode: # Class handling label
      - RecTVResize:
          image_shape: [64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ARLabelEncode: # Class handling label
      - RecTVResize:
          image_shape: [64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
configs/rec/busnet/svtrv2_busnet.yml
ADDED
@@ -0,0 +1,135 @@
Global:
  device: gpu
  epoch_num: 10
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtrv2_busnet/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  # ./output/rec/u14m_filter/svtrv2_busnet_pretraining/best.pth
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_busnet.txt
  use_amp: True

Optimizer:
  name: AdamW
  lr: 0.00065 # 4gpus bs256/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: BUSBet
  Transform:
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: False
  Decoder:
    name: BUSDecoder
    nhead: 6
    num_layers: 6
    dim_feedforward: 1536
    ignore_index: &ignore_index 100
    pretraining: False
    # return_id: 2
Loss:
  name: ABINetLoss
  ignore_index: *ignore_index

PostProcess:
  name: ABINetLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: false
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_train_easy',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ABINetLabelEncode:
          ignore_index: *ignore_index
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: &bs 256
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 4
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: False
    data_dir_list: [
                    '../evaluation/CUTE80',
                    '../evaluation/IC13_857',
                    '../evaluation/IC15_1811',
                    '../evaluation/IIIT5k',
                    '../evaluation/SVT',
                    '../evaluation/SVTP',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ABINetLabelEncode:
          ignore_index: *ignore_index
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: *bs
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: *bs
    max_ratio: *max_ratio
    num_workers: 4
configs/rec/busnet/svtrv2_busnet_pretraining.yml
ADDED
@@ -0,0 +1,134 @@
Global:
  device: gpu
  epoch_num: 10
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtrv2_busnet_pretraining/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_busnet_pretraining.txt
  use_amp: True

Optimizer:
  name: AdamW
  lr: 0.00065 # 4gpus bs256/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: BUSBet
  Transform:
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: False
  Decoder:
    name: BUSDecoder
    nhead: 6
    num_layers: 6
    dim_feedforward: 1536
    ignore_index: &ignore_index 100
    pretraining: True
    # return_id: 0
Loss:
  name: ABINetLoss
  ignore_index: *ignore_index

PostProcess:
  name: ABINetLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: false
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_train_easy',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ABINetLabelEncode:
          ignore_index: *ignore_index
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: &bs 256
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 4
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: False
    data_dir_list: [
                    '../evaluation/CUTE80',
                    '../evaluation/IC13_857',
                    '../evaluation/IC15_1811',
                    '../evaluation/IIIT5k',
                    '../evaluation/SVT',
                    '../evaluation/SVTP',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ABINetLabelEncode:
          ignore_index: *ignore_index
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: *bs
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: *bs
    max_ratio: *max_ratio
    num_workers: 4
configs/rec/busnet/vit_busnet.yml
ADDED
@@ -0,0 +1,104 @@
Global:
  device: gpu
  epoch_num: 10
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/vit_busnet/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_vit_busnet.txt
  grad_clip_val: 20
  use_amp: True

Optimizer:
  name: Adam
  lr: 0.00053 # 4gpus bs256/gpu
  weight_decay: 0.0
  filter_bias_and_bn: False

LRScheduler:
  name: MultiStepLR
  milestones: [6]
  gamma: 0.1

Architecture:
  model_type: rec
  algorithm: BUSBet
  Transform:
  Encoder:
    name: ViT
    img_size: [32,128]
    patch_size: [4, 8]
    embed_dim: 384
    depth: 12
    num_heads: 6
    mlp_ratio: 4
    qkv_bias: True
  Decoder:
    name: BUSDecoder
    nhead: 6
    num_layers: 6
    dim_feedforward: 1536
    ignore_index: &ignore_index 100
    pretraining: False
Loss:
  name: ABINetLoss
  ignore_index: *ignore_index

PostProcess:
  name: ABINetLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ABINetLabelEncode:
          ignore_index: *ignore_index
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ABINetLabelEncode:
          ignore_index: *ignore_index
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
configs/rec/busnet/vit_busnet_pretraining.yml
ADDED
@@ -0,0 +1,104 @@
Global:
  device: gpu
  epoch_num: 10
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/vit_busnet_pretraining/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_vit_busnet_pretraining.txt
  grad_clip_val: 20
  use_amp: True

Optimizer:
  name: Adam
  lr: 0.00053 # 4gpus bs256/gpu
  weight_decay: 0.0
  filter_bias_and_bn: False

LRScheduler:
  name: MultiStepLR
  milestones: [6]
  gamma: 0.1

Architecture:
  model_type: rec
  algorithm: BUSBet
  Transform:
  Encoder:
    name: ViT
    img_size: [32,128]
    patch_size: [4, 8]
    embed_dim: 384
    depth: 12
    num_heads: 6
    mlp_ratio: 4
    qkv_bias: True
  Decoder:
    name: BUSDecoder
    nhead: 6
    num_layers: 6
    dim_feedforward: 1536
    ignore_index: &ignore_index 100
    pretraining: True
Loss:
  name: ABINetLoss
  ignore_index: *ignore_index

PostProcess:
  name: ABINetLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ABINetLabelEncode:
          ignore_index: *ignore_index
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ABINetLabelEncode:
          ignore_index: *ignore_index
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
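The lr comments throughout these configs (e.g. "lr: 0.00053 # 4gpus bs256/gpu") record the global batch size the value was tuned for. Below is a hedged sketch of the linear scaling rule one might apply when training with a different number of GPUs; the rule itself is an assumption on my part, the configs do not rescale anything automatically.

def scale_lr(base_lr: float, base_global_bs: int, new_global_bs: int) -> float:
    # linear scaling rule (an assumption, not something these configs enforce)
    return base_lr * new_global_bs / base_global_bs

# lr: 0.00053 was noted for 4 GPUs x 256 images/GPU = 1024 images per step
print(scale_lr(0.00053, 4 * 256, 2 * 256))  # e.g. 2 GPUs -> 0.000265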
configs/rec/cam/convnextv2_cam_tps_on.yml
ADDED
@@ -0,0 +1,118 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/convnextv2_cam_tps_on
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: False
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: &max_text_length 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_convnextv2_cam_tps_on.txt
  use_amp: True

Optimizer:
  name: AdamW
  lr: 0.0008 # for 4gpus bs256/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True
  eps: 1.e-8

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 : 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: CAM
  Transform:
    name: Aster_TPS
    tps_inputsize: [32, 64]
    tps_outputsize: &img_shape [32, 128]
  Encoder:
    name: CAMEncoder
    encoder_config:
      name: ConvNeXtV2
      depths: [2, 2, 8, 2]
      dims: [80, 160, 320, 640]
      strides: [[4,4], [2,1], [2,1], [1,1]]
      drop_path_rate: 0.2
      feat2d: True
    nb_classes: 97
    strides: [[4,4], [2,1], [2,1], [1,1]]
    deform_stride: 2
    stage_idx: 2
    use_depthwise_unet: True
    use_more_unet: False
    binary_loss_type: BanlanceMultiClassCrossEntropyLoss
    mid_size: True
    d_embedding: 384
  Decoder:
    name: CAMDecoder
    num_encoder_layers: -1
    beam_size: 0
    num_decoder_layers: 2
    nhead: 8
    max_len: *max_text_length

Loss:
  name: CAMLoss
  loss_weight_binary: 1.5
  label_smoothing: 0.

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

PostProcess:
  name: ARLabelDecode

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - CAMLabelEncode: # Class handling label
          font_path: ./arial.ttf
          image_shape: *img_shape
      - RecTVResize:
          image_shape: [64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length', 'binary_mask'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ARLabelEncode: # Class handling label
      - RecTVResize:
          image_shape: [64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
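The LRScheduler comment above ("warmup_epoch: 1.5 # pct_start 0.075*20 : 1.5ep") maps warmup epochs onto OneCycleLR's pct_start as warmup_epoch / epoch_num. A short sketch with torch.optim.lr_scheduler.OneCycleLR; how OpenOCR's trainer wires this internally is assumed, not shown.

import torch

model = torch.nn.Linear(10, 10)
opt = torch.optim.AdamW(model.parameters(), lr=0.0008, weight_decay=0.05)

epochs, steps_per_epoch, warmup_epoch = 20, 1000, 1.5
sched = torch.optim.lr_scheduler.OneCycleLR(
    opt,
    max_lr=0.0008,
    total_steps=epochs * steps_per_epoch,
    pct_start=warmup_epoch / epochs,   # 1.5 / 20 = 0.075
    cycle_momentum=False,              # matches cycle_momentum: False
)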
configs/rec/cam/convnextv2_tiny_cam_tps_on.yml
ADDED
@@ -0,0 +1,118 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/convnextv2_tiny_cam_tps_on
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: False
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: &max_text_length 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_convnextv2_cam_tps_on.txt
  use_amp: True

Optimizer:
  name: AdamW
  lr: 0.0008 # for 4gpus bs256/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True
  eps: 1.e-8

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 : 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: CAM
  Transform:
    name: Aster_TPS
    tps_inputsize: [32, 64]
    tps_outputsize: &img_shape [32, 128]
  Encoder:
    name: CAMEncoder
    encoder_config:
      name: ConvNeXtV2
      depths: [3, 3, 9, 3]
      dims: [96, 192, 384, 768]
      strides: [[4,4], [2,1], [2,1], [1,1]]
      drop_path_rate: 0.2
      feat2d: True
    nb_classes: 97
    strides: [[4,4], [2,1], [2,1], [1,1]]
    deform_stride: 2
    stage_idx: 2
    use_depthwise_unet: True
    use_more_unet: False
    binary_loss_type: BanlanceMultiClassCrossEntropyLoss
    mid_size: False
    d_embedding: 512
  Decoder:
    name: CAMDecoder
    num_encoder_layers: -1
    beam_size: 0
    num_decoder_layers: 2
    nhead: 8
    max_len: *max_text_length

Loss:
  name: CAMLoss
  loss_weight_binary: 1.5
  label_smoothing: 0.

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

PostProcess:
  name: ARLabelDecode

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - CAMLabelEncode: # Class handling label
          font_path: ./arial.ttf
          image_shape: *img_shape
      - RecTVResize:
          image_shape: [64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length', 'binary_mask'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ARLabelEncode: # Class handling label
      - RecTVResize:
          image_shape: [64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
configs/rec/cam/svtrv2_cam_tps_on.yml
ADDED
@@ -0,0 +1,123 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtrv2_cam_tps_on
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: False
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: &max_text_length 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_cam_tps_on.txt
  use_amp: True

Optimizer:
  name: AdamW
  lr: 0.00065 # for 4gpus bs256/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 : 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: CAM
  Transform:
    name: Aster_TPS
    tps_inputsize: [32, 64]
    tps_outputsize: &img_shape [32, 128]
  Encoder:
    name: CAMEncoder
    encoder_config:
      name: SVTRv2LNConvTwo33
      use_pos_embed: False
      dims: [128, 256, 384]
      depths: [6, 6, 6]
      num_heads: [4, 8, 12]
      mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
      local_k: [[5, 5], [5, 5], [-1, -1]]
      sub_k: [[1, 1], [2, 1], [-1, -1]]
      last_stage: false
      feat2d: True
    nb_classes: 97
    strides: [[4, 4], [1, 1], [2, 1], [1, 1]]
    k_size: [[2, 2], [1, 1], [2, 1], [1, 1]]
    q_size: [4, 32]
    deform_stride: 2
    stage_idx: 2
    use_depthwise_unet: True
    use_more_unet: False
    binary_loss_type: BanlanceMultiClassCrossEntropyLoss
    mid_size: True
    d_embedding: 384
  Decoder:
    name: CAMDecoder
    num_encoder_layers: -1
    beam_size: 0
    num_decoder_layers: 2
    nhead: 8
    max_len: *max_text_length

Loss:
  name: CAMLoss
  loss_weight_binary: 1.5
  label_smoothing: 0.

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

PostProcess:
  name: ARLabelDecode

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - CAMLabelEncode: # Class handling label
          font_path: ./arial.ttf
          image_shape: *img_shape
      - RecTVResize:
          image_shape: [64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length', 'binary_mask'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ARLabelEncode: # Class handling label
      - RecTVResize:
          image_shape: [64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
configs/rec/cdistnet/resnet45_trans_cdistnet.yml
ADDED
@@ -0,0 +1,93 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/resnet45_trans_cdistnet
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_resnet45_trans_cdistnet.txt
  use_amp: True
  grad_clip_val: 5

Optimizer:
  name: Adam
  lr: 0.002 # for 4gpus bs256/gpu
  weight_decay: 0.0
  filter_bias_and_bn: False

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: CDistNet
  Transform:
  Encoder:
    name: ResNet45
    in_channels: 3
    strides: [2, 1, 2, 1, 1]
  Decoder:
    name: CDistNetDecoder
    add_conv: True

Loss:
  name: ARLoss

PostProcess:
  name: ARLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ARLabelEncode: # Class handling label
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ARLabelEncode: # Class handling label
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
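use_amp: True and grad_clip_val: 5 in the config above correspond to mixed-precision training with gradient-norm clipping. A generic PyTorch sketch of that combination, assumed to approximate (not reproduce) the OpenOCR training step:

import torch

def train_step(model, images, labels, loss_fn, optimizer, scaler, grad_clip_val=5.0):
    optimizer.zero_grad(set_to_none=True)
    with torch.cuda.amp.autocast():                        # use_amp: True
        loss = loss_fn(model(images), labels)
    scaler.scale(loss).backward()
    scaler.unscale_(optimizer)                             # unscale before clipping
    torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip_val)  # grad_clip_val: 5
    scaler.step(optimizer)
    scaler.update()
    return loss.item()

# usage: scaler = torch.cuda.amp.GradScaler(); call train_step(...) once per batch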
configs/rec/cdistnet/svtrv2_cdistnet.yml
ADDED
@@ -0,0 +1,139 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtrv2_cdistnet/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
  # ./tools/utils/ppocr_keys_v1.txt # ch
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_cdistnet.txt
  use_amp: True

Optimizer:
  name: AdamW
  lr: 0.00065 #4gpus bs256/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: CDistNet
  in_channels: 3
  Transform:
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    out_channels: 256
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: True
  Decoder:
    name: CDistNetDecoder
    add_conv: False
    num_encoder_blocks: 0

Loss:
  name: ARLoss

PostProcess:
  name: ARLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: false
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_train_easy',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ARLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be divided by downsampling multiple
    first_bs: &bs 256
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 4
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: False
    data_dir_list: [
      '../evaluation/CUTE80',
      '../evaluation/IC13_857',
      '../evaluation/IC15_1811',
      '../evaluation/IIIT5k',
      '../evaluation/SVT',
      '../evaluation/SVTP',
      ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ARLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be divided by downsampling multiple
    first_bs: *bs
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: *bs
    max_ratio: *max_ratio
    num_workers: 4
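The RatioSampler block above buckets samples by aspect ratio, caps the ratio at max_ratio, and keeps the resized width divisible by divided_factor so the backbone's downsampling divides evenly. The sketch below only illustrates that sizing rule; the function name and exact rounding are my assumptions, not the RatioSampler implementation.

import math

def target_size(img_w, img_h, base_h=32, divided_factor_w=4, max_ratio=4):
    ratio = min(max(img_w / img_h, 1.0), max_ratio)        # clamp aspect ratio at max_ratio
    w = int(math.ceil(base_h * ratio / divided_factor_w) * divided_factor_w)
    return w, base_h                                        # width snapped to a multiple of 4

print(target_size(120, 40))   # -> (96, 32)
print(target_size(800, 40))   # -> (128, 32), capped at max_ratio 4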
configs/rec/cppd/svtr_base_cppd.yml
ADDED
@@ -0,0 +1,123 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtr_base_cppd/
  save_epoch_step: 1
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 500]
  eval_epoch_step: [0, 1]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path
  # ./tools/utils/EN_symbol_dict.txt # 96en
  # ./tools/utils/ppocr_keys_v1.txt # ch
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/u14m_filter/predicts_svtr_base_cppd.txt
  use_amp: True

Optimizer:
  name: AdamW
  lr: 0.00065 # for 4gpus bs256/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: CPPD
  in_channels: 3
  Transform:
  Encoder:
    name: SVTRNet
    img_size: [32, 128]
    out_char_num: 25
    out_channels: 256
    patch_merging: 'Conv'
    embed_dim: [128, 256, 384]
    depth: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
    local_mixer: [[5, 5], [5, 5], [5, 5]]
    last_stage: False
    prenorm: True
  Decoder:
    name: CPPDDecoder
    vis_seq: 64
    num_layer: 2
    pos_len: False
    rec_layer: 1


Loss:
  name: CPPDLoss
  ignore_index: 100
  smoothing: True
  pos_len: False
  sideloss_weight: 1.0

PostProcess:
  name: CPPDLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char

Metric:
  name: RecMetric
  main_indicator: acc

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - CPPDLabelEncode: # Class handling label
          pos_len: False
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation/
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - CPPDLabelEncode: # Class handling label
          pos_len: False
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 128
    num_workers: 4
configs/rec/cppd/svtr_base_cppd_ch.yml
ADDED
@@ -0,0 +1,126 @@
Global:
  device: gpu
  epoch_num: 100
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/ch/svtr_base_cppd/
  save_epoch_step: 1
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 2000]
  eval_epoch_step: [0, 1]
  cal_metric_during_train: False
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/ppocr_keys_v1.txt
  # ./tools/utils/EN_symbol_dict.txt # 96en
  # ./tools/utils/ppocr_keys_v1.txt # ch
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/ch/predicts_svtr_base_cppd.txt
  use_amp: True

Optimizer:
  name: AdamW
  lr: 0.0005 # for 4gpus bs128/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: CosineAnnealingLR
  warmup_epoch: 5

Architecture:
  model_type: rec
  algorithm: CPPD
  in_channels: 3
  Transform:
  Encoder:
    name: SVTRNet
    img_size: [32, 256]
    patch_merging: 'Conv'
    embed_dim: [128, 256, 384]
    depth: [6, 6, 4]
    num_heads: [4, 8, 12]
    mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
    local_mixer: [[5, 5], [5, 5], [5, 5]]
    last_stage: False
    prenorm: True
  Decoder:
    name: CPPDDecoder
    vis_seq: 128
    num_layer: 3
    pos_len: False
    rec_layer: 1
    ch: True


Loss:
  name: CPPDLoss
  ignore_index: 7000
  smoothing: True
  pos_len: False
  sideloss_weight: 1.0

PostProcess:
  name: CPPDLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char

Metric:
  name: RecMetric
  main_indicator: acc

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../benchmark_bctr/benchmark_bctr_train
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CPPDLabelEncode: # Class handling label
          pos_len: False
          ch: True
          ignore_index: 7000
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - SVTRResize:
          image_shape: [3, 32, 256]
          padding: True
      - KeepKeys:
          keep_keys: ['image', 'label', 'label_node', 'label_index', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 128
    drop_last: True
    num_workers: 8

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../benchmark_bctr/benchmark_bctr_test/scene_test
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CPPDLabelEncode: # Class handling label
          pos_len: False
          ch: True
          ignore_index: 7000
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - SVTRResize:
          image_shape: [3, 32, 256]
          padding: True
      - KeepKeys:
          keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 4
configs/rec/cppd/svtr_base_cppd_h8.yml
ADDED
@@ -0,0 +1,123 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtr_base_h8_cppd/
  save_epoch_step: 1
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 500]
  eval_epoch_step: [0, 1]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
  # ./tools/utils/ppocr_keys_v1.txt # ch
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/u14m_filter/predicts_svtr_base_cppd.txt
  use_amp: True

Optimizer:
  name: AdamW
  lr: 0.00065 # for 4gpus bs256/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: CPPD
  in_channels: 3
  Transform:
  Encoder:
    name: SVTRNet
    img_size: [32, 128]
    out_char_num: 25
    out_channels: 256
    patch_merging: 'Conv'
    embed_dim: [128, 256, 384]
    depth: [6, 6, 6]
    num_heads: [4, 8, 12]
    sub_k: [[1, 1], [2, 1]]
    mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
    local_mixer: [[5, 5], [5, 5], [5, 5]]
    last_stage: False
    prenorm: True
  Decoder:
    name: CPPDDecoder
    vis_seq: 128
    num_layer: 2
    pos_len: False
    rec_layer: 1

Loss:
  name: CPPDLoss
  ignore_index: 100
  smoothing: True
  pos_len: False
  sideloss_weight: 1.0

PostProcess:
  name: CPPDLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - CPPDLabelEncode: # Class handling label
          pos_len: False
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation/
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - CPPDLabelEncode: # Class handling label
          pos_len: False
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 128
    num_workers: 4
configs/rec/cppd/svtr_base_cppd_syn.yml
ADDED
@@ -0,0 +1,124 @@
Global:
  device: gpu
  epoch_num: 60
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/syn/svtr_base_cppd/
  save_epoch_step: 1
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 500]
  eval_epoch_step: [0, 1]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path
  # ./tools/utils/EN_symbol_dict.txt # 96en
  # ./tools/utils/ppocr_keys_v1.txt # ch
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/syn/predicts_svtr_base_cppd.txt
  use_amp: True

Optimizer:
  name: AdamW
  lr: 0.0005 # for 4gpus bs256/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: CosineAnnealingLR
  warmup_epoch: 6

Architecture:
  model_type: rec
  algorithm: CPPD
  in_channels: 3
  Transform:
  Encoder:
    name: SVTRNet
    img_size: [32, 100]
    out_char_num: 25
    out_channels: 256
    patch_merging: 'Conv'
    embed_dim: [128, 256, 384]
    depth: [6, 6, 4]
    num_heads: [4, 8, 12]
    mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
    local_mixer: [[5, 5], [5, 5], [5, 5]]
    last_stage: False
    prenorm: True
  Decoder:
    name: CPPDDecoder
    vis_seq: 50
    num_layer: 3
    pos_len: False
    rec_layer: 1


Loss:
  name: CPPDLoss
  ignore_index: 100
  smoothing: True
  pos_len: False
  sideloss_weight: 1.0

PostProcess:
  name: CPPDLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char

Metric:
  name: RecMetric
  main_indicator: acc

Train:
  dataset:
    name: STRLMDBDataSet
    data_dir: ./
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      # - SVTRRAug:
      - CPPDLabelEncode: # Class handling label
          pos_len: False
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - SVTRResize:
          image_shape: [3, 32, 100]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 8

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation/
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CPPDLabelEncode: # Class handling label
          pos_len: False
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - SVTRResize:
          image_shape: [3, 32, 100]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 4
configs/rec/cppd/svtrv2_cppd.yml
ADDED
@@ -0,0 +1,150 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtrv2_cppd/
  save_epoch_step: 1
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 500]
  eval_epoch_step: [0, 1]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
  # ./tools/utils/ppocr_keys_v1.txt # ch
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_cppd.txt
  use_amp: True

Optimizer:
  name: AdamW
  lr: 0.00065 # for 4gpus bs256/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: CPPD
  in_channels: 3
  Transform:
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    out_channels: 256
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: False
  Decoder:
    name: CPPDDecoder
    ds: True
    num_layer: 2
    pos_len: False
    rec_layer: 1


Loss:
  name: CPPDLoss
  ignore_index: 100
  smoothing: True
  pos_len: False
  sideloss_weight: 1.0

PostProcess:
  name: CPPDLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: false
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_train_easy',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - CPPDLabelEncode: # Class handling label
          pos_len: False
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'label_node', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be divided by downsampling multiple
    first_bs: &bs 256
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 4
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: False
    data_dir_list: [
      '../evaluation/CUTE80',
      '../evaluation/IC13_857',
      '../evaluation/IC15_1811',
      '../evaluation/IIIT5k',
      '../evaluation/SVT',
      '../evaluation/SVTP',
      ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - CPPDLabelEncode: # Class handling label
          pos_len: False
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be divided by downsampling multiple
    first_bs: *bs
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: *bs
    max_ratio: *max_ratio
    num_workers: 4
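character_dict_path points at a plain text charset (one symbol per line) that the *LabelEncode / *LabelDecode steps above turn into index tables. A small sketch of loading such a dict; the special-token handling inside CPPDLabelEncode is assumed, not reproduced.

def load_charset(dict_path: str):
    # one symbol per line; the position in this list is the label id used by the encoders
    with open(dict_path, 'r', encoding='utf-8') as f:
        chars = [line.rstrip('\r\n') for line in f]
    char_to_id = {c: i for i, c in enumerate(chars)}
    return chars, char_to_id

# chars, char_to_id = load_charset('./tools/utils/EN_symbol_dict.txt')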
configs/rec/dan/resnet45_fpn_dan.yml
ADDED
@@ -0,0 +1,98 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/resnet45_fpn_dan/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_resnet45_fpn_dan.txt
  use_amp: True
  grad_clip_val: 20

Optimizer:
  name: Adam
  lr: 0.00065 # for 4gpus bs256/gpu
  weight_decay: 0.0
  filter_bias_and_bn: False

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: DAN
  Transform:
  Encoder:
    name: ResNet45
    in_channels: 3
    strides: [2, 1, 2, 1, 1]
    return_list: True
  Decoder:
    name: DANDecoder
    max_len: 25
    channels_list: [64, 128, 256, 512]
    strides_list: [[2, 2], [1, 1], [1, 1]]
    in_shape: [8, 32]
    depth: 4

Loss:
  name: ARLoss

PostProcess:
  name: ARLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ARLabelEncode:
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ARLabelEncode:
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
configs/rec/dan/svtrv2_dan.yml
ADDED
@@ -0,0 +1,130 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtrv2_dan
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_dan.txt
  use_amp: True
  grad_clip_val: 20

Optimizer:
  name: AdamW
  lr: 0.00065 # 4gpus 256bs/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: DAN
  Transform:
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    out_channels: 256
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: True
  Decoder:
    name: DANDecoder
    use_cam: False
    max_len: 25

Loss:
  name: ARLoss

PostProcess:
  name: ARLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: false
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_filter_train_easy',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ARLabelEncode:
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be divided by downsampling multiple
    first_bs: &bs 256
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 4
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: False
    data_dir_list: [
      '../evaluation/CUTE80',
      '../evaluation/IC13_857',
      '../evaluation/IC15_1811',
      '../evaluation/IIIT5k',
      '../evaluation/SVT',
      '../evaluation/SVTP',
      ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ARLabelEncode:
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be divided by downsampling multiple
    first_bs: *bs
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: *bs
    max_ratio: *max_ratio
    num_workers: 4
configs/rec/focalsvtr/focalsvtr_ctc.yml
ADDED
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/focalsvtr_ctc/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path
  # ./tools/utils/EN_symbol_dict.txt
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/u14m_filter/predicts_focalsvtr_ctc.txt

Optimizer:
  name: AdamW
  lr: 0.00065 # for 4gpus bs256/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: SVTR
  Transform:
  Encoder:
    name: FocalSVTR
    img_size: [32, 128]
    depths: [6, 6, 6]
    embed_dim: 96
    sub_k: [[1, 1], [2, 1], [1, 1]]
    focal_levels: [3, 3, 3]
    out_channels: 256
    last_stage: True
  Decoder:
    name: CTCDecoder

Loss:
  name: CTCLoss
  zero_infinity: True

PostProcess:
  name: CTCLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSet
    ds_width: True
    padding: &padding False
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_train_easy',
                    ]
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - PARSeqAug:
      - CTCLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length']
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: &bs 256
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: 12
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSet
    ds_width: True
    padding: True
    data_dir_list: ['../evaluation/CUTE80',
                    '../evaluation/IC13_857',
                    '../evaluation/IC15_1811',
                    '../evaluation/IIIT5k',
                    '../evaluation/SVT',
                    '../evaluation/SVTP',
                    ]
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - CTCLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length']
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: 128
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 128
    max_ratio: 12
    num_workers: 4
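This config pairs the FocalSVTR encoder with a plain CTC head (CTCDecoder, CTCLoss, CTCLabelDecode). As a reminder of what the CTC post-process conceptually does, here is a minimal sketch of greedy CTC decoding; it shows the standard collapse-repeats-then-drop-blank rule only, not OpenOCR's actual CTCLabelDecode, and the `charset` is a made-up example alphabet.

```python
# Minimal sketch of greedy CTC decoding: collapse repeats, then drop the blank.
# Standard CTC rule only, not OpenOCR's CTCLabelDecode; `charset` is hypothetical.
import numpy as np

def ctc_greedy_decode(logits: np.ndarray, charset: str, blank: int = 0) -> str:
    """logits: (T, C) per-timestep class scores; index `blank` is the CTC blank."""
    best = logits.argmax(axis=1)                  # best class per timestep
    collapsed = [k for i, k in enumerate(best)    # remove consecutive duplicates
                 if i == 0 or k != best[i - 1]]
    return ''.join(charset[k - 1] for k in collapsed if k != blank)

if __name__ == '__main__':
    charset = 'abc'
    T, C = 6, len(charset) + 1                    # +1 for the blank at index 0
    logits = np.zeros((T, C))
    for t, idx in enumerate([1, 1, 0, 2, 2, 3]):  # "aa_bbc" collapses to "abc"
        logits[t, idx] = 1.0
    print(ctc_greedy_decode(logits, charset))     # -> "abc"
```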
configs/rec/gtc/svtrv2_lnconv_nrtr_gtc.yml
ADDED
@@ -0,0 +1,168 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/svtrv2_lnconv_nrtr_gtc
  save_epoch_step: 1
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 500]
  eval_epoch_step: [0, 1]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img: ../ltb/img
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
  # ./tools/utils/ppocr_keys_v1.txt # ch
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/predicts_smtr.txt
  use_amp: True
  distributed: true

Optimizer:
  name: AdamW
  lr: 0.00065
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: BGPD
  in_channels: 3
  Transform:
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    out_channels: 256
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: True
  Decoder:
    name: GTCDecoder
    infer_gtc: True
    detach: False
    gtc_decoder:
      name: NRTRDecoder
      num_encoder_layers: -1
      beam_size: 0
      num_decoder_layers: 2
      nhead: 12
      max_len: *max_text_length
    ctc_decoder:
      name: RCTCDecoder

Loss:
  name: GTCLoss
  gtc_loss:
    name: ARLoss

PostProcess:
  name: GTCLabelDecode
  gtc_label_decode:
    name: ARLabelDecode
    character_dict_path: *character_dict_path
    use_space_char: *use_space_char

Metric:
  name: RecGTCMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSet
    ds_width: True
    # max_ratio: &max_ratio 4
    # min_ratio: 1
    # base_shape: &base_shape [[64, 64], [96, 48], [112, 40], [128, 32]]
    # base_h: &base_h 32
    # padding: &padding False
    padding: false
    # padding_rand: true
    # padding_doub: true
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_train_easy',
                    ]
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - PARSeqAug:
      - GTCLabelEncode: # Class handling label
          gtc_label_encode:
            name: ARLabelEncode
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length', 'ctc_label', 'ctc_length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: &bs 256
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 4
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSet
    ds_width: True
    padding: False
    data_dir_list: [
        '../evaluation/CUTE80',
        '../evaluation/IC13_857',
        '../evaluation/IC15_1811',
        '../evaluation/IIIT5k',
        '../evaluation/SVT',
        '../evaluation/SVTP',
        ]
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - GTCLabelEncode: # Class handling label
          gtc_label_encode:
            name: ARLabelEncode
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length', 'ctc_label', 'ctc_length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: *bs
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: *bs
    max_ratio: *max_ratio
    num_workers: 4
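These configs lean heavily on YAML anchors and aliases (`&bs`/`*bs`, `&max_ratio`/`*max_ratio`, `&character_dict_path`/`*character_dict_path`) so that one value feeds both the Train and Eval sections. A small sketch of how the aliases resolve at load time, using a trimmed, hypothetical excerpt rather than the full config:

```python
# Sketch of how the `&anchor` / `*alias` pairs in these configs resolve.
# PyYAML expands aliases at load time, so Train and Eval see the same value.
# The snippet below is a trimmed, hypothetical excerpt, not the full config.
import yaml

snippet = """
Train:
  sampler:
    first_bs: &bs 256
  loader:
    batch_size_per_card: *bs
    max_ratio: &max_ratio 4
Eval:
  loader:
    batch_size_per_card: *bs
    max_ratio: *max_ratio
"""

cfg = yaml.safe_load(snippet)
assert cfg['Eval']['loader']['batch_size_per_card'] == 256
assert cfg['Eval']['loader']['max_ratio'] == 4
print(cfg['Eval']['loader'])
```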
configs/rec/gtc/svtrv2_lnconv_smtr_gtc_long_infer.yml
ADDED
@@ -0,0 +1,151 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/svtrv2_lnconv_smtr_gtc_long_infer
  save_epoch_step: 1
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 1000]
  eval_epoch_step: [0, 1]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img: ../ltb/img
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
  # ./tools/utils/ppocr_keys_v1.txt # ch
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/predicts_smtr.txt
  use_amp: True
  distributed: true

Optimizer:
  name: AdamW
  lr: 0.000325
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: BGPD
  in_channels: 3
  Transform:
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    out_channels: 256
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: True
  Decoder:
    name: GTCDecoder
    infer_gtc: False
    detach: False
    gtc_decoder:
      name: SMTRDecoder
      num_layer: 1
      ds: True
      max_len: *max_text_length
      next_mode: &next True
      sub_str_len: &subsl 5
    ctc_decoder:
      name: RCTCDecoder

Loss:
  name: CTCLoss

PostProcess:
  name: CTCLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: false
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_train_easy',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - CTCLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: &bs 128
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 12
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: False
    data_dir_list: [
        '../evaluation/CUTE80',
        '../evaluation/IC13_857',
        '../evaluation/IC15_1811',
        '../evaluation/IIIT5k',
        '../evaluation/SVT',
        '../evaluation/SVTP',
        ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - CTCLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: *bs
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: *bs
    max_ratio: *max_ratio
    num_workers: 4
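The `warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep` comments in these configs suggest the warmup is expressed as a fraction of the total epochs when the scheduler is built. A hedged sketch of wiring that into PyTorch's OneCycleLR, assuming `pct_start = warmup_epoch / epoch_num` as the comment implies; this mirrors the config values, not OpenOCR's actual trainer code, and `steps_per_epoch` is a placeholder.

```python
# Sketch: how `warmup_epoch` plausibly maps onto OneCycleLR's pct_start
# (pct_start = warmup_epoch / epoch_num, per the "0.075*20 = 1.5ep" comment).
# Illustrative only; not a copy of OpenOCR's trainer.
import torch

epoch_num, warmup_epoch = 20, 1.5
steps_per_epoch = 100                     # placeholder; depends on the dataset

model = torch.nn.Linear(8, 8)
opt = torch.optim.AdamW(model.parameters(), lr=0.000325, weight_decay=0.05)
sched = torch.optim.lr_scheduler.OneCycleLR(
    opt,
    max_lr=0.000325,
    total_steps=epoch_num * steps_per_epoch,
    pct_start=warmup_epoch / epoch_num,   # 0.075 -> 1.5 warmup epochs
    cycle_momentum=False,                 # as in the config
)
for _ in range(5):
    opt.step()
    sched.step()
print(sched.get_last_lr())
```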
configs/rec/gtc/svtrv2_lnconv_smtr_gtc_smtr_long.yml
ADDED
@@ -0,0 +1,150 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/svtrv2_lnconv_smtr_gtc_nodetach_smtr_long_infer
  save_epoch_step: 1
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 1000]
  eval_epoch_step: [0, 1]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
  # ./tools/utils/ppocr_keys_v1.txt # ch
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/predicts_smtr.txt
  use_amp: True
  distributed: true

Optimizer:
  name: AdamW
  lr: 0.000325
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False
Architecture:
  model_type: rec
  algorithm: BGPD
  in_channels: 3
  Transform:
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    out_channels: 256
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: True
  Decoder:
    name: GTCDecoder
    infer_gtc: True
    detach: False
    gtc_decoder:
      name: SMTRDecoder
      num_layer: 1
      ds: True
      max_len: *max_text_length
      next_mode: &next True
      sub_str_len: &subsl 5
      infer_aug: True
    ctc_decoder:
      name: RCTCDecoder

Loss:
  name: GTCLoss
  ctc_weight: 0.1
  gtc_loss:
    name: SMTRLoss

PostProcess:
  name: GTCLabelDecode
  gtc_label_decode:
    name: SMTRLabelDecode
    next_mode: *next
    character_dict_path: *character_dict_path
    use_space_char: *use_space_char
  only_gtc: True

Metric:
  name: RecGTCMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: false
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_train_easy',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - SMTRLabelEncode: # Class handling label
          sub_str_len: *subsl
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'label_subs', 'label_next', 'length_subs',
                      'label_subs_pre', 'label_next_pre', 'length_subs_pre', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: &bs 256
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 12
    num_workers: 4

Eval:
  dataset:
    name: SimpleDataSet
    data_dir: ../ltb/
    label_file_list: ['../ltb/ultra_long_70_list.txt']
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - GTCLabelEncode: # Class handling label
          gtc_label_encode:
            name: ARLabelEncode
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: 200
      - SliceResize:
          image_shape: [3, 32, 128]
          padding: False
          max_ratio: 12
      - KeepKeys:
          keep_keys: ['image', 'label', 'length', 'ctc_label', 'ctc_length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 1
    num_workers: 2
configs/rec/gtc/svtrv2_lnconv_smtr_gtc_stream.yml
ADDED
@@ -0,0 +1,152 @@
Global:
  device: gpu
  epoch_num: 60
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/svtrv2_lnconv_smtr_gtc_stream
  save_epoch_step: 1
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 500]
  eval_epoch_step: [0, 1]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
  # ./tools/utils/ppocr_keys_v1.txt # ch
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/predicts_smtr.txt
  use_amp: True
  distributed: true
  grad_clip_val: 20

Optimizer:
  name: AdamW
  lr: 0.00065
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: BGPD
  in_channels: 3
  Transform:
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    out_channels: 256
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: True
  Decoder:
    name: GTCDecoder
    infer_gtc: True
    detach: False
    gtc_decoder:
      name: SMTRDecoder
      num_layer: 1
      ds: True
      max_len: *max_text_length
      next_mode: &next True
      sub_str_len: &subsl 5
      infer_aug: False
    ctc_decoder:
      name: RCTCDecoder

Loss:
  name: GTCLoss
  ctc_weight: 0.25
  gtc_loss:
    name: SMTRLoss

PostProcess:
  name: GTCLabelDecode
  gtc_label_decode:
    name: SMTRLabelDecode
    next_mode: *next
    character_dict_path: *character_dict_path
    use_space_char: *use_space_char
  only_gtc: True

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True
  stream: True

Train:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: false
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_train_easy',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - SMTRLabelEncode: # Class handling label
          sub_str_len: *subsl
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'label_subs', 'label_next', 'length_subs',
                      'label_subs_pre', 'label_next_pre', 'length_subs_pre', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: &bs 256
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 12
    num_workers: 4

Eval:
  dataset:
    name: SimpleDataSet
    data_dir: ../ltb/
    label_file_list: ['../ltb/ultra_long_70_list.txt']
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - GTCLabelEncode: # Class handling label
          gtc_label_encode:
            name: ARLabelEncode
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - SliceTVResize:
          image_shape: [32, 128]
          padding: False
          max_ratio: 4
      - KeepKeys:
          keep_keys: ['image', 'label', 'length', 'ctc_label', 'ctc_length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 1
    num_workers: 2
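The two long-text evaluation pipelines above feed 70-character "ultra long" samples through `SliceResize`/`SliceTVResize` with a `max_ratio` cap, i.e. a very wide line is cut into ratio-limited crops instead of being squashed into a single 128-pixel-wide image. A rough illustration of that slicing idea, written as my own PIL sketch rather than the OpenOCR transform:

```python
# Rough illustration of ratio-capped slicing for very wide text lines:
# resize to a fixed height, then cut the width into windows of at most
# max_ratio * height pixels. My own sketch of the idea behind
# SliceResize/SliceTVResize, not the OpenOCR implementation.
from PIL import Image

def slice_wide_line(img: Image.Image, height: int = 32, max_ratio: int = 4):
    w, h = img.size
    new_w = max(1, round(w * height / h))
    img = img.resize((new_w, height))
    win = max_ratio * height                      # widest allowed crop
    return [img.crop((x, 0, min(x + win, new_w), height))
            for x in range(0, new_w, win)]

if __name__ == '__main__':
    wide = Image.new('RGB', (1000, 50))           # synthetic 20:1 text line
    pieces = slice_wide_line(wide, height=32, max_ratio=4)
    print([p.size for p in pieces])               # five 128x32 crops
```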
configs/rec/igtr/readme.md
ADDED
@@ -0,0 +1,189 @@
# IGTR

- [IGTR](#igtr)
  - [1. Introduction](#1-introduction)
  - [2. Environment](#2-environment)
    - [Dataset Preparation](#dataset-preparation)
  - [3. Model Training / Evaluation](#3-model-training--evaluation)
  - [Citation](#citation)

<a name="1"></a>

## 1. Introduction

Paper:

> [Instruction-Guided Scene Text Recognition](https://arxiv.org/abs/2401.17851)
> Yongkun Du, Zhineng Chen, Yuchen Su, Caiyan Jia, Yu-Gang Jiang

<a name="model"></a>
Multi-modal models have recently shown appealing performance in visual recognition tasks, as free-form text-guided training evokes the ability to understand fine-grained visual content. However, current models are either inefficient or cannot be trivially upgraded to scene text recognition (STR) due to the composition difference between natural and text images. We propose a novel instruction-guided scene text recognition (IGTR) paradigm that formulates STR as an instruction learning problem and understands text images by predicting character attributes, e.g., character frequency, position, etc. IGTR first devises $\left\langle condition, question, answer \right\rangle$ instruction triplets, providing rich and diverse descriptions of character attributes. To effectively learn these attributes through question-answering, IGTR develops a lightweight instruction encoder, a cross-modal feature fusion module and a multi-task answer head, which guides nuanced text image understanding. Furthermore, IGTR realizes different recognition pipelines simply by using different instructions, enabling a character-understanding-based text reasoning paradigm that considerably differs from current methods. Experiments on English and Chinese benchmarks show that IGTR outperforms existing models by significant margins, while maintaining a small model size and efficient inference speed. Moreover, by adjusting the sampling of instructions, IGTR offers an elegant way to tackle the recognition of both rarely appearing and morphologically similar characters, which were previous challenges.

<a name="model"></a>
The accuracy (%) and model files of IGTR on the public scene text recognition datasets are as follows:

- Trained on the synthetic datasets (MJ+ST) and tested on the common benchmarks; training and test datasets are both from [PARSeq](https://github.com/baudm/parseq).

| Model | IC13<br/>857 | SVT | IIIT5k<br/>3000 | IC15<br/>1811 | SVTP | CUTE80 | Avg | Config&Model&Log |
| :-----: | :----------: | :--: | :-------------: | :-----------: | :--: | :----: | :---: | :---: |
| IGTR-PD | 97.6 | 95.2 | 97.6 | 88.4 | 91.6 | 95.5 | 94.30 | [link](https://drive.google.com/drive/folders/1Pv0CW2hiWC_dIyaB74W1fsXqiX3z5yXA?usp=drive_link) |
| IGTR-AR | 98.6 | 95.7 | 98.2 | 88.4 | 92.4 | 95.5 | 94.78 | as above |

- Tested on the Union14M-L benchmark, from [Union14M](https://github.com/Mountchicken/Union14M/).

| Model | Curve | Multi-<br/>Oriented | Artistic | Contextless | Salient | Multi-<br/>word | General | Avg | Config&Model&Log |
| :-----: | :---: | :-----------------: | :------: | :---------: | :-----: | :-------------: | :-----: | :---: | :---: |
| IGTR-PD | 76.9 | 30.6 | 59.1 | 63.3 | 77.8 | 62.5 | 66.7 | 62.40 | Same as the above table |
| IGTR-AR | 78.4 | 31.9 | 61.3 | 66.5 | 80.2 | 69.3 | 67.9 | 65.07 | as above |

- Trained on the Union14M-L training dataset.

| Model | IC13<br/>857 | SVT | IIIT5k<br/>3000 | IC15<br/>1811 | SVTP | CUTE80 | Avg | Config&Model&Log |
| :----------: | :----------: | :--: | :-------------: | :-----------: | :--: | :----: | :---: | :---: |
| IGTR-PD | 97.7 | 97.7 | 98.3 | 89.8 | 93.7 | 97.9 | 95.86 | [link](https://drive.google.com/drive/folders/1ZGlzDqEzjrBg8qG2wBkbOm3bLRzFbTzo?usp=drive_link) |
| IGTR-AR | 98.1 | 98.4 | 98.7 | 90.5 | 94.9 | 98.3 | 96.48 | as above |
| IGTR-PD-60ep | 97.9 | 98.3 | 99.2 | 90.8 | 93.7 | 97.6 | 96.24 | [link](https://drive.google.com/drive/folders/1ik4hxZDRsjU1RbCA19nwE45Kg1bCnMoa?usp=drive_link) |
| IGTR-AR-60ep | 98.4 | 98.1 | 99.3 | 91.5 | 94.3 | 97.6 | 96.54 | as above |
| IGTR-PD-PT | 98.6 | 98.0 | 99.1 | 91.7 | 96.8 | 99.0 | 97.20 | [link](https://drive.google.com/drive/folders/1QM0EWV66IfYI1G0Xm066V2zJA62hH6-1?usp=drive_link) |
| IGTR-AR-PT | 98.8 | 98.3 | 99.2 | 92.0 | 96.8 | 99.0 | 97.34 | as above |

| Model | Curve | Multi-<br/>Oriented | Artistic | Contextless | Salient | Multi-<br/>word | General | Avg | Config&Model&Log |
| :----------: | :---: | :-----------------: | :------: | :---------: | :-----: | :-------------: | :-----: | :---: | :---: |
| IGTR-PD | 88.1 | 89.9 | 74.2 | 80.3 | 82.8 | 79.2 | 83.0 | 82.51 | Same as the above table |
| IGTR-AR | 90.4 | 91.2 | 77.0 | 82.4 | 84.7 | 84.0 | 84.4 | 84.86 | as above |
| IGTR-PD-60ep | 90.0 | 92.1 | 77.5 | 82.8 | 86.0 | 83.0 | 84.8 | 85.18 | Same as the above table |
| IGTR-AR-60ep | 91.0 | 93.0 | 78.7 | 84.6 | 87.3 | 84.8 | 85.6 | 86.43 | as above |
| IGTR-PD-PT | 92.4 | 92.1 | 80.7 | 83.6 | 87.7 | 86.9 | 85.0 | 86.92 | Same as the above table |
| IGTR-AR-PT | 93.0 | 92.9 | 81.3 | 83.4 | 88.6 | 88.7 | 85.6 | 87.65 | as above |

- Trained and tested on the Chinese dataset, from the [Chinese Benchmark](https://github.com/FudanVI/benchmarking-chinese-text-recognition).

| Model | Scene | Web | Document | Handwriting | Avg | Config&Model&Log |
| :---------: | :---: | :--: | :------: | :---------: | :---: | :---: |
| IGTR-PD | 73.1 | 74.8 | 98.6 | 52.5 | 74.75 | |
| IGTR-AR | 75.1 | 76.4 | 98.7 | 55.3 | 76.37 | |
| IGTR-PD-TS | 73.5 | 75.9 | 98.7 | 54.5 | 75.65 | [link](https://drive.google.com/drive/folders/1H3VRdGHjhawd6fkSC-qlBzVzvYYTpHRg?usp=drive_link) |
| IGTR-AR-TS | 75.6 | 77.0 | 98.8 | 57.3 | 77.17 | as above |
| IGTR-PD-Aug | 79.5 | 80.0 | 99.4 | 58.9 | 79.45 | [link](https://drive.google.com/drive/folders/1XFQkCILwcFwA7iYyQY9crnrouaI5sqcZ?usp=drive_link) |
| IGTR-AR-Aug | 82.0 | 81.7 | 99.5 | 63.8 | 81.74 | as above |

Download all Configs, Models, and Logs from [Google Drive](https://drive.google.com/drive/folders/1mSRDg9Mj5R6PspAdFGXZHDHTCQmjkd8d?usp=drive_link).

<a name="2"></a>

## 2. Environment

- [PyTorch](http://pytorch.org/) version >= 1.13.0
- Python version >= 3.7

```shell
git clone -b develop https://github.com/Topdu/OpenOCR.git
cd OpenOCR
# A100 Ubuntu 20.04 Cuda 11.8
conda create -n openocr python==3.8
conda activate openocr
conda install pytorch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 pytorch-cuda=11.8 -c pytorch -c nvidia
pip install -r requirements.txt
```

#### Dataset Preparation

[English dataset download](https://github.com/baudm/parseq)

[Union14M-L download](https://github.com/Mountchicken/Union14M)

[Chinese dataset download](https://github.com/fudanvi/benchmarking-chinese-text-recognition#download)

The expected filesystem structure is as follows:

```
benchmark_bctr
├── benchmark_bctr_test
│   ├── document_test
│   ├── handwriting_test
│   ├── scene_test
│   └── web_test
└── benchmark_bctr_train
    ├── document_train
    ├── handwriting_train
    ├── scene_train
    └── web_train
evaluation
├── CUTE80
├── IC13_857
├── IC15_1811
├── IIIT5k
├── SVT
└── SVTP
OpenOCR
synth
├── MJ
│   ├── test
│   ├── train
│   └── val
└── ST
test # from PARSeq
├── ArT
├── COCOv1.4
├── CUTE80
├── IC13_1015
├── IC13_1095
├── IC13_857
├── IC15_1811
├── IC15_2077
├── IIIT5k
├── SVT
├── SVTP
└── Uber
u14m # lmdb format
├── artistic
├── contextless
├── curve
├── general
├── multi_oriented
├── multi_words
└── salient
Union14M-LMDB-L # lmdb format
├── train_challenging
├── train_easy
├── train_hard
├── train_medium
└── train_normal
```

<a name="3"></a>

## 3. Model Training / Evaluation

Training:

```shell
# The configuration file is available from the link provided in the table above.
# Multi GPU training
CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 tools/train_rec.py --c PATH/svtr_base_igtr_XXX.yml
```

Evaluation:

```shell
# The configuration file is available from the link provided in the table above.
# en
python tools/eval_rec_all_ratio.py --c PATH/svtr_base_igtr_syn.yml
# ch
python tools/eval_rec_all_ch.py --c PATH/svtr_base_igtr_ch_aug.yml
```

## Citation

```bibtex
@article{Du2024IGTR,
  title        = {Instruction-Guided Scene Text Recognition},
  author       = {Du, Yongkun and Chen, Zhineng and Su, Yuchen and Jia, Caiyan and Jiang, Yu-Gang},
  journal      = {CoRR},
  eprinttype   = {arXiv},
  primaryClass = {cs.CV},
  volume       = {abs/2401.17851},
  year         = {2024},
  url          = {https://arxiv.org/abs/2401.17851}
}
```
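Before launching the training and evaluation commands above, it can help to confirm that the evaluation and Union14M-L directories actually match the layout shown. The following optional helper is my own sketch (not part of OpenOCR) and assumes the same relative paths used in the readme tree and the configs.

```python
# Optional sanity check (not part of OpenOCR): verify the dataset layout
# described above before launching training/evaluation.
import os

EXPECTED = {
    '../evaluation': ['CUTE80', 'IC13_857', 'IC15_1811', 'IIIT5k', 'SVT', 'SVTP'],
    '../Union14M-LMDB-L': ['train_challenging', 'train_easy', 'train_hard',
                           'train_medium', 'train_normal'],
}

def check_layout(expected=EXPECTED) -> bool:
    ok = True
    for root, subdirs in expected.items():
        for d in subdirs:
            path = os.path.join(root, d)
            if not os.path.isdir(path):
                print(f'missing: {path}')
                ok = False
    return ok

if __name__ == '__main__':
    print('layout ok' if check_layout() else 'layout incomplete')
```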
configs/rec/igtr/svtr_base_ds_igtr.yml
ADDED
@@ -0,0 +1,157 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtr_base_igtr
  save_epoch_step: 1
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 500]
  eval_epoch_step: [0, 1]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path
  # ./tools/utils/EN_symbol_dict.txt # 96en
  # ./tools/utils/ppocr_keys_v1.txt # ch
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/u14m_filter/predicts_svtr_base_igtr.txt
  use_amp: True

Optimizer:
  name: AdamW
  lr: 0.0005 # 2gpus 384bs/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: IGTR
  in_channels: 3
  Transform:
  Encoder:
    name: SVTRNet2DPos
    img_size: [32, -1]
    out_char_num: 25
    out_channels: 256
    patch_merging: 'Conv'
    embed_dim: [128, 256, 384]
    depth: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: ['ConvB','ConvB','ConvB','ConvB','ConvB','ConvB', 'ConvB','ConvB', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
    local_mixer: [[5, 5], [5, 5], [5, 5]]
    last_stage: False
    prenorm: True
    use_first_sub: False
  Decoder:
    name: IGTRDecoder
    dim: 384
    num_layer: 1
    ar: False
    refine_iter: 0
    # next_pred: True
    next_pred: False
    pos2d: True
    ds: True
    # pos_len: False
    # rec_layer: 1

Loss:
  name: IGTRLoss

PostProcess:
  name: IGTRLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char

Metric:
  name: RecMetric
  main_indicator: acc

Train:
  dataset:
    name: RatioDataSet
    ds_width: True
    padding: &padding False
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_train_easy',
                    ]
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - PARSeqAug:
      - IGTRLabelEncode: # Class handling label
          k: 8
          prompt_error: False
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'prompt_pos_idx_list',
                      'prompt_char_idx_list', 'ques_pos_idx_list', 'ques1_answer_list',
                      'ques2_char_idx_list', 'ques2_answer_list', 'ques3_answer', 'ques4_char_num_list',
                      'ques_len_list', 'ques2_len_list', 'prompt_len_list', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: &bs 384
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 4
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSet
    ds_width: True
    padding: *padding
    data_dir_list: ['../evaluation/CUTE80',
                    '../evaluation/IC13_857',
                    '../evaluation/IC15_1811',
                    '../evaluation/IIIT5k',
                    '../evaluation/SVT',
                    '../evaluation/SVTP']
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - ARLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: 256
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    max_ratio: *max_ratio
    num_workers: 4
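The RatioSampler settings that recur in these configs (`scales: [[128, 32]]`, `divided_factor: [4, 16]`, a `max_ratio` cap) imply that each batch gets a width snapped to its aspect-ratio bucket while staying divisible by the encoder's downsampling factor. A small worked example of that arithmetic; this is my own reading of the "divide_factor" comment, with a hypothetical helper, not the sampler's actual code.

```python
# Worked example of the ratio-bucket arithmetic implied by RatioSampler's
# settings (base height 32, width factor 4 from divided_factor, max_ratio cap).
# Illustration only; not the RatioSampler implementation.
def bucket_width(ratio: float, base_h: int = 32, w_factor: int = 4,
                 max_ratio: int = 4) -> int:
    ratio = min(max(ratio, 1.0), float(max_ratio))     # clamp aspect ratio
    w = round(base_h * ratio)
    return max(w_factor, (w // w_factor) * w_factor)   # keep width divisible by 4

for r in (0.8, 1.7, 3.2, 9.0):
    print(r, '->', bucket_width(r), 'x', 32)
# 0.8 -> 32 x 32, 1.7 -> 52 x 32, 3.2 -> 100 x 32, 9.0 -> 128 x 32
```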
configs/rec/lister/focalsvtr_lister_wo_fem_maxratio12.yml
ADDED
@@ -0,0 +1,133 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/focalsvtr_lister_wo_fem_maxratio12/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/u14m_filter/predicts_focalsvtr_lister_wo_fem_maxratio12.txt
  use_amp: True
  grad_clip_val: 20

Optimizer:
  name: AdamW
  lr: 0.00065
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: LISTER
  Transform:
  Encoder:
    name: FocalSVTR
    img_size: [32, 128]
    depths: [6, 6, 9]
    embed_dim: 96
    sub_k: [[1, 1], [2, 1], [1, 1]]
    focal_levels: [3, 3, 3]
    last_stage: False
    feat2d: True
  Decoder:
    name: LISTERDecoder
    detach_grad: False
    attn_scaling: True
    use_fem: False

Loss:
  name: LISTERLoss

PostProcess:
  name: LISTERLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: False
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_filter_train_easy',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - EPLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length']
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: &bs 256
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: 12
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: False
    data_dir_list: ['../evaluation/CUTE80',
                    '../evaluation/IC13_857',
                    '../evaluation/IC15_1811',
                    '../evaluation/IIIT5k',
                    '../evaluation/SVT',
                    '../evaluation/SVTP',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - EPLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length']
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: 256
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: *bs
    max_ratio: 12
    num_workers: 4
configs/rec/lister/svtrv2_lister_wo_fem_maxratio12.yml
ADDED
@@ -0,0 +1,138 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtrv2_lister_wo_fem_maxratio12/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_lister_wo_fem_maxratio12.txt
  use_amp: True
  grad_clip_val: 20

Optimizer:
  name: AdamW
  lr: 0.000325
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: LISTER
  Transform:
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    out_channels: 256
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: True
  Decoder:
    name: LISTERDecoder
    detach_grad: False
    attn_scaling: True
    use_fem: False

Loss:
  name: LISTERLoss

PostProcess:
  name: LISTERLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: False
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_train_easy',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - EPLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length']
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: &bs 128
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: 12
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: False
    data_dir_list: ['../evaluation/CUTE80',
                    '../evaluation/IC13_857',
                    '../evaluation/IC15_1811',
                    '../evaluation/IIIT5k',
                    '../evaluation/SVT',
                    '../evaluation/SVTP',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - EPLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: 256
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: *bs
    max_ratio: 12
    num_workers: 4
configs/rec/lpv/svtr_base_lpv.yml
ADDED
@@ -0,0 +1,124 @@
1 |
+
Global:
|
2 |
+
device: gpu
|
3 |
+
epoch_num: 20
|
4 |
+
log_smooth_window: 20
|
5 |
+
print_batch_step: 10
|
6 |
+
output_dir: ./output/rec/u14m_filter/svtr_base_lpv/
|
7 |
+
save_epoch_step: 1
|
8 |
+
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 500]
  eval_epoch_step: [0, 1]
  cal_metric_during_train: True
  pretrained_model:
  # ./output/rec/u14m_filter/svtr_base_lpv_wo_glrm/best.pth
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
  # ./tools/utils/ppocr_keys_v1.txt # ch
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/u14m_filter/predicts_svtr_lpv.txt
  use_amp: True
  grad_clip_val: 20

Optimizer:
  name: Adam
  lr: 0.0001 # for 4gpus bs128/gpu
  weight_decay: 0.0
  filter_bias_and_bn: False
  betas: [0.9, 0.99]

LRScheduler:
  name: MultiStepLR
  milestones: [12]
  gamma: 0.1

Architecture:
  model_type: rec
  algorithm: LPV
  in_channels: 3
  Transform:
  Encoder:
    name: SVTRNet
    img_size: [32, 128]
    out_char_num: 25
    out_channels: 256
    patch_merging: 'Conv'
    embed_dim: [128, 256, 384]
    depth: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
    local_mixer: [[5, 5], [5, 5], [5, 5]]
    sub_k: [[1, 1], [1, 1]]
    feature2d: True
    last_stage: False
    prenorm: True
  Decoder:
    name: LPVDecoder
    num_layer: 3
    max_len: *max_text_length
    use_mask: True
    dim_feedforward: 1536
    nhead: 12
    dropout: 0.1
    trans_layer: 3

Loss:
  name: LPVLoss

PostProcess:
  name: ARLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ARLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 128
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation/
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ARLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 128
    num_workers: 4
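Note on the `&name` / `*name` syntax used throughout these configs: a value such as `&max_text_length 25` is a YAML anchor defined once under Global and reused elsewhere via the alias `*max_text_length`. A minimal sketch with PyYAML (this is not OpenOCR's own config loader; the path is assumed to exist in your checkout):

import yaml

# load the config added above and check that aliases resolve to the anchored values
with open('configs/rec/lpv/svtr_base_lpv.yml', 'r') as f:
    cfg = yaml.safe_load(f)

assert cfg['Global']['max_text_length'] == 25            # &max_text_length
assert cfg['Architecture']['Decoder']['max_len'] == 25   # *max_text_length
print(cfg['PostProcess']['character_dict_path'])         # resolved from *character_dict_path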
configs/rec/lpv/svtr_base_lpv_wo_glrm.yml
ADDED
@@ -0,0 +1,123 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtr_base_lpv_wo_glrm/
  save_epoch_step: 1
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 500]
  eval_epoch_step: [0, 1]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
  # ./tools/utils/ppocr_keys_v1.txt # ch
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/u14m_filter/predicts_svtr_base_lpv_wo_glrm.txt
  use_amp: True
  grad_clip_val: 20

Optimizer:
  name: Adam
  lr: 0.0001 # for 4gpus bs128/gpu
  weight_decay: 0.0
  filter_bias_and_bn: False
  betas: [0.9, 0.99]

LRScheduler:
  name: MultiStepLR
  milestones: [12]
  gamma: 0.1

Architecture:
  model_type: rec
  algorithm: LPV
  in_channels: 3
  Transform:
  Encoder:
    name: SVTRNet
    img_size: [32, 128]
    out_char_num: 25
    out_channels: 256
    patch_merging: 'Conv'
    embed_dim: [128, 256, 384]
    depth: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: ['Conv','Conv','Conv','Conv','Conv','Conv', 'Conv','Conv', 'Global','Global','Global','Global','Global','Global','Global','Global','Global','Global']
    local_mixer: [[5, 5], [5, 5], [5, 5]]
    sub_k: [[1, 1], [1, 1]]
    feature2d: True
    last_stage: False
    prenorm: True
  Decoder:
    name: LPVDecoder
    num_layer: 3
    max_len: *max_text_length
    use_mask: False
    dim_feedforward: 1536
    nhead: 12
    dropout: 0.1
    trans_layer: 3

Loss:
  name: LPVLoss

PostProcess:
  name: ARLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ARLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 128
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation/
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ARLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 128
    num_workers: 4
configs/rec/lpv/svtrv2_lpv.yml
ADDED
@@ -0,0 +1,147 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtrv2_lpv/
  save_epoch_step: 1
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 500]
  eval_epoch_step: [0, 1]
  cal_metric_during_train: True
  pretrained_model:
  # ./output/rec/u14m_filter/svtrv2_lpv_wo_glrm/best.pth
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
  # ./tools/utils/ppocr_keys_v1.txt # ch
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_lpv.txt
  use_amp: True
  grad_clip_val: 20

Optimizer:
  name: AdamW
  lr: 0.000325 # for 4gpus bs128/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: LPV
  in_channels: 3
  Transform:
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: True
  Decoder:
    name: LPVDecoder
    num_layer: 3
    max_len: *max_text_length
    use_mask: True
    dim_feedforward: 1536
    nhead: 12
    dropout: 0.1
    trans_layer: 3

Loss:
  name: LPVLoss

PostProcess:
  name: ARLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: false
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_train_easy',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ARLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: &bs 128
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 4
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: False
    data_dir_list: [
      '../evaluation/CUTE80',
      '../evaluation/IC13_857',
      '../evaluation/IC15_1811',
      '../evaluation/IIIT5k',
      '../evaluation/SVT',
      '../evaluation/SVTP',
      ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ARLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: *bs
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: *bs
    max_ratio: *max_ratio
    num_workers: 4
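The "warmup_epoch" comments in these files hint at a mapping onto a one-cycle schedule where pct_start = warmup_epoch / epoch_num. An illustrative stand-alone sketch with torch.optim.lr_scheduler.OneCycleLR (not the repo's own scheduler wiring; steps_per_epoch is a placeholder):

import torch

model = torch.nn.Linear(10, 10)  # stand-in for the recognizer
optimizer = torch.optim.AdamW(model.parameters(), lr=0.000325, weight_decay=0.05)

epoch_num, steps_per_epoch, warmup_epoch = 20, 1000, 1
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=0.000325,
    total_steps=epoch_num * steps_per_epoch,
    pct_start=warmup_epoch / epoch_num,  # fraction of training spent warming up
    cycle_momentum=False,                # matches cycle_momentum: False above
)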
configs/rec/lpv/svtrv2_lpv_wo_glrm.yml
ADDED
@@ -0,0 +1,146 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtrv2_lpv_wo_glrm/
  save_epoch_step: 1
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 500]
  eval_epoch_step: [0, 1]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
  # ./tools/utils/ppocr_keys_v1.txt # ch
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_lpv_wo_glrm.txt
  use_amp: True
  grad_clip_val: 20

Optimizer:
  name: AdamW
  lr: 0.000325 # for 4gpus bs128/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: LPV
  in_channels: 3
  Transform:
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: True
  Decoder:
    name: LPVDecoder
    num_layer: 3
    max_len: *max_text_length
    use_mask: False
    dim_feedforward: 1536
    nhead: 12
    dropout: 0.1
    trans_layer: 3

Loss:
  name: LPVLoss

PostProcess:
  name: ARLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: false
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_train_easy',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ARLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: &bs 128
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 4
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: False
    data_dir_list: [
      '../evaluation/CUTE80',
      '../evaluation/IC13_857',
      '../evaluation/IC15_1811',
      '../evaluation/IIIT5k',
      '../evaluation/SVT',
      '../evaluation/SVTP',
      ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ARLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: *bs
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: *bs
    max_ratio: *max_ratio
    num_workers: 4
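The "lr: 0.000325 # for 4gpus bs128/gpu" comments tie each learning rate to a reference global batch size (here 4 * 128 = 512). A common convention when changing the GPU count or per-GPU batch size is linear scaling; this is a generic sketch, not something the repo states explicitly:

def scaled_lr(base_lr=0.000325, base_global_bs=4 * 128, gpus=1, bs_per_gpu=128):
    """Scale the reference LR in proportion to the new global batch size."""
    return base_lr * (gpus * bs_per_gpu) / base_global_bs

print(scaled_lr(gpus=1, bs_per_gpu=128))  # ~8.1e-05 for a single-GPU run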
configs/rec/maerec/vit_nrtr.yml
ADDED
@@ -0,0 +1,116 @@
Global:
  device: gpu
  epoch_num: 10
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/vit_nrtr_ft_mae/
  save_epoch_step: 1
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 500]
  eval_epoch_step: [0, 1]
  cal_metric_during_train: True
  pretrained_model:
  # ./open_ocr_vit_small_params.pth
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
  # ./tools/utils/ppocr_keys_v1.txt # ch
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/u14m_filter/predicts_vit_nrtr_ft_mae.txt
  use_amp: True
  project_name: maerec

Optimizer:
  name: AdamW
  lr: 0.00065 # for 4gpus bs256/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 : 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: BGPD
  in_channels: 3
  Transform:
  Encoder:
    name: ViT
    img_size: [32, 128]
    patch_size: [4, 4]
    embed_dim: 384
    depth: 12
    num_heads: 6
    mlp_ratio: 4
    qkv_bias: True
    use_cls_token: True
  Decoder:
    name: NRTRDecoder
    num_encoder_layers: -1
    beam_size: 0
    num_decoder_layers: 6
    nhead: 8
    max_len: *max_text_length

Loss:
  name: ARLoss

PostProcess:
  name: ARLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ARLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation/
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ARLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 4
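The commented pretrained_model entry above (./open_ocr_vit_small_params.pth) implies MAERec-style fine-tuning: the ViT encoder starts from pretrained weights while the decoder trains from scratch. A generic PyTorch sketch of partial checkpoint loading with strict=False, using a tiny stand-in module rather than the OpenOCR model or its real checkpoint:

import torch
from torch import nn

class TinyEncoderDecoder(nn.Module):  # stand-in, not the OpenOCR architecture
    def __init__(self):
        super().__init__()
        self.encoder = nn.Linear(384, 384)
        self.decoder = nn.Linear(384, 97)

model = TinyEncoderDecoder()
# fake a checkpoint containing only encoder weights so the sketch runs without the real file
torch.save({'encoder.weight': torch.randn(384, 384), 'encoder.bias': torch.zeros(384)}, '/tmp/demo_enc.pth')
state = torch.load('/tmp/demo_enc.pth', map_location='cpu')
missing, unexpected = model.load_state_dict(state, strict=False)
print('missing:', missing)  # decoder.* stays randomly initialized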
configs/rec/matrn/resnet45_trans_matrn.yml
ADDED
@@ -0,0 +1,95 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/resnet45_trans_matrn/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  # ./openocr_nolang_abinet_lang.pth
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_resnet45_trans_matrn.txt
  grad_clip_val: 20
  use_amp: True

Optimizer:
  name: Adam
  lr: 0.000133 # 4gpus 128bs/gpu
  weight_decay: 0.0
  filter_bias_and_bn: False

LRScheduler:
  name: MultiStepLR
  milestones: [12, 18]
  gamma: 0.1

Architecture:
  model_type: rec
  algorithm: MATRN
  Transform:
  Encoder:
    name: ResNet45
    in_channels: 3
    strides: [2, 1, 2, 1, 1]
  Decoder:
    name: MATRNDecoder
    iter_size: 3

Loss:
  name: ABINetLoss
  align_weight: 3.0

PostProcess:
  name: ABINetLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ABINetLabelEncode:
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 128
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ABINetLabelEncode:
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
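The MultiStepLR block above drops the learning rate by gamma at each milestone. Assuming the scheduler is stepped once per epoch, 0.000133 becomes 1.33e-05 after epoch 12 and 1.33e-06 after epoch 18; a minimal stand-alone check with the standard PyTorch scheduler (not the repo's training loop):

import torch

opt = torch.optim.Adam(torch.nn.Linear(4, 4).parameters(), lr=0.000133)
sched = torch.optim.lr_scheduler.MultiStepLR(opt, milestones=[12, 18], gamma=0.1)
for epoch in range(20):
    opt.step()    # placeholder for one epoch of training
    sched.step()
    if epoch + 1 in (12, 13, 18, 19):
        print(epoch + 1, sched.get_last_lr())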
configs/rec/matrn/svtrv2_matrn.yml
ADDED
@@ -0,0 +1,130 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtrv2_matrn/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  # ./openocr_svtrv2_nolang_abinet_lang.pth
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: 25
  use_space_char: False
  save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_matrn.txt
  use_amp: True
  grad_clip_val: 20

Optimizer:
  name: AdamW
  lr: 0.000325 # for 4gpus bs128/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: MATRN
  Transform:
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: True
  Decoder:
    name: MATRNDecoder
    iter_size: 3
    num_layers: 0

Loss:
  name: ABINetLoss

PostProcess:
  name: ABINetLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: false
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_train_easy',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ABINetLabelEncode:
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: &bs 128
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 4
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: False
    data_dir_list: [
      '../evaluation/CUTE80',
      '../evaluation/IC13_857',
      '../evaluation/IC15_1811',
      '../evaluation/IIIT5k',
      '../evaluation/SVT',
      '../evaluation/SVTP',
      ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ABINetLabelEncode:
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: *bs
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: *bs
    max_ratio: *max_ratio
    num_workers: 4
configs/rec/mgpstr/svtrv2_mgpstr_only_char.yml
ADDED
@@ -0,0 +1,140 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/svtrv2_mgpstr_only_char/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  use_amp: True
  save_res_path: ./output/rec/u14m_filter/predicts_svtrv2_mgpstr_only_char.txt

Optimizer:
  name: AdamW
  lr: 0.00065 # 4gpus 256bs/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: MGPSTR
  Transform:
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    out_channels: 256
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: false
  Decoder:
    name: MGPDecoder
    only_char: &only_char True

Loss:
  name: MGPLoss
  only_char: *only_char

PostProcess:
  name: MPGLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char
  only_char: *only_char

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: false
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_filter_train_easy',
                    ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - MGPLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
          only_char: *only_char
      - KeepKeys:
          keep_keys: ['image', 'char_label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: &bs 256
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 4
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: False
    data_dir_list: [
      '../evaluation/CUTE80',
      '../evaluation/IC13_857',
      '../evaluation/IC15_1811',
      '../evaluation/IIIT5k',
      '../evaluation/SVT',
      '../evaluation/SVTP',
      ]
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - MGPLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
          only_char: *only_char
      - KeepKeys:
          keep_keys: ['image', 'char_label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: *bs
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: *bs
    max_ratio: *max_ratio
    num_workers: 4
configs/rec/mgpstr/vit_base_mgpstr_only_char.yml
ADDED
@@ -0,0 +1,111 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/vit_base_mgpstr/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: False
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  use_amp: True
  save_res_path: ./output/rec/u14m_filter/predicts_vit_mgpstr_only_char.txt
  grad_clip_val: 5
  project_name: mgpstr_base

Optimizer:
  name: Adam
  lr: 0.000325 # 4gpus 128bs/gpu
  weight_decay: 0.
  filter_bias_and_bn: False

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: MGPSTR
  Transform:
  Encoder:
    name: ViT
    img_size: [32,128]
    patch_size: [4, 4]
    embed_dim: 768
    depth: 12
    num_heads: 12
    mlp_ratio: 4
    qkv_bias: True
  Decoder:
    name: MGPDecoder
    only_char: &only_char True

Loss:
  name: MGPLoss
  only_char: *only_char

PostProcess:
  name: MPGLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char
  only_char: *only_char

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - MGPLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
          only_char: *only_char
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'char_label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 128
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation/
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - MGPLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
          only_char: *only_char
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'char_label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
configs/rec/mgpstr/vit_large_mgpstr_only_char.yml
ADDED
@@ -0,0 +1,110 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/vit_base_mgpstr_only_char/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: False
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  use_amp: True
  save_res_path: ./output/rec/u14m_filter/predicts_vit_mgpstr_only_char.txt
  grad_clip_val: 5

Optimizer:
  name: Adam
  lr: 0.000325 # 4gpus 128bs/gpu
  weight_decay: 0.
  filter_bias_and_bn: False

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: MGPSTR
  Transform:
  Encoder:
    name: ViT
    img_size: [32,128]
    patch_size: [4, 4]
    embed_dim: 1024
    depth: 24
    num_heads: 16
    mlp_ratio: 4
    qkv_bias: True
  Decoder:
    name: MGPDecoder
    only_char: &only_char True

Loss:
  name: MGPLoss
  only_char: *only_char

PostProcess:
  name: MPGLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char
  only_char: *only_char

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - MGPLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
          only_char: *only_char
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'char_label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 128
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation/
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - MGPLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
          only_char: *only_char
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'char_label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
configs/rec/mgpstr/vit_mgpstr.yml
ADDED
@@ -0,0 +1,110 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/vit_mgpstr/
  eval_epoch_step: [0, 1]
  eval_batch_step: [100000, 2000]
  cal_metric_during_train: False
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  use_amp: True
  save_res_path: ./output/rec/u14m_filter/predicts_vit_mgpstr.txt
  grad_clip_val: 5

Optimizer:
  name: Adam
  lr: 0.000325 # 4gpus 128bs/gpu
  weight_decay: 0.
  filter_bias_and_bn: False

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: MGPSTR
  Transform:
  Encoder:
    name: ViT
    img_size: [32,128]
    patch_size: [4, 4]
    embed_dim: 384
    depth: 12
    num_heads: 6
    mlp_ratio: 4
    qkv_bias: True
  Decoder:
    name: MGPDecoder
    only_char: &only_char False

Loss:
  name: MGPLoss
  only_char: *only_char

PostProcess:
  name: MPGLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char
  only_char: *only_char

Metric:
  name: RecMPGMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - MGPLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
          only_char: *only_char
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'char_label', 'bpe_label', 'wp_label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 128
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation/
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - MGPLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
          only_char: *only_char
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'char_label', 'bpe_label', 'wp_label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
configs/rec/mgpstr/vit_mgpstr_only_char.yml
ADDED
@@ -0,0 +1,110 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/vit_mgpstr_only_char/
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: False
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  use_amp: True
  save_res_path: ./output/rec/u14m_filter/predicts_vit_mgpstr_only_char.txt
  grad_clip_val: 5

Optimizer:
  name: Adam
  lr: 0.000325 # 4gpus 128bs/gpu
  weight_decay: 0.
  filter_bias_and_bn: False

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: MGPSTR
  Transform:
  Encoder:
    name: ViT
    img_size: [32,128]
    patch_size: [4, 4]
    embed_dim: 384
    depth: 12
    num_heads: 6
    mlp_ratio: 4
    qkv_bias: True
  Decoder:
    name: MGPDecoder
    only_char: &only_char True

Loss:
  name: MGPLoss
  only_char: *only_char

PostProcess:
  name: MPGLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char
  only_char: *only_char

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - MGPLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
          only_char: *only_char
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'char_label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 128
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation/
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - MGPLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
          only_char: *only_char
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'char_label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
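The practical difference between vit_mgpstr.yml and the *_only_char variants above is the `only_char` anchor: with it set to False the label encoder also emits BPE and word-piece targets, so KeepKeys carries 'char_label', 'bpe_label' and 'wp_label'; with it set to True only 'char_label' remains. A quick PyYAML inspection (plain parsing, not the framework's config machinery; paths assume the configs/ tree added in this commit):

import yaml

def keep_keys(path):
    cfg = yaml.safe_load(open(path))
    for t in cfg['Train']['dataset']['transforms']:
        if 'KeepKeys' in t:
            return t['KeepKeys']['keep_keys']

print(keep_keys('configs/rec/mgpstr/vit_mgpstr.yml'))            # char, bpe and wp labels
print(keep_keys('configs/rec/mgpstr/vit_mgpstr_only_char.yml'))  # char label only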
configs/rec/moran/resnet31_lstm_moran.yml
ADDED
@@ -0,0 +1,92 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/resnet31_lstm_moran
  eval_epoch_step: [0, 1]
  eval_batch_step: [0, 500]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: ./tools/utils/EN_symbol_dict.txt
  max_text_length: 25
  use_space_char: False
  save_res_path: ./output/rec/predicts_moran.txt
  use_amp: True
  grad_clip_val: 1.0

Optimizer:
  name: Adam
  lr: 0.002 # for 1gpus bs1024/gpu
  weight_decay: 0.05
  filter_bias_and_bn: False

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: MORAN
  Transform:
    name: MORN
    target_shape: [32, 128]
  Encoder:
    name: ResNet_ASTER
  Decoder:
    name: ASTERDecoder

Loss:
  name: ARLoss

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

PostProcess:
  name: ARLabelDecode

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ARLabelEncode: # Class handling label
      - RecTVResize:
          image_shape: [64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 1024
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ARLabelEncode: # Class handling label
      - RecTVResize:
          image_shape: [64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256
    num_workers: 2
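Several of these configs pair `use_amp: True` with a `grad_clip_val` (1.0 here, 5 or 20 elsewhere). A generic PyTorch sketch of how those two settings typically interact (unscale before clipping so the threshold applies to real gradients); this is not the repo's training loop and needs a CUDA device to run:

import torch

model = torch.nn.Linear(8, 8).cuda()
opt = torch.optim.Adam(model.parameters(), lr=0.002)
scaler = torch.cuda.amp.GradScaler()

x = torch.randn(4, 8, device='cuda')
with torch.cuda.amp.autocast():
    loss = model(x).square().mean()
scaler.scale(loss).backward()
scaler.unscale_(opt)                                               # undo loss scaling first
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)   # grad_clip_val
scaler.step(opt)
scaler.update()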
configs/rec/nrtr/focalsvtr_nrtr_maxraio12.yml
ADDED
@@ -0,0 +1,145 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/focalsvtr_nrtr_maxrtio12
  save_epoch_step: 1
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 500]
  eval_epoch_step: [0, 1]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img: ../ltb/img
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
  # ./tools/utils/ppocr_keys_v1.txt # ch
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/u14m_filter/predicts_focalsvtr_nrtr_maxrtio12.txt
  use_amp: True

Optimizer:
  name: AdamW
  lr: 0.00065 # for 4gpus bs256/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: NRTR
  in_channels: 3
  Transform:
  Encoder:
    name: FocalSVTR
    img_size: [32, 128]
    depths: [6, 6, 6]
    embed_dim: 96
    sub_k: [[1, 1], [2, 1], [1, 1]]
    focal_levels: [3, 3, 3]
    last_stage: False
  Decoder:
    name: NRTRDecoder
    num_encoder_layers: -1
    beam_size: 0
    num_decoder_layers: 2
    nhead: 12
    max_len: *max_text_length

Loss:
  name: ARLoss

PostProcess:
  name: ARLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSet
    ds_width: True
    padding: &padding True
    padding_rand: True
    padding_doub: True
    data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
                    '../Union14M-L-LMDB-Filtered/filter_train_hard',
                    '../Union14M-L-LMDB-Filtered/filter_train_medium',
                    '../Union14M-L-LMDB-Filtered/filter_train_normal',
                    '../Union14M-L-LMDB-Filtered/filter_train_easy',
                    ]
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - PARSeqAug:
      - ARLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: &bs 256
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: True
  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 12
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSet
    ds_width: True
    padding: False
    padding_rand: False
    data_dir_list: [
      '../evaluation/CUTE80',
      '../evaluation/IC13_857',
      '../evaluation/IC15_1811',
      '../evaluation/IIIT5k',
      '../evaluation/SVT',
      '../evaluation/SVTP',
      ]
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - ARLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  sampler:
    name: RatioSampler
    scales: [[128, 32]] # w, h
    # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple
    first_bs: 128
    fix_bs: false
    divided_factor: [4, 16] # w, h
    is_training: False
  loader:
    shuffle: False
    drop_last: False
    max_ratio: *max_ratio
    batch_size_per_card: 128
    num_workers: 4
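The RatioSampler settings above (scales [[128, 32]], divided_factor, max_ratio 12 instead of the usual 4) describe aspect-ratio-aware batching: samples are grouped by width/height ratio, the ratio is capped at max_ratio, and target widths stay divisible by the downsampling factor. A toy illustration of that bucketing idea only; it is not the repo's RatioSampler implementation:

import math

def bucket(w, h, max_ratio=12, base_h=32, divided_factor_w=4):
    ratio = min(max(round(w / h), 1), max_ratio)            # cap very wide crops
    out_w = base_h * ratio
    out_w = math.ceil(out_w / divided_factor_w) * divided_factor_w  # keep divisible for downsampling
    return ratio, (out_w, base_h)

for w, h in [(100, 32), (400, 32), (900, 32)]:
    print(bucket(w, h))   # ratios 3, 12 (capped), 12 (capped)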
configs/rec/nrtr/nrtr.yml
ADDED
@@ -0,0 +1,107 @@
Global:
  device: gpu
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: ./output/rec/u14m_filter/nrtr/
  save_epoch_step: 1
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 500]
  eval_epoch_step: [0, 1]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  use_tensorboard: false
  infer_img:
  # for data or label process
  character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
  # ./tools/utils/ppocr_keys_v1.txt # ch
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: ./output/rec/u14m_filter/predicts_nrtr.txt
  use_amp: True

Optimizer:
  name: AdamW
  lr: 0.00065 # for 4gpus bs256/gpu
  weight_decay: 0.05
  filter_bias_and_bn: True

LRScheduler:
  name: OneCycleLR
  warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: BGPD
  in_channels: 3
  Transform:
  Encoder:
    name: NRTREncoder
  Decoder:
    name: NRTRDecoder
    num_encoder_layers: 6
    beam_size: 0
    num_decoder_layers: 6
    nhead: 8
    max_len: *max_text_length


Loss:
  name: ARLoss

PostProcess:
  name: ARLabelDecode
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ../Union14M-L-LMDB-Filtered
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - PARSeqAugPIL:
      - ARLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ../evaluation/
    transforms:
      - DecodeImagePIL: # load image
          img_mode: RGB
      - ARLabelEncode: # Class handling label
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - RecTVResize:
          image_shape: [32, 128]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 256