tastelikefeet committed
Commit 95878c0
1 Parent(s): ddfd056
app.py CHANGED
@@ -13,12 +13,41 @@ import re
 from gradio.components import Component
 from util import check_channels, resize_image, save_images
 import json
+import argparse
+
 
 BBOX_MAX_NUM = 8
 img_save_folder = 'SaveImages'
 load_model = True
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--use_fp32",
+        action="store_true",
+        default=False,
+        help="Whether or not to use fp32 during inference."
+    )
+    parser.add_argument(
+        "--no_translator",
+        action="store_true",
+        default=False,
+        help="Whether or not to use the CH->EN translator, which enables Chinese prompt input and costs ~4GB of VRAM."
+    )
+    parser.add_argument(
+        "--font_path",
+        type=str,
+        default='font/Arial_Unicode.ttf',
+        help="Path of a font file."
+    )
+    args = parser.parse_args()
+    return args
+
+
+args = parse_args()
 if load_model:
-    inference = pipeline('my-anytext-task', model='damo/cv_anytext_text_generation_editing', model_revision='v1.1.0')
+    inference = pipeline('my-anytext-task', model='damo/cv_anytext_text_generation_editing', model_revision='v1.1.1', use_fp16=not args.use_fp32, use_translator=not args.no_translator, font_path=args.font_path)
 
 
 def count_lines(prompt):
@@ -221,7 +250,8 @@ with block:
         [<a href="https://arxiv.org/abs/2311.03054" style="color:blue; font-size:18px;">arXiv</a>] \
         [<a href="https://github.com/tyxsspa/AnyText" style="color:blue; font-size:18px;">Code</a>] \
         [<a href="https://modelscope.cn/models/damo/cv_anytext_text_generation_editing/summary" style="color:blue; font-size:18px;">ModelScope</a>]\
-        version: 1.1.0 </div>')
+        [<a href="https://huggingface.co/spaces/modelscope/AnyText" style="color:blue; font-size:18px;">HuggingFace</a>]\
+        version: 1.1.1 </div>')
     with gr.Row(variant='compact'):
         with gr.Column():
             with gr.Accordion('🕹Instructions(说明)', open=False,):
@@ -305,7 +335,7 @@ with block:
                 rect_xywh_list.extend([x, y, w, h])
 
         rect_img = gr.Image(value=create_canvas(), label="Rext Position(方框位置)", elem_id="MD-bbox-rect-t2i", show_label=False, visible=False)
-        draw_img = gr.Image(value=create_canvas(), label="Draw Position(绘制位置)", visible=True, tool='sketch', show_label=False, brush_radius=60)
+        draw_img = gr.Image(value=create_canvas(), label="Draw Position(绘制位置)", visible=True, tool='sketch', show_label=False, brush_radius=100)
 
         def re_draw():
             return [gr.Image(value=create_canvas(), tool='sketch'), gr.Slider(value=512), gr.Slider(value=512)]
@@ -357,7 +387,7 @@
             ori_img = gr.Image(label='Ori(原图)')
 
             def upload_ref(x):
-                return [gr.Image(type="numpy", brush_radius=60, tool='sketch'),
+                return [gr.Image(type="numpy", brush_radius=100, tool='sketch'),
                         gr.Image(value=x)]
 
             def clear_ref(x):
@@ -394,8 +424,8 @@
    run_edit.click(fn=process, inputs=[gr.State('edit')] + ips, outputs=[result_gallery, result_info])
 
 block.launch(
-    #server_name='0.0.0.0' if os.getenv('GRADIO_LISTEN', '') != '' else "127.0.0.1",
-    #share=False,
+    # server_name='0.0.0.0' if os.getenv('GRADIO_LISTEN', '') != '' else "127.0.0.1",
+    # share=False,
     root_path=f"/{os.getenv('GRADIO_PROXY_PATH')}" if os.getenv('GRADIO_PROXY_PATH') else ""
 )
 # block.launch(server_name='0.0.0.0')
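Note on the new flags: the CLI exposes opt-out switches while the pipeline keyword arguments are opt-in, so each flag is negated at the call site (use_fp16=not args.use_fp32). A minimal, self-contained sketch of that inversion pattern; the final kwargs dict only illustrates the wiring and is not part of the commit:

import argparse

parser = argparse.ArgumentParser()
# store_true flags default to False, so fp16 and the translator stay enabled
# unless the user explicitly opts out on the command line.
parser.add_argument("--use_fp32", action="store_true")
parser.add_argument("--no_translator", action="store_true")
args = parser.parse_args()

# Negation turns the opt-out CLI flags into the opt-in booleans the
# pipeline call expects.
kwargs = dict(use_fp16=not args.use_fp32, use_translator=not args.no_translator)
print(kwargs)  # with no flags passed: {'use_fp16': True, 'use_translator': True}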
cldm/cldm.py CHANGED
@@ -32,6 +32,8 @@ class ControlledUnetModel(UNetModel):
         hs = []
         with torch.no_grad():
             t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
+            if self.use_fp16:
+                t_emb = t_emb.half()
             emb = self.time_embed(t_emb)
             h = x.type(self.dtype)
             for module in self.input_blocks:
@@ -124,12 +126,12 @@ class ControlNet(nn.Module):
                 f"This option has LESS priority than attention_resolutions {attention_resolutions}, "
                 f"i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, "
                 f"attention will still not be set.")
-
         self.attention_resolutions = attention_resolutions
         self.dropout = dropout
         self.channel_mult = channel_mult
         self.conv_resample = conv_resample
         self.use_checkpoint = use_checkpoint
+        self.use_fp16 = use_fp16
         self.dtype = th.float16 if use_fp16 else th.float32
         self.num_heads = num_heads
         self.num_head_channels = num_head_channels
@@ -313,6 +315,8 @@ class ControlNet(nn.Module):
 
     def forward(self, x, hint, text_info, timesteps, context, **kwargs):
         t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
+        if self.use_fp16:
+            t_emb = t_emb.half()
         emb = self.time_embed(t_emb)
 
         # guided_hint from text_info
@@ -344,6 +348,7 @@
 class ControlLDM(LatentDiffusion):
 
     def __init__(self, control_stage_config, control_key, glyph_key, position_key, only_mid_control, loss_alpha=0, loss_beta=0, with_step_weight=False, use_vae_upsample=False, latin_weight=1.0, embedding_manager_config=None, *args, **kwargs):
+        self.use_fp16 = kwargs.pop('use_fp16', False)
         super().__init__(*args, **kwargs)
         self.control_model = instantiate_from_config(control_stage_config)
         self.control_key = control_key
@@ -356,6 +361,7 @@ class ControlLDM(LatentDiffusion):
         self.with_step_weight = with_step_weight
         self.use_vae_upsample = use_vae_upsample
         self.latin_weight = latin_weight
+
         if embedding_manager_config is not None and embedding_manager_config.params.valid:
             self.embedding_manager = self.instantiate_embedding_manager(embedding_manager_config, self.cond_stage_model)
             for param in self.embedding_manager.embedding_parameters():
@@ -369,6 +375,7 @@
             args.rec_image_shape = "3, 48, 320"
             args.rec_batch_num = 6
             args.rec_char_dict_path = './ocr_recog/ppocr_keys_v1.txt'
+            args.use_fp16 = self.use_fp16
             self.cn_recognizer = TextRecognizer(args, self.text_predictor)
             for param in self.text_predictor.parameters():
                 param.requires_grad = False
@@ -433,6 +440,8 @@
         diffusion_model = self.model.diffusion_model
         _cond = torch.cat(cond['c_crossattn'], 1)
         _hint = torch.cat(cond['c_concat'], 1)
+        if self.use_fp16:
+            x_noisy = x_noisy.half()
         control = self.control_model(x=x_noisy, timesteps=t, context=_cond, hint=_hint, text_info=cond['text_info'])
         control = [c * scale for c, scale in zip(control, self.control_scales)]
         eps = diffusion_model(x=x_noisy, timesteps=t, context=_cond, control=control, only_mid_control=self.only_mid_control)
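The recurring pattern in this file is casting tensors that are always created in fp32 (the sinusoidal timestep embedding, the noisy latent) to half precision before they reach fp16-converted submodules. A small sketch of the dtype mismatch being avoided; timestep_embedding_stub is a simplified stand-in for ldm's timestep_embedding, not the real function:

import torch

def timestep_embedding_stub(timesteps, dim):
    # Like ldm's timestep_embedding, the sinusoid is always built in fp32.
    half = dim // 2
    freqs = torch.exp(-torch.arange(half, dtype=torch.float32) / half)
    angles = timesteps[:, None].float() * freqs[None]
    return torch.cat([torch.cos(angles), torch.sin(angles)], dim=-1)

t_emb = timestep_embedding_stub(torch.tensor([1, 10]), 320)
assert t_emb.dtype == torch.float32   # fresh fp32, regardless of model dtype

# After create_model(..., use_fp16=True) the time_embed MLP holds fp16
# weights, so the embedding must be cast before entering it:
use_fp16 = True
if use_fp16:
    t_emb = t_emb.half()
assert t_emb.dtype == torch.float16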
cldm/model.py CHANGED
@@ -21,10 +21,14 @@ def load_state_dict(ckpt_path, location='cpu'):
     return state_dict
 
 
-def create_model(config_path, cond_stage_path=None):
+def create_model(config_path, cond_stage_path=None, use_fp16=False):
     config = OmegaConf.load(config_path)
     if cond_stage_path:
         config.model.params.cond_stage_config.params.version = cond_stage_path  # use pre-downloaded ckpts, in case blocked
+    if use_fp16:
+        config.model.params.use_fp16 = True
+        config.model.params.control_stage_config.params.use_fp16 = True
+        config.model.params.unet_config.params.use_fp16 = True
     model = instantiate_from_config(config.model).cpu()
     print(f'Loaded model config from [{config_path}]')
     return model
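create_model now fans a single use_fp16 switch out to the top-level model and both submodule configs before instantiation. A sketch of the OmegaConf mechanics with a made-up minimal config (the real anytext config has many more keys):

from omegaconf import OmegaConf

config = OmegaConf.create({'model': {'params': {
    'control_stage_config': {'params': {}},
    'unet_config': {'params': {}},
}}})

use_fp16 = True
if use_fp16:
    # plain attribute assignment adds the key to each nested DictConfig
    config.model.params.use_fp16 = True
    config.model.params.control_stage_config.params.use_fp16 = True
    config.model.params.unet_config.params.use_fp16 = True

print(OmegaConf.to_yaml(config.model.params))  # all three show use_fp16: true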
cldm/recognizer.py CHANGED
@@ -132,6 +132,7 @@ class TextRecognizer(object):
         self.chars = self.get_char_dict(args.rec_char_dict_path)
         self.char2id = {x: i for i, x in enumerate(self.chars)}
         self.is_onnx = not isinstance(self.predictor, torch.nn.Module)
+        self.use_fp16 = args.use_fp16
 
     # img: CHW
     def resize_norm_img(self, img, max_wh_ratio):
@@ -188,6 +189,8 @@
             # max_wh_ratio = max(max_wh_ratio, wh_ratio)  # comment to not use different ratio
             for ino in range(beg_img_no, end_img_no):
                 norm_img = self.resize_norm_img(img_list[indices[ino]], max_wh_ratio)
+                if self.use_fp16:
+                    norm_img = norm_img.half()
                 norm_img = norm_img.unsqueeze(0)
                 norm_img_batch.append(norm_img)
             norm_img_batch = torch.cat(norm_img_batch, dim=0)
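Each OCR crop is cast before batching; otherwise torch.cat's type promotion would silently return the whole batch to fp32, and it would no longer match an fp16 predictor. A toy version of the loop with random tensors standing in for the resized crops:

import torch

use_fp16 = True
crops = [torch.rand(3, 48, 320) for _ in range(4)]   # resize_norm_img output is fp32

norm_img_batch = []
for img in crops:
    if use_fp16:
        img = img.half()               # cast per crop, as in the diff
    norm_img_batch.append(img.unsqueeze(0))
norm_img_batch = torch.cat(norm_img_batch, dim=0)

assert norm_img_batch.dtype == torch.float16
assert norm_img_batch.shape == (4, 3, 48, 320)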
ldm/modules/diffusionmodules/openaimodel.py CHANGED
@@ -510,7 +510,7 @@ class UNetModel(nn.Module):
                 f"This option has LESS priority than attention_resolutions {attention_resolutions}, "
                 f"i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, "
                 f"attention will still not be set.")
-
+        self.use_fp16 = use_fp16
         self.attention_resolutions = attention_resolutions
         self.dropout = dropout
         self.channel_mult = channel_mult
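UNetModel already derives self.dtype from use_fp16; storing the raw flag as well lets subclasses such as ControlledUnetModel branch on it inside forward. A compressed illustration of the pattern with a trivial module (hypothetical, not the real UNet):

import torch
import torch.nn as nn

class TinyUNet(nn.Module):
    def __init__(self, use_fp16=False):
        super().__init__()
        self.use_fp16 = use_fp16   # keep the flag itself for forward-time branches...
        self.dtype = torch.float16 if use_fp16 else torch.float32  # ...and the dtype

    def forward(self, x):
        if self.use_fp16:          # branch on the stored flag, as the diff does
            x = x.half()
        return x.type(self.dtype)

assert TinyUNet(use_fp16=True)(torch.randn(1, 4)).dtype == torch.float16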
ldm/modules/diffusionmodules/util.py CHANGED
@@ -216,7 +216,8 @@ class SiLU(nn.Module):
 
 class GroupNorm32(nn.GroupNorm):
     def forward(self, x):
-        return super().forward(x.float()).type(x.dtype)
+        # return super().forward(x.float()).type(x.dtype)
+        return super().forward(x).type(x.dtype)
 
 def conv_nd(dims, *args, **kwargs):
     """