wondervictor committed on
Commit fc81a43 · 1 Parent(s): 6cd385f

update README

Files changed (1)
  1. condition/midas/midas/vit.py +33 -13
condition/midas/midas/vit.py CHANGED

@@ -128,12 +128,32 @@ def _resize_pos_embed(self, posemb, gs_h, gs_w):
     return posemb
 
 
+def _flat_resize_pos_embed(model, posemb, gs_h, gs_w):
+    posemb_tok, posemb_grid = (
+        posemb[:, :model.start_index],
+        posemb[0, model.start_index:],
+    )
+
+    gs_old = int(math.sqrt(len(posemb_grid)))
+
+    posemb_grid = posemb_grid.reshape(1, gs_old, gs_old,
+                                      -1).permute(0, 3, 1, 2)
+    posemb_grid = F.interpolate(posemb_grid,
+                                size=(gs_h, gs_w),
+                                mode="bilinear")
+    posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1)
+
+    posemb = torch.cat([posemb_tok, posemb_grid], dim=1)
+
+    return posemb
+
+
 def flat_forward_flex(model, x):
     b, c, h, w = x.shape
 
-    pos_embed = model._resize_pos_embed(model.pos_embed,
-                                        h // model.patch_size[1],
-                                        w // model.patch_size[0])
+    pos_embed = _flat_resize_pos_embed(model, model.pos_embed,
+                                       h // model.patch_size[1],
+                                       w // model.patch_size[0])
 
     B = x.shape[0]
 
@@ -352,10 +372,10 @@ def _make_vit_b16_backbone(
 
     # We inject this function into the VisionTransformer instances so that
     # we can use it with interpolated position embeddings without modifying the library source.
-    pretrained.model.forward_flex = types.MethodType(forward_flex,
-                                                     pretrained.model)
-    pretrained.model._resize_pos_embed = types.MethodType(
-        _resize_pos_embed, pretrained.model)
+    # pretrained.model.forward_flex = types.MethodType(forward_flex,
+    #                                                  pretrained.model)
+    # pretrained.model._resize_pos_embed = types.MethodType(
+    #     _resize_pos_embed, pretrained.model)
 
     return pretrained
 
@@ -550,13 +570,13 @@ def _make_vit_b_rn50_backbone(
 
     # We inject this function into the VisionTransformer instances so that
    # we can use it with interpolated position embeddings without modifying the library source.
-    pretrained.model.forward_flex = types.MethodType(forward_flex,
-                                                     pretrained.model)
+    # pretrained.model.forward_flex = types.MethodType(forward_flex,
+    #                                                  pretrained.model)
 
-    # We inject this function into the VisionTransformer instances so that
-    # we can use it with interpolated position embeddings without modifying the library source.
-    pretrained.model._resize_pos_embed = types.MethodType(
-        _resize_pos_embed, pretrained.model)
+    # # We inject this function into the VisionTransformer instances so that
+    # # we can use it with interpolated position embeddings without modifying the library source.
+    # pretrained.model._resize_pos_embed = types.MethodType(
+    #     _resize_pos_embed, pretrained.model)
 
     return pretrained
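
For context, the new _flat_resize_pos_embed bilinearly resizes the pretrained patch-grid position embeddings to an arbitrary target grid while passing the leading class/readout tokens through unchanged. Below is a minimal, self-contained sketch of the same computation; the ViT-B/16 shapes (24x24 grid, dim 768, one class token) and the freestanding function signature are illustrative assumptions, not code from this repo.

import math
import torch
import torch.nn.functional as F

def flat_resize_pos_embed(posemb, start_index, gs_h, gs_w):
    # Split the leading tokens (class/readout) from the patch-grid embeddings.
    posemb_tok = posemb[:, :start_index]   # (1, start_index, D)
    posemb_grid = posemb[0, start_index:]  # (gs_old * gs_old, D)

    # The pretrained grid is assumed square, e.g. 24x24 for 384px / patch 16.
    gs_old = int(math.sqrt(len(posemb_grid)))

    # (N, D) -> (1, D, gs_old, gs_old) so F.interpolate can resize spatially.
    posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2)
    posemb_grid = F.interpolate(posemb_grid, size=(gs_h, gs_w), mode="bilinear")
    # Back to (1, gs_h * gs_w, D), then re-attach the token embeddings.
    posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1)
    return torch.cat([posemb_tok, posemb_grid], dim=1)

# Toy check: embeddings pretrained for a 24x24 grid, resized for a 384x512
# input with 16px patches (gs_h = 384 // 16 = 24, gs_w = 512 // 16 = 32).
pos_embed = torch.randn(1, 1 + 24 * 24, 768)
out = flat_resize_pos_embed(pos_embed, start_index=1, gs_h=24, gs_w=32)
print(out.shape)  # torch.Size([1, 769, 768]) == (1, 1 + 24 * 32, 768)

Making this a module-level function that takes the model as an argument, rather than a method injected onto the instance, leaves the computation itself identical to the old _resize_pos_embed path.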
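The lines commented out in _make_vit_b16_backbone and _make_vit_b_rn50_backbone used bound-method injection, the technique the in-diff comments describe. For reference, a small sketch of that pattern with types.MethodType; the Model and forward_flex names here are placeholders, not the repo's classes.

import types

class Model:
    pass

def forward_flex(self, x):
    # Once bound, `self` is the instance, exactly as in a normal method.
    return f"forward_flex({x!r}) on {type(self).__name__}"

m = Model()
# Attach the function to this one instance without modifying the class or
# the library source -- the trick described in the in-diff comments.
m.forward_flex = types.MethodType(forward_flex, m)
print(m.forward_flex("input"))  # forward_flex('input') on Model

After this change, flat_forward_flex calls the module-level _flat_resize_pos_embed directly, so the per-instance injection is no longer needed on that path.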