WeiChow committed
Commit
5245f4c
1 Parent(s): a39f167

Upload 2 files

Files changed (2)
  1. README.md +84 -3
  2. config.json +34 -0
README.md CHANGED
@@ -1,3 +1,84 @@
- ---
- license: apache-2.0
- ---
+ Same architecture as [timm/vit_large_patch14_dinov2.lvd142m](https://huggingface.co/timm/vit_large_patch14_dinov2.lvd142m).
+
+ Clone the Depth-Anything-V2 repository first, so that the `depth_anything_v2` package used below is importable:
+
+ ```shell
+ git clone https://github.com/DepthAnything/Depth-Anything-V2
+ cd Depth-Anything-V2
+ ```
+
+ # Translate
+
+ ```python
+ '''
+ Download a checkpoint first (use -O so the file is saved without the query string):
+ wget -O depth_anything_v2_vits.pth https://huggingface.co/depth-anything/Depth-Anything-V2-Small/resolve/main/depth_anything_v2_vits.pth?download=true
+ wget -O depth_anything_v2_vitb.pth https://huggingface.co/depth-anything/Depth-Anything-V2-Base/resolve/main/depth_anything_v2_vitb.pth?download=true
+ wget -O depth_anything_v2_vitl.pth https://huggingface.co/depth-anything/Depth-Anything-V2-Large/resolve/main/depth_anything_v2_vitl.pth?download=true
+ '''
+ import torch
+
+ from depth_anything_v2.dpt import DepthAnythingV2
+
+ DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
+
+ model_configs = {
+     'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
+     'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
+     'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
+ }
+
+ encoder = 'vitl'  # or 'vits', 'vitb' (this repo packages the ViT-L backbone; see config.json)
+
+ model = DepthAnythingV2(**model_configs[encoder])
+ model.load_state_dict(torch.load(f'depth_anything_v2_{encoder}.pth', map_location='cpu'))
+ vit = model.pretrained  # the DINOv2 ViT backbone
+
+ # Optionally inspect the backbone parameters:
+ # total_params = 0
+ # for name, param in vit.named_parameters():
+ #     print(f"Parameter: {name} - Size: {param.size()} - Total Elements: {param.numel()}")
+ #     total_params += param.numel()
+ # print(f"Total number of parameters in ViT: {total_params}")
+
+ # Drop the mask token: timm's ViT state dict has no mask_token entry
+ filtered_state_dict = {k: v for k, v in vit.state_dict().items() if 'mask_token' not in k}
+ torch.save(filtered_state_dict, "pytorch_model.bin")
+ ```
+
+ # Usage
+
+ ```python
+ from urllib.request import urlopen
+ from PIL import Image
+ import timm
+
+ img = Image.open(urlopen(
+     'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png'
+ ))
+
+ model = timm.create_model(
+     'vit_large_patch14_dinov2.lvd142m',
+     pretrained=True,
+     num_classes=0,  # remove classifier nn.Linear
+     checkpoint_path="pytorch_model.bin"  # the converted backbone saved above
+ )
+ model = model.eval()
+
+ # get model specific transforms (normalization, resize)
+ data_config = timm.data.resolve_model_data_config(model)
+ transforms = timm.data.create_transform(**data_config, is_training=False)
+
+ output = model(transforms(img).unsqueeze(0))  # output is (batch_size, num_features) shaped tensor
+
+ # or equivalently (without needing to set num_classes=0)
+ output = model.forward_features(transforms(img).unsqueeze(0))
+ # output is unpooled, a (1, 1370, 1024) shaped tensor (37x37 patch tokens + 1 class token)
+
+ output = model.forward_head(output, pre_logits=True)
+ print(output)
+ ```
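+
+ For dense prediction use cases, the patch tokens can be folded back into a 2-D feature map. A minimal sketch, assuming a 518×518 input (a 37×37 patch grid) and using timm's `num_prefix_tokens` attribute to strip the class token:
+
+ ```python
+ import torch
+
+ x = transforms(img).unsqueeze(0)  # (1, 3, 518, 518)
+ with torch.inference_mode():
+     feats = model.forward_features(x)
+
+ # Drop prefix (class) tokens, then reshape the rest into a spatial grid
+ patch_tokens = feats[:, model.num_prefix_tokens:]  # (1, 1369, 1024)
+ h = w = 518 // 14  # 37
+ fmap = patch_tokens.reshape(1, h, w, -1).permute(0, 3, 1, 2)  # (1, 1024, 37, 37)
+ print(fmap.shape)
+ ```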
+
+ Copyright reserved.
config.json ADDED
@@ -0,0 +1,34 @@
+ {
+   "architecture": "vit_large_patch14_dinov2",
+   "num_classes": 0,
+   "num_features": 1024,
+   "global_pool": "token",
+   "pretrained_cfg": {
+     "tag": "lvd142m",
+     "custom_load": false,
+     "input_size": [
+       3,
+       518,
+       518
+     ],
+     "fixed_input_size": true,
+     "interpolation": "bicubic",
+     "crop_pct": 1.0,
+     "crop_mode": "center",
+     "mean": [
+       0.485,
+       0.456,
+       0.406
+     ],
+     "std": [
+       0.229,
+       0.224,
+       0.225
+     ],
+     "num_classes": 0,
+     "pool_size": null,
+     "first_conv": "patch_embed.proj",
+     "classifier": "head",
+     "license": "cc-by-nc-4.0"
+   }
+ }
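
With recent timm versions (a sketch, assuming timm >= 0.9, where `create_model` accepts a `pretrained_cfg_overlay`), the converted weights can also be loaded by overriding the pretrained config to point at the local file instead of passing `checkpoint_path`:

```python
import timm

# Assumption: pytorch_model.bin is the converted checkpoint saved earlier
model = timm.create_model(
    'vit_large_patch14_dinov2.lvd142m',
    pretrained=True,
    num_classes=0,
    pretrained_cfg_overlay=dict(file='pytorch_model.bin'),
)
```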