WeiChow committed
Commit
5245f4c
1 Parent(s): a39f167

Upload 2 files

Files changed (2)
  1. README.md +84 -3
  2. config.json +34 -0
README.md CHANGED
@@ -1,3 +1,84 @@
- ---
- license: apache-2.0
- ---
+ Same architecture as [timm/vit_large_patch14_dinov2.lvd142m](https://huggingface.co/timm/vit_large_patch14_dinov2.lvd142m).
+
+ Clone the Depth-Anything-V2 repository first, so that the `depth_anything_v2` package used below is importable:
+
+ ```shell
+ git clone https://github.com/DepthAnything/Depth-Anything-V2
+ cd Depth-Anything-V2
+ ```
+
+ # Translate
+
+ ```python
+ '''
+ Download a checkpoint first (use -O so the file is saved without the query string):
+ wget -O depth_anything_v2_vits.pth https://huggingface.co/depth-anything/Depth-Anything-V2-Small/resolve/main/depth_anything_v2_vits.pth?download=true
+ wget -O depth_anything_v2_vitb.pth https://huggingface.co/depth-anything/Depth-Anything-V2-Base/resolve/main/depth_anything_v2_vitb.pth?download=true
+ wget -O depth_anything_v2_vitl.pth https://huggingface.co/depth-anything/Depth-Anything-V2-Large/resolve/main/depth_anything_v2_vitl.pth?download=true
+ '''
+ import torch
+
+ from depth_anything_v2.dpt import DepthAnythingV2
+
+ DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
+
+ model_configs = {
+     'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
+     'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
+     'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
+ }
+
+ encoder = 'vitl'  # or 'vits', 'vitb' (this repo packages the ViT-L backbone; see config.json)
+
+ model = DepthAnythingV2(**model_configs[encoder])
+ model.load_state_dict(torch.load(f'depth_anything_v2_{encoder}.pth', map_location='cpu'))
+ vit = model.pretrained  # the DINOv2 ViT backbone
+
+ # Optionally inspect the backbone parameters:
+ # total_params = 0
+ # for name, param in vit.named_parameters():
+ #     print(f"Parameter: {name} - Size: {param.size()} - Total Elements: {param.numel()}")
+ #     total_params += param.numel()
+ # print(f"Total number of parameters in ViT: {total_params}")
+
+ # Drop the mask token: timm's ViT state dict has no mask_token entry
+ filtered_state_dict = {k: v for k, v in vit.state_dict().items() if 'mask_token' not in k}
+ torch.save(filtered_state_dict, "pytorch_model.bin")
+ ```
+
+ # Usage
+
+ ```python
+ from urllib.request import urlopen
+ from PIL import Image
+ import timm
+
+ img = Image.open(urlopen(
+     'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png'
+ ))
+
+ model = timm.create_model(
+     'vit_large_patch14_dinov2.lvd142m',
+     pretrained=True,
+     num_classes=0,  # remove classifier nn.Linear
+     checkpoint_path="pytorch_model.bin"  # the converted backbone saved above
+ )
+ model = model.eval()
+
+ # get model specific transforms (normalization, resize)
+ data_config = timm.data.resolve_model_data_config(model)
+ transforms = timm.data.create_transform(**data_config, is_training=False)
+
+ output = model(transforms(img).unsqueeze(0))  # output is (batch_size, num_features) shaped tensor
+
+ # or equivalently (without needing to set num_classes=0)
+ output = model.forward_features(transforms(img).unsqueeze(0))
+ # output is unpooled, a (1, 1370, 1024) shaped tensor (37x37 patch tokens + 1 class token)
+
+ output = model.forward_head(output, pre_logits=True)
+ print(output)
+ ```
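+
+ For dense prediction use cases, the patch tokens can be folded back into a 2-D feature map. A minimal sketch, assuming a 518×518 input (a 37×37 patch grid) and using timm's `num_prefix_tokens` attribute to strip the class token:
+
+ ```python
+ import torch
+
+ x = transforms(img).unsqueeze(0)  # (1, 3, 518, 518)
+ with torch.inference_mode():
+     feats = model.forward_features(x)
+
+ # Drop prefix (class) tokens, then reshape the rest into a spatial grid
+ patch_tokens = feats[:, model.num_prefix_tokens:]  # (1, 1369, 1024)
+ h = w = 518 // 14  # 37
+ fmap = patch_tokens.reshape(1, h, w, -1).permute(0, 3, 1, 2)  # (1, 1024, 37, 37)
+ print(fmap.shape)
+ ```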
+
+ Copyright reserved.
config.json ADDED
@@ -0,0 +1,34 @@
+ {
+   "architecture": "vit_large_patch14_dinov2",
+   "num_classes": 0,
+   "num_features": 1024,
+   "global_pool": "token",
+   "pretrained_cfg": {
+     "tag": "lvd142m",
+     "custom_load": false,
+     "input_size": [
+       3,
+       518,
+       518
+     ],
+     "fixed_input_size": true,
+     "interpolation": "bicubic",
+     "crop_pct": 1.0,
+     "crop_mode": "center",
+     "mean": [
+       0.485,
+       0.456,
+       0.406
+     ],
+     "std": [
+       0.229,
+       0.224,
+       0.225
+     ],
+     "num_classes": 0,
+     "pool_size": null,
+     "first_conv": "patch_embed.proj",
+     "classifier": "head",
+     "license": "cc-by-nc-4.0"
+   }
+ }
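
With recent timm versions (a sketch, assuming timm >= 0.9, where `create_model` accepts a `pretrained_cfg_overlay`), the converted weights can also be loaded by overriding the pretrained config to point at the local file instead of passing `checkpoint_path`:

```python
import timm

# Assumption: pytorch_model.bin is the converted checkpoint saved earlier
model = timm.create_model(
    'vit_large_patch14_dinov2.lvd142m',
    pretrained=True,
    num_classes=0,
    pretrained_cfg_overlay=dict(file='pytorch_model.bin'),
)
```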