Upload 2 files
- README.md +84 -3
- config.json +34 -0
README.md
CHANGED
@@ -1,3 +1,84 @@
Same architecture as [timm/vit_large_patch14_dinov2.lvd142m](https://huggingface.co/timm/vit_large_patch14_dinov2.lvd142m).

```shell
git clone https://github.com/DepthAnything/Depth-Anything-V2
cd Depth-Anything-V2
```
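The translate script below imports `depth_anything_v2` directly, so it assumes it is run from inside the cloned repo. If you run it from elsewhere, one option (a sketch, the path is an assumption) is to put the repo root on `sys.path` first:

```python
import sys
sys.path.append("Depth-Anything-V2")  # hypothetical path to the cloned repo, adjust as needed
```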
# translate

```python
# Download one of the Depth Anything V2 checkpoints first (note: wget keeps the
# '?download=true' suffix in the saved filename, which the load path below relies on):
#
#   wget https://huggingface.co/depth-anything/Depth-Anything-V2-Small/resolve/main/depth_anything_v2_vits.pth?download=true
#   wget https://huggingface.co/depth-anything/Depth-Anything-V2-Base/resolve/main/depth_anything_v2_vitb.pth?download=true
#   wget https://huggingface.co/depth-anything/Depth-Anything-V2-Large/resolve/main/depth_anything_v2_vitl.pth?download=true

import torch

from depth_anything_v2.dpt import DepthAnythingV2

DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'

model_configs = {
    'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
    'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
    'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
}

encoder = 'vitl'  # must be 'vitl' to match vit_large_patch14_dinov2 ('vits'/'vitb' map to the smaller timm variants)

model = DepthAnythingV2(**model_configs[encoder])
model.load_state_dict(torch.load(f'depth_anything_v2_{encoder}.pth?download=true', map_location='cpu'))
vit = model.pretrained  # the DINOv2 ViT backbone inside Depth Anything V2

# total_params = 0
# for name, param in vit.named_parameters():
#     print(f"Parameter: {name} - Size: {param.size()} - Total Elements: {param.numel()}")
#     total_params += param.numel()
# print(f"Total number of parameters in ViT: {total_params}")

# Drop mask_token, which timm's ViT does not define, and save the rest as timm-style weights.
filtered_state_dict = {k: v for k, v in vit.state_dict().items() if 'mask_token' not in k}
torch.save(filtered_state_dict, "pytorch_model.bin")
```
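The conversion relies on the Depth Anything V2 backbone using the same parameter names as timm's DINOv2 ViT-L/14 (apart from `mask_token`, which is filtered out above). A minimal sanity check under that assumption:

```python
import torch
import timm

# Fresh timm model with no classifier head, used only as a key reference.
reference = timm.create_model('vit_large_patch14_dinov2.lvd142m', pretrained=False, num_classes=0)
converted = torch.load("pytorch_model.bin", map_location='cpu')

reference_keys = set(reference.state_dict().keys())
converted_keys = set(converted.keys())

# Both prints should show empty lists if the parameter names really line up.
print("missing from converted:", sorted(reference_keys - converted_keys))
print("unexpected in converted:", sorted(converted_keys - reference_keys))
```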
# usage

```python
from urllib.request import urlopen
from PIL import Image
import timm

img = Image.open(urlopen(
    'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png'
))

model = timm.create_model(
    'vit_large_patch14_dinov2.lvd142m',
    pretrained=True,
    num_classes=0,  # remove classifier nn.Linear
    checkpoint_path="pytorch_model.bin"  # the converted Depth Anything V2 backbone
)

# alternatively, load the converted weights into an existing model instance:
# model.load_state_dict(torch.load("pytorch_model.bin"))

# for name, param in model.named_parameters():
#     print(f"Parameter: {name} - Size: {param.size()} - Total Elements: {param.numel()}")
model = model.eval()

# get model specific transforms (normalization, resize)
data_config = timm.data.resolve_model_data_config(model)
transforms = timm.data.create_transform(**data_config, is_training=False)

output = model(transforms(img).unsqueeze(0))  # output is (batch_size, num_features) shaped tensor

# or equivalently (without needing to set num_classes=0)

output = model.forward_features(transforms(img).unsqueeze(0))
# output is unpooled, a (1, 1370, 1024) shaped tensor

output = model.forward_head(output, pre_logits=True)
print(output)
```
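For dense tasks (the setting this backbone comes from), the unpooled tokens can be reshaped into a 2D feature map. A small sketch, assuming the default 518x518 input (37x37 patch grid) and a single cls prefix token:

```python
feats = model.forward_features(transforms(img).unsqueeze(0))        # (1, 1370, 1024)
patch_tokens = feats[:, model.num_prefix_tokens:, :]                # drop the cls token -> (1, 1369, 1024)
feature_map = patch_tokens.transpose(1, 2).reshape(1, -1, 37, 37)   # (1, 1024, 37, 37)
print(feature_map.shape)
```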
Copyright remains with the original Depth Anything V2 / DINOv2 authors; the converted weights keep the upstream cc-by-nc-4.0 license (see `config.json`).
config.json
ADDED
@@ -0,0 +1,34 @@
{
    "architecture": "vit_large_patch14_dinov2",
    "num_classes": 0,
    "num_features": 1024,
    "global_pool": "token",
    "pretrained_cfg": {
        "tag": "lvd142m",
        "custom_load": false,
        "input_size": [
            3,
            518,
            518
        ],
        "fixed_input_size": true,
        "interpolation": "bicubic",
        "crop_pct": 1.0,
        "crop_mode": "center",
        "mean": [
            0.485,
            0.456,
            0.406
        ],
        "std": [
            0.229,
            0.224,
            0.225
        ],
        "num_classes": 0,
        "pool_size": null,
        "first_conv": "patch_embed.proj",
        "classifier": "head",
        "license": "cc-by-nc-4.0"
    }
}
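With `pytorch_model.bin` and this `config.json` in the repository, timm can also build the model straight from the Hub. A sketch, where the repo id is a placeholder for this repository's actual id:

```python
import timm

# 'your-username/your-repo' is a placeholder -- substitute the actual Hub repo id.
model = timm.create_model('hf-hub:your-username/your-repo', pretrained=True)
model = model.eval()

# input_size / mean / std are picked up from pretrained_cfg in config.json
data_config = timm.data.resolve_model_data_config(model)
transforms = timm.data.create_transform(**data_config, is_training=False)
```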