diff --git a/depth_anything_v2/__init__.py b/depth_anything_v2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8218212812027a181814e7d234f46caa80f5d92f --- /dev/null +++ b/depth_anything_v2/__init__.py @@ -0,0 +1 @@ +from .dpt import DepthAnythingV2 \ No newline at end of file diff --git a/depth_anything_v2/dinov2.py b/depth_anything_v2/dinov2.py new file mode 100644 index 0000000000000000000000000000000000000000..83d250818c721c6df3b30d3f4352945527701615 --- /dev/null +++ b/depth_anything_v2/dinov2.py @@ -0,0 +1,415 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
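# Usage sketch (illustrative only, not exercised elsewhere in this file): the DINOv2()
# factory defined at the bottom of this module builds a DinoVisionTransformer with a
# 14-pixel patch size, so inputs are assumed to be multiples of 14 (e.g. 518 = 37 * 14):
#
#   import torch
#   backbone = DINOv2("vitl").eval()
#   x = torch.randn(1, 3, 518, 518)
#   feats = backbone.get_intermediate_layers(x, n=[4, 11, 17, 23], return_class_token=True)
#   patch_tokens, cls_token = feats[0]   # shapes (1, 37 * 37, 1024) and (1, 1024)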
+ +# References: +# https://github.com/facebookresearch/dino/blob/main/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py + +from functools import partial +import math +import logging +from typing import Sequence, Tuple, Union, Callable + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from torch.nn.init import trunc_normal_ + +from .dinov2_layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block + + +logger = logging.getLogger("dinov2") + + +def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module: + if not depth_first and include_root: + fn(module=module, name=name) + for child_name, child_module in module.named_children(): + child_name = ".".join((name, child_name)) if name else child_name + named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True) + if depth_first and include_root: + fn(module=module, name=name) + return module + + +class BlockChunk(nn.ModuleList): + def forward(self, x): + for b in self: + x = b(x) + return x + + +class DinoVisionTransformer(nn.Module): + def __init__( + self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4.0, + qkv_bias=True, + ffn_bias=True, + proj_bias=True, + drop_path_rate=0.0, + drop_path_uniform=False, + init_values=None, # for layerscale: None or 0 => no layerscale + embed_layer=PatchEmbed, + act_layer=nn.GELU, + block_fn=Block, + ffn_layer="mlp", + block_chunks=1, + num_register_tokens=0, + interpolate_antialias=False, + interpolate_offset=0.1, + ): + """ + Args: + img_size (int, tuple): input image size + patch_size (int, tuple): patch size + in_chans (int): number of input channels + embed_dim (int): embedding dimension + depth (int): depth of transformer + num_heads (int): number of attention heads + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + proj_bias (bool): enable bias for proj in attn if True + ffn_bias (bool): enable bias for ffn if True + drop_path_rate (float): stochastic depth rate + drop_path_uniform (bool): apply uniform drop rate across blocks + weight_init (str): weight init scheme + init_values (float): layer-scale init values + embed_layer (nn.Module): patch embedding layer + act_layer (nn.Module): MLP activation layer + block_fn (nn.Module): transformer block class + ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity" + block_chunks: (int) split block sequence into block_chunks units for FSDP wrap + num_register_tokens: (int) number of extra cls tokens (so-called "registers") + interpolate_antialias: (str) flag to apply anti-aliasing when interpolating positional embeddings + interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings + """ + super().__init__() + norm_layer = partial(nn.LayerNorm, eps=1e-6) + + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.num_tokens = 1 + self.n_blocks = depth + self.num_heads = num_heads + self.patch_size = patch_size + self.num_register_tokens = num_register_tokens + self.interpolate_antialias = interpolate_antialias + self.interpolate_offset = interpolate_offset + + self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + self.cls_token = 
nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim)) + assert num_register_tokens >= 0 + self.register_tokens = ( + nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None + ) + + if drop_path_uniform is True: + dpr = [drop_path_rate] * depth + else: + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + + if ffn_layer == "mlp": + logger.info("using MLP layer as FFN") + ffn_layer = Mlp + elif ffn_layer == "swiglufused" or ffn_layer == "swiglu": + logger.info("using SwiGLU layer as FFN") + ffn_layer = SwiGLUFFNFused + elif ffn_layer == "identity": + logger.info("using Identity layer as FFN") + + def f(*args, **kwargs): + return nn.Identity() + + ffn_layer = f + else: + raise NotImplementedError + + blocks_list = [ + block_fn( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + proj_bias=proj_bias, + ffn_bias=ffn_bias, + drop_path=dpr[i], + norm_layer=norm_layer, + act_layer=act_layer, + ffn_layer=ffn_layer, + init_values=init_values, + ) + for i in range(depth) + ] + if block_chunks > 0: + self.chunked_blocks = True + chunked_blocks = [] + chunksize = depth // block_chunks + for i in range(0, depth, chunksize): + # this is to keep the block index consistent if we chunk the block list + chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize]) + self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks]) + else: + self.chunked_blocks = False + self.blocks = nn.ModuleList(blocks_list) + + self.norm = norm_layer(embed_dim) + self.head = nn.Identity() + + self.mask_token = nn.Parameter(torch.zeros(1, embed_dim)) + + self.init_weights() + + def init_weights(self): + trunc_normal_(self.pos_embed, std=0.02) + nn.init.normal_(self.cls_token, std=1e-6) + if self.register_tokens is not None: + nn.init.normal_(self.register_tokens, std=1e-6) + named_apply(init_weights_vit_timm, self) + + def interpolate_pos_encoding(self, x, w, h): + previous_dtype = x.dtype + npatch = x.shape[1] - 1 + N = self.pos_embed.shape[1] - 1 + if npatch == N and w == h: + return self.pos_embed + pos_embed = self.pos_embed.float() + class_pos_embed = pos_embed[:, 0] + patch_pos_embed = pos_embed[:, 1:] + dim = x.shape[-1] + w0 = w // self.patch_size + h0 = h // self.patch_size + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + # DINOv2 with register modify the interpolate_offset from 0.1 to 0.0 + w0, h0 = w0 + self.interpolate_offset, h0 + self.interpolate_offset + # w0, h0 = w0 + 0.1, h0 + 0.1 + + sqrt_N = math.sqrt(N) + sx, sy = float(w0) / sqrt_N, float(h0) / sqrt_N + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed.reshape(1, int(sqrt_N), int(sqrt_N), dim).permute(0, 3, 1, 2), + scale_factor=(sx, sy), + # (int(w0), int(h0)), # to solve the upsampling shape issue + mode="bicubic", + antialias=self.interpolate_antialias + ) + + assert int(w0) == patch_pos_embed.shape[-2] + assert int(h0) == patch_pos_embed.shape[-1] + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype) + + def prepare_tokens_with_masks(self, x, masks=None): + B, nc, w, h = x.shape + x = self.patch_embed(x) + if masks is not None: + x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x) + 
+ x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1) + x = x + self.interpolate_pos_encoding(x, w, h) + + if self.register_tokens is not None: + x = torch.cat( + ( + x[:, :1], + self.register_tokens.expand(x.shape[0], -1, -1), + x[:, 1:], + ), + dim=1, + ) + + return x + + def forward_features_list(self, x_list, masks_list): + x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)] + for blk in self.blocks: + x = blk(x) + + all_x = x + output = [] + for x, masks in zip(all_x, masks_list): + x_norm = self.norm(x) + output.append( + { + "x_norm_clstoken": x_norm[:, 0], + "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1], + "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :], + "x_prenorm": x, + "masks": masks, + } + ) + return output + + def forward_features(self, x, masks=None): + if isinstance(x, list): + return self.forward_features_list(x, masks) + + x = self.prepare_tokens_with_masks(x, masks) + + for blk in self.blocks: + x = blk(x) + + x_norm = self.norm(x) + return { + "x_norm_clstoken": x_norm[:, 0], + "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1], + "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :], + "x_prenorm": x, + "masks": masks, + } + + def _get_intermediate_layers_not_chunked(self, x, n=1): + x = self.prepare_tokens_with_masks(x) + # If n is an int, take the n last blocks. If it's a list, take them + output, total_block_len = [], len(self.blocks) + blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n + for i, blk in enumerate(self.blocks): + x = blk(x) + if i in blocks_to_take: + output.append(x) + assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found" + return output + + def _get_intermediate_layers_chunked(self, x, n=1): + x = self.prepare_tokens_with_masks(x) + output, i, total_block_len = [], 0, len(self.blocks[-1]) + # If n is an int, take the n last blocks. 
If it's a list, take them + blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n + for block_chunk in self.blocks: + for blk in block_chunk[i:]: # Passing the nn.Identity() + x = blk(x) + if i in blocks_to_take: + output.append(x) + i += 1 + assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found" + return output + + def get_intermediate_layers( + self, + x: torch.Tensor, + n: Union[int, Sequence] = 1, # Layers or n last layers to take + reshape: bool = False, + return_class_token: bool = False, + norm=True + ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]: + if self.chunked_blocks: + outputs = self._get_intermediate_layers_chunked(x, n) + else: + outputs = self._get_intermediate_layers_not_chunked(x, n) + if norm: + outputs = [self.norm(out) for out in outputs] + class_tokens = [out[:, 0] for out in outputs] + outputs = [out[:, 1 + self.num_register_tokens:] for out in outputs] + if reshape: + B, _, w, h = x.shape + outputs = [ + out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous() + for out in outputs + ] + if return_class_token: + return tuple(zip(outputs, class_tokens)) + return tuple(outputs) + + def forward(self, *args, is_training=False, **kwargs): + ret = self.forward_features(*args, **kwargs) + if is_training: + return ret + else: + return self.head(ret["x_norm_clstoken"]) + + +def init_weights_vit_timm(module: nn.Module, name: str = ""): + """ViT weight initialization, original timm impl (for reproducibility)""" + if isinstance(module, nn.Linear): + trunc_normal_(module.weight, std=0.02) + if module.bias is not None: + nn.init.zeros_(module.bias) + + +def vit_small(patch_size=16, num_register_tokens=0, **kwargs): + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=384, + depth=12, + num_heads=6, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + num_register_tokens=num_register_tokens, + **kwargs, + ) + return model + + +def vit_base(patch_size=16, num_register_tokens=0, **kwargs): + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + num_register_tokens=num_register_tokens, + **kwargs, + ) + return model + + +def vit_large(patch_size=16, num_register_tokens=0, **kwargs): + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + num_register_tokens=num_register_tokens, + **kwargs, + ) + return model + + +def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs): + """ + Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64 + """ + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=1536, + depth=40, + num_heads=24, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + num_register_tokens=num_register_tokens, + **kwargs, + ) + return model + + +def DINOv2(model_name): + model_zoo = { + "vits": vit_small, + "vitb": vit_base, + "vitl": vit_large, + "vitg": vit_giant2 + } + + return model_zoo[model_name]( + img_size=518, + patch_size=14, + init_values=1.0, + ffn_layer="mlp" if model_name != "vitg" else "swiglufused", + block_chunks=0, + num_register_tokens=0, + interpolate_antialias=False, + interpolate_offset=0.1 + ) diff --git a/depth_anything_v2/dinov2_layers/__init__.py b/depth_anything_v2/dinov2_layers/__init__.py 
new file mode 100644 index 0000000000000000000000000000000000000000..8120f4bc83066cb3f825ce32daa3b437f88486f1 --- /dev/null +++ b/depth_anything_v2/dinov2_layers/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from .mlp import Mlp +from .patch_embed import PatchEmbed +from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused +from .block import NestedTensorBlock +from .attention import MemEffAttention diff --git 
a/depth_anything_v2/dinov2_layers/attention.py b/depth_anything_v2/dinov2_layers/attention.py new file mode 100644 index 0000000000000000000000000000000000000000..815a2bf53dbec496f6a184ed7d03bcecb7124262 --- /dev/null +++ b/depth_anything_v2/dinov2_layers/attention.py @@ -0,0 +1,83 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py + +import logging + +from torch import Tensor +from torch import nn + + +logger = logging.getLogger("dinov2") + + +try: + from xformers.ops import memory_efficient_attention, unbind, fmha + + XFORMERS_AVAILABLE = True +except ImportError: + logger.warning("xFormers not available") + XFORMERS_AVAILABLE = False + + +class Attention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + proj_bias: bool = True, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + ) -> None: + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim, bias=proj_bias) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x: Tensor) -> Tensor: + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + + q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] + attn = q @ k.transpose(-2, -1) + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class MemEffAttention(Attention): + def forward(self, x: Tensor, attn_bias=None) -> Tensor: + if not XFORMERS_AVAILABLE: + assert attn_bias is None, "xFormers is required for nested tensors usage" + return super().forward(x) + + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads) + + q, k, v = unbind(qkv, 2) + + x = memory_efficient_attention(q, k, v, attn_bias=attn_bias) + x = x.reshape([B, N, C]) + + x = self.proj(x) + x = self.proj_drop(x) + return x + + \ No newline at end of file diff --git a/depth_anything_v2/dinov2_layers/block.py b/depth_anything_v2/dinov2_layers/block.py new file mode 100644 index 0000000000000000000000000000000000000000..25488f57cc0ad3c692f86b62555f6668e2a66db1 --- /dev/null +++ b/depth_anything_v2/dinov2_layers/block.py @@ -0,0 +1,252 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
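# Batch-level stochastic depth, in brief: drop_add_residual_stochastic_depth() below
# evaluates a residual branch on a random subset of max(int(b * (1 - p)), 1) samples
# and index-adds the result back scaled by b / subset_size, so the residual matches
# the full batch in expectation. Equivalent sketch (illustrative only):
#
#   keep = max(int(b * (1 - p)), 1)
#   idx = torch.randperm(b, device=x.device)[:keep]
#   out = x.clone()
#   out[idx] = x[idx] + (b / keep) * residual_func(x[idx])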
+ +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py + +import logging +from typing import Callable, List, Any, Tuple, Dict + +import torch +from torch import nn, Tensor + +from .attention import Attention, MemEffAttention +from .drop_path import DropPath +from .layer_scale import LayerScale +from .mlp import Mlp + + +logger = logging.getLogger("dinov2") + + +try: + from xformers.ops import fmha + from xformers.ops import scaled_index_add, index_select_cat + + XFORMERS_AVAILABLE = True +except ImportError: + logger.warning("xFormers not available") + XFORMERS_AVAILABLE = False + + +class Block(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + mlp_ratio: float = 4.0, + qkv_bias: bool = False, + proj_bias: bool = True, + ffn_bias: bool = True, + drop: float = 0.0, + attn_drop: float = 0.0, + init_values=None, + drop_path: float = 0.0, + act_layer: Callable[..., nn.Module] = nn.GELU, + norm_layer: Callable[..., nn.Module] = nn.LayerNorm, + attn_class: Callable[..., nn.Module] = Attention, + ffn_layer: Callable[..., nn.Module] = Mlp, + ) -> None: + super().__init__() + # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}") + self.norm1 = norm_layer(dim) + self.attn = attn_class( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + proj_bias=proj_bias, + attn_drop=attn_drop, + proj_drop=drop, + ) + self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity() + self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = ffn_layer( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + bias=ffn_bias, + ) + self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity() + self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + + self.sample_drop_ratio = drop_path + + def forward(self, x: Tensor) -> Tensor: + def attn_residual_func(x: Tensor) -> Tensor: + return self.ls1(self.attn(self.norm1(x))) + + def ffn_residual_func(x: Tensor) -> Tensor: + return self.ls2(self.mlp(self.norm2(x))) + + if self.training and self.sample_drop_ratio > 0.1: + # the overhead is compensated only for a drop path rate larger than 0.1 + x = drop_add_residual_stochastic_depth( + x, + residual_func=attn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + ) + x = drop_add_residual_stochastic_depth( + x, + residual_func=ffn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + ) + elif self.training and self.sample_drop_ratio > 0.0: + x = x + self.drop_path1(attn_residual_func(x)) + x = x + self.drop_path1(ffn_residual_func(x)) # FIXME: drop_path2 + else: + x = x + attn_residual_func(x) + x = x + ffn_residual_func(x) + return x + + +def drop_add_residual_stochastic_depth( + x: Tensor, + residual_func: Callable[[Tensor], Tensor], + sample_drop_ratio: float = 0.0, +) -> Tensor: + # 1) extract subset using permutation + b, n, d = x.shape + sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1) + brange = (torch.randperm(b, device=x.device))[:sample_subset_size] + x_subset = x[brange] + + # 2) apply residual_func to get residual + residual = residual_func(x_subset) + + x_flat = x.flatten(1) + residual = residual.flatten(1) + + residual_scale_factor = b / sample_subset_size + + # 3) add the residual + x_plus_residual = 
torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor) + return x_plus_residual.view_as(x) + + +def get_branges_scales(x, sample_drop_ratio=0.0): + b, n, d = x.shape + sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1) + brange = (torch.randperm(b, device=x.device))[:sample_subset_size] + residual_scale_factor = b / sample_subset_size + return brange, residual_scale_factor + + +def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None): + if scaling_vector is None: + x_flat = x.flatten(1) + residual = residual.flatten(1) + x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor) + else: + x_plus_residual = scaled_index_add( + x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor + ) + return x_plus_residual + + +attn_bias_cache: Dict[Tuple, Any] = {} + + +def get_attn_bias_and_cat(x_list, branges=None): + """ + this will perform the index select, cat the tensors, and provide the attn_bias from cache + """ + batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list] + all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list)) + if all_shapes not in attn_bias_cache.keys(): + seqlens = [] + for b, x in zip(batch_sizes, x_list): + for _ in range(b): + seqlens.append(x.shape[1]) + attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens) + attn_bias._batch_sizes = batch_sizes + attn_bias_cache[all_shapes] = attn_bias + + if branges is not None: + cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1]) + else: + tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list) + cat_tensors = torch.cat(tensors_bs1, dim=1) + + return attn_bias_cache[all_shapes], cat_tensors + + +def drop_add_residual_stochastic_depth_list( + x_list: List[Tensor], + residual_func: Callable[[Tensor, Any], Tensor], + sample_drop_ratio: float = 0.0, + scaling_vector=None, +) -> Tensor: + # 1) generate random set of indices for dropping samples in the batch + branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list] + branges = [s[0] for s in branges_scales] + residual_scale_factors = [s[1] for s in branges_scales] + + # 2) get attention bias and index+concat the tensors + attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges) + + # 3) apply residual_func to get residual, and split the result + residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias)) # type: ignore + + outputs = [] + for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors): + outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x)) + return outputs + + +class NestedTensorBlock(Block): + def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]: + """ + x_list contains a list of tensors to nest together and run + """ + assert isinstance(self.attn, MemEffAttention) + + if self.training and self.sample_drop_ratio > 0.0: + + def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.attn(self.norm1(x), attn_bias=attn_bias) + + def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.mlp(self.norm2(x)) + + x_list = drop_add_residual_stochastic_depth_list( + x_list, + residual_func=attn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + scaling_vector=self.ls1.gamma if isinstance(self.ls1, 
LayerScale) else None, + ) + x_list = drop_add_residual_stochastic_depth_list( + x_list, + residual_func=ffn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + scaling_vector=self.ls2.gamma if isinstance(self.ls1, LayerScale) else None, + ) + return x_list + else: + + def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias)) + + def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.ls2(self.mlp(self.norm2(x))) + + attn_bias, x = get_attn_bias_and_cat(x_list) + x = x + attn_residual_func(x, attn_bias=attn_bias) + x = x + ffn_residual_func(x) + return attn_bias.split(x) + + def forward(self, x_or_x_list): + if isinstance(x_or_x_list, Tensor): + return super().forward(x_or_x_list) + elif isinstance(x_or_x_list, list): + assert XFORMERS_AVAILABLE, "Please install xFormers for nested tensors usage" + return self.forward_nested(x_or_x_list) + else: + raise AssertionError diff --git a/depth_anything_v2/dinov2_layers/drop_path.py b/depth_anything_v2/dinov2_layers/drop_path.py new file mode 100644 index 0000000000000000000000000000000000000000..af05625984dd14682cc96a63bf0c97bab1f123b1 --- /dev/null +++ b/depth_anything_v2/dinov2_layers/drop_path.py @@ -0,0 +1,35 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py + + +from torch import nn + + +def drop_path(x, drop_prob: float = 0.0, training: bool = False): + if drop_prob == 0.0 or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = x.new_empty(shape).bernoulli_(keep_prob) + if keep_prob > 0.0: + random_tensor.div_(keep_prob) + output = x * random_tensor + return output + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) diff --git a/depth_anything_v2/dinov2_layers/layer_scale.py b/depth_anything_v2/dinov2_layers/layer_scale.py new file mode 100644 index 0000000000000000000000000000000000000000..ca5daa52bd81d3581adeb2198ea5b7dba2a3aea1 --- /dev/null +++ b/depth_anything_v2/dinov2_layers/layer_scale.py @@ -0,0 +1,28 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
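# LayerScale, in brief: y = gamma * x with a learnable per-channel gamma initialized
# to a small constant (init_values), so deep residual branches start close to zero
# and each block begins near the identity. Illustrative sketch:
#
#   ls = LayerScale(dim=1024, init_values=1e-5)
#   y = ls(torch.randn(2, 1369, 1024))   # same shape; channel c is scaled by gamma[c]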
+ +# Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110 + +from typing import Union + +import torch +from torch import Tensor +from torch import nn + + +class LayerScale(nn.Module): + def __init__( + self, + dim: int, + init_values: Union[float, Tensor] = 1e-5, + inplace: bool = False, + ) -> None: + super().__init__() + self.inplace = inplace + self.gamma = nn.Parameter(init_values * torch.ones(dim)) + + def forward(self, x: Tensor) -> Tensor: + return x.mul_(self.gamma) if self.inplace else x * self.gamma diff --git a/depth_anything_v2/dinov2_layers/mlp.py b/depth_anything_v2/dinov2_layers/mlp.py new file mode 100644 index 0000000000000000000000000000000000000000..5e4b315f972f9a9f54aef1e4ef4e81b52976f018 --- /dev/null +++ b/depth_anything_v2/dinov2_layers/mlp.py @@ -0,0 +1,41 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py + + +from typing import Callable, Optional + +from torch import Tensor, nn + + +class Mlp(nn.Module): + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = nn.GELU, + drop: float = 0.0, + bias: bool = True, + ) -> None: + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features, bias=bias) + self.drop = nn.Dropout(drop) + + def forward(self, x: Tensor) -> Tensor: + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x diff --git a/depth_anything_v2/dinov2_layers/patch_embed.py b/depth_anything_v2/dinov2_layers/patch_embed.py new file mode 100644 index 0000000000000000000000000000000000000000..574abe41175568d700a389b8b96d1ba554914779 --- /dev/null +++ b/depth_anything_v2/dinov2_layers/patch_embed.py @@ -0,0 +1,89 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py + +from typing import Callable, Optional, Tuple, Union + +from torch import Tensor +import torch.nn as nn + + +def make_2tuple(x): + if isinstance(x, tuple): + assert len(x) == 2 + return x + + assert isinstance(x, int) + return (x, x) + + +class PatchEmbed(nn.Module): + """ + 2D image to patch embedding: (B,C,H,W) -> (B,N,D) + + Args: + img_size: Image size. + patch_size: Patch token size. + in_chans: Number of input image channels. + embed_dim: Number of linear projection output channels. + norm_layer: Normalization layer. 
+ """ + + def __init__( + self, + img_size: Union[int, Tuple[int, int]] = 224, + patch_size: Union[int, Tuple[int, int]] = 16, + in_chans: int = 3, + embed_dim: int = 768, + norm_layer: Optional[Callable] = None, + flatten_embedding: bool = True, + ) -> None: + super().__init__() + + image_HW = make_2tuple(img_size) + patch_HW = make_2tuple(patch_size) + patch_grid_size = ( + image_HW[0] // patch_HW[0], + image_HW[1] // patch_HW[1], + ) + + self.img_size = image_HW + self.patch_size = patch_HW + self.patches_resolution = patch_grid_size + self.num_patches = patch_grid_size[0] * patch_grid_size[1] + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.flatten_embedding = flatten_embedding + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW) + self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() + + def forward(self, x: Tensor) -> Tensor: + _, _, H, W = x.shape + patch_H, patch_W = self.patch_size + + assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}" + assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}" + + x = self.proj(x) # B C H W + H, W = x.size(2), x.size(3) + x = x.flatten(2).transpose(1, 2) # B HW C + x = self.norm(x) + if not self.flatten_embedding: + x = x.reshape(-1, H, W, self.embed_dim) # B H W C + return x + + def flops(self) -> float: + Ho, Wo = self.patches_resolution + flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1]) + if self.norm is not None: + flops += Ho * Wo * self.embed_dim + return flops diff --git a/depth_anything_v2/dinov2_layers/swiglu_ffn.py b/depth_anything_v2/dinov2_layers/swiglu_ffn.py new file mode 100644 index 0000000000000000000000000000000000000000..b3324b266fb0a50ccf8c3a0ede2ae10ac4dfa03e --- /dev/null +++ b/depth_anything_v2/dinov2_layers/swiglu_ffn.py @@ -0,0 +1,63 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import Callable, Optional + +from torch import Tensor, nn +import torch.nn.functional as F + + +class SwiGLUFFN(nn.Module): + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = None, + drop: float = 0.0, + bias: bool = True, + ) -> None: + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias) + self.w3 = nn.Linear(hidden_features, out_features, bias=bias) + + def forward(self, x: Tensor) -> Tensor: + x12 = self.w12(x) + x1, x2 = x12.chunk(2, dim=-1) + hidden = F.silu(x1) * x2 + return self.w3(hidden) + + +try: + from xformers.ops import SwiGLU + + XFORMERS_AVAILABLE = True +except ImportError: + SwiGLU = SwiGLUFFN + XFORMERS_AVAILABLE = False + + +class SwiGLUFFNFused(SwiGLU): + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = None, + drop: float = 0.0, + bias: bool = True, + ) -> None: + out_features = out_features or in_features + hidden_features = hidden_features or in_features + hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 + super().__init__( + in_features=in_features, + hidden_features=hidden_features, + out_features=out_features, + bias=bias, + ) diff --git a/depth_anything_v2/dpt.py b/depth_anything_v2/dpt.py new file mode 100644 index 0000000000000000000000000000000000000000..f27f5a97a4b17747d170be136ececf839d062c56 --- /dev/null +++ b/depth_anything_v2/dpt.py @@ -0,0 +1,224 @@ +import pdb + +import cv2 +import torch +import torch.nn as nn +import torch.nn.functional as F +from torchvision.transforms import Compose +import numpy as np + +from .dinov2 import DINOv2 +from .util.blocks import FeatureFusionBlock, _make_scratch +from .util.transform import Resize, NormalizeImage, PrepareForNet + + +def _make_fusion_block(features, use_bn, size=None): + return FeatureFusionBlock( + features, + nn.ReLU(False), + deconv=False, + bn=use_bn, + expand=False, + align_corners=True, + size=size, + ) + + +class ConvBlock(nn.Module): + def __init__(self, in_feature, out_feature): + super().__init__() + + self.conv_block = nn.Sequential( + nn.Conv2d(in_feature, out_feature, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2d(out_feature), + nn.ReLU(True) + ) + + def forward(self, x): + return self.conv_block(x) + + +class DPTHead(nn.Module): + def __init__( + self, + in_channels, + features=256, + use_bn=False, + out_channels=[256, 512, 1024, 1024], + use_clstoken=False + ): + super(DPTHead, self).__init__() + + self.use_clstoken = use_clstoken + + self.projects = nn.ModuleList([ + nn.Conv2d( + in_channels=in_channels, + out_channels=out_channel, + kernel_size=1, + stride=1, + padding=0, + ) for out_channel in out_channels + ]) + + self.resize_layers = nn.ModuleList([ + nn.ConvTranspose2d( + in_channels=out_channels[0], + out_channels=out_channels[0], + kernel_size=4, + stride=4, + padding=0), + nn.ConvTranspose2d( + in_channels=out_channels[1], + out_channels=out_channels[1], + kernel_size=2, + stride=2, + padding=0), + nn.Identity(), + nn.Conv2d( + in_channels=out_channels[3], + out_channels=out_channels[3], + kernel_size=3, + stride=2, + padding=1) + ]) + + if use_clstoken: + self.readout_projects = nn.ModuleList() + for _ in range(len(self.projects)): + self.readout_projects.append( + nn.Sequential( + nn.Linear(2 * 
in_channels, in_channels), + nn.GELU())) + + self.scratch = _make_scratch( + out_channels, + features, + groups=1, + expand=False, + ) + + self.scratch.stem_transpose = None + + self.scratch.refinenet1 = _make_fusion_block(features, use_bn) + self.scratch.refinenet2 = _make_fusion_block(features, use_bn) + self.scratch.refinenet3 = _make_fusion_block(features, use_bn) + self.scratch.refinenet4 = _make_fusion_block(features, use_bn) + + head_features_1 = features + head_features_2 = 32 + + self.scratch.output_conv1 = nn.Conv2d(head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1) + self.scratch.output_conv2 = nn.Sequential( + nn.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1), + nn.ReLU(True), + nn.Conv2d(head_features_2, 1, kernel_size=1, stride=1, padding=0), + nn.ReLU(True), + nn.Identity(), + ) + + def forward(self, out_features, patch_h, patch_w): + out = [] + for i, x in enumerate(out_features): + if self.use_clstoken: + x, cls_token = x[0], x[1] + readout = cls_token.unsqueeze(1).expand_as(x) + x = self.readout_projects[i](torch.cat((x, readout), -1)) + else: + x = x[0] + + x = x.permute(0, 2, 1).reshape((x.shape[0], x.shape[-1], patch_h, patch_w)) + + x = self.projects[i](x) + x = self.resize_layers[i](x) + + out.append(x) + + layer_1, layer_2, layer_3, layer_4 = out + + layer_1_rn = self.scratch.layer1_rn(layer_1) + layer_2_rn = self.scratch.layer2_rn(layer_2) + layer_3_rn = self.scratch.layer3_rn(layer_3) + layer_4_rn = self.scratch.layer4_rn(layer_4) + + path_4 = self.scratch.refinenet4(layer_4_rn, size=layer_3_rn.shape[2:]) + path_3 = self.scratch.refinenet3(path_4, layer_3_rn, size=layer_2_rn.shape[2:]) + path_2 = self.scratch.refinenet2(path_3, layer_2_rn, size=layer_1_rn.shape[2:]) + path_1 = self.scratch.refinenet1(path_2, layer_1_rn) + + out = self.scratch.output_conv1(path_1) + out = F.interpolate(out, (int(patch_h * 14), int(patch_w * 14)), mode="bilinear", align_corners=True) + out = self.scratch.output_conv2(out) + + return out + +class DepthAnythingV2(nn.Module): + def __init__( + self, + encoder='vitl', + features=256, + out_channels=[256, 512, 1024, 1024], + use_bn=False, + use_clstoken=False + ): + super(DepthAnythingV2, self).__init__() + + self.intermediate_layer_idx = { + 'vits': [2, 5, 8, 11], + 'vitb': [2, 5, 8, 11], + 'vitl': [4, 11, 17, 23], + 'vitg': [9, 19, 29, 39] + } + + self.encoder = encoder + self.pretrained = DINOv2(model_name=encoder) + + self.depth_head = DPTHead(self.pretrained.embed_dim, features, use_bn, out_channels=out_channels, use_clstoken=use_clstoken) + + @torch.no_grad() + def forward(self, image, input_size=518, device='cuda:0'): + x, (h, w) = self.image2tensor(image, input_size, device) + + patch_h, patch_w = x.shape[-2] // 14, x.shape[-1] // 14 + features = self.pretrained.get_intermediate_layers(x, self.intermediate_layer_idx[self.encoder], return_class_token=True) + + depth = self.depth_head(features, patch_h, patch_w) + depth = F.relu(depth).squeeze(1) + depth = F.interpolate(depth[:, None], (h, w), mode="bilinear", align_corners=True).squeeze() + return depth + + @torch.no_grad() + def infer_image(self, raw_image, input_size=518): + image, (h, w) = self.image2tensor(raw_image, input_size) + + depth = self.forward(image) + + depth = F.interpolate(depth[:, None], (h, w), mode="bilinear", align_corners=True)[0, 0] + + return depth + + def image2tensor(self, raw_image, input_size=518, device='cuda'): + transform = Compose([ + Resize( + width=input_size, + height=input_size, + 
resize_target=False, + keep_aspect_ratio=True, + ensure_multiple_of=14, + resize_method='lower_bound', + image_interpolation_method=cv2.INTER_CUBIC, + ), + NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + PrepareForNet(), + ]) + # raw_image (bs, 3, h, w) + h, w = raw_image.shape[-2:] + raw_image = np.moveaxis(raw_image, 1, -1) + images = [] + for i, single_image in enumerate(raw_image): + image = cv2.cvtColor(single_image, cv2.COLOR_BGR2RGB) / 255.0 + image = transform({'image': image})['image'] + images.append(torch.from_numpy(image)) + images = torch.stack(images, dim=0) + images = images.to(device) + return images, (h, w) diff --git a/depth_anything_v2/moments_dataset.py b/depth_anything_v2/moments_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..7cf988b3a81da05f321fc34cfb7f2d36328c9a00 --- /dev/null +++ b/depth_anything_v2/moments_dataset.py @@ -0,0 +1,54 @@ +# Copyright 2024 Adobe. All rights reserved. + +#%% +import glob +import torch +import torchvision +import matplotlib.pyplot as plt +from torch.utils.data import Dataset +import numpy as np + + +# %% +class MomentsDataset(Dataset): + def __init__(self, videos_folder, num_frames, samples_per_video, frame_size=512) -> None: + super().__init__() + + self.videos_paths = glob.glob(f'{videos_folder}/*mp4') + self.resize = torchvision.transforms.Resize(size=frame_size) + self.center_crop = torchvision.transforms.CenterCrop(size=frame_size) + self.num_samples_per_video = samples_per_video + self.num_frames = num_frames + + def __len__(self): + return len(self.videos_paths) * self.num_samples_per_video + + def __getitem__(self, idx): + video_idx = idx // self.num_samples_per_video + video_path = self.videos_paths[video_idx] + + try: + start_idx = np.random.randint(0, 20) + + unsampled_video_frames, audio_frames, info = torchvision.io.read_video(video_path,output_format="TCHW") + sampled_indices = torch.tensor(np.linspace(start_idx, len(unsampled_video_frames)-1, self.num_frames).astype(int)) + sampled_frames = unsampled_video_frames[sampled_indices] + processed_frames = [] + + for frame in sampled_frames: + resized_cropped_frame = self.center_crop(self.resize(frame)) + processed_frames.append(resized_cropped_frame) + frames = torch.stack(processed_frames, dim=0) + frames = frames.float() / 255.0 + except Exception as e: + print('oops', e) + rand_idx = np.random.randint(0, len(self)) + return self.__getitem__(rand_idx) + + out_dict = {'frames': frames, + 'caption': 'none', + 'keywords': 'none'} + + return out_dict + + diff --git a/depth_anything_v2/moments_processing.py b/depth_anything_v2/moments_processing.py new file mode 100644 index 0000000000000000000000000000000000000000..88ca77265443bf9f2f8b39973378e218bd986f52 --- /dev/null +++ b/depth_anything_v2/moments_processing.py @@ -0,0 +1,345 @@ +# Copyright 2024 Adobe. All rights reserved. 
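# Pipeline overview (as implemented below): sample frame windows from each video,
# aggregate optical flow (RAFT, via processing_utils) between a source frame and a
# later target frame, estimate depth and SAM segments for the source, fit a
# per-segment affine warp to the flow to build a depth-ordered collage, forward-splat
# the source with the aggregated flow, and save the warped images, masks and
# coordinate grids as training pairs. Minimal invocation sketch (paths illustrative):
#
#   import pathlib
#   process_video_folder('./example_videos', pathlib.Path('./processed_data'))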
+ +#%% +import numpy as np +import torchvision +import cv2 +import tqdm +import torchvision.transforms.functional as F +from PIL import Image +from torchvision.utils import save_image +import time +import os +import pathlib +from torch.utils.data import DataLoader +# %matplotlib inline +from kornia.filters.median import MedianBlur + +median_filter = MedianBlur(kernel_size=(15,15)) +from moments_dataset import MomentsDataset + +try: + from processing_utils import aggregate_frames + import processing_utils +except Exception as e: + print(e) + print('process failed') + exit() + +import torch + + +# %% + +def load_image(img_path, resize_size=None,crop_size=None): + + img1_pil = Image.open(img_path) + img1_frames = torchvision.transforms.functional.pil_to_tensor(img1_pil) + + if resize_size: + img1_frames = torchvision.transforms.functional.resize(img1_frames, resize_size) + + if crop_size: + img1_frames = torchvision.transforms.functional.center_crop(img1_frames, crop_size) + + img1_batch = torch.unsqueeze(img1_frames, dim=0) + + return img1_batch + +def get_grid(size): + y = np.repeat(np.arange(size)[None, ...], size) + y = y.reshape(size, size) + x = y.transpose() + out = np.stack([y,x], -1) + return out + +def collage_from_frames(frames_t): + # decide forward or backward + if np.random.randint(0, 2) == 0: + # flip + frames_t = frames_t.flip(0) + + # decide how deep you would go + tgt_idx_guess = np.random.randint(1, min(len(frames_t), 20)) + tgt_idx = 1 + pairwise_flows = [] + flow = None + init_time = time.time() + unsmoothed_agg = None + for cur_idx in range(1, tgt_idx_guess+1): + # cur_idx = i+1 + cur_flow, pairwise_flows = aggregate_frames(frames_t[:cur_idx+1] , pairwise_flows, unsmoothed_agg) # passing pairwise flows for efficiency + unsmoothed_agg = cur_flow.clone() + agg_cur_flow = median_filter(cur_flow) + + flow_norm = torch.norm(agg_cur_flow.squeeze(), dim=0).flatten() + # flow_10 = np.percentile(flow_norm.cpu().numpy(), 10) + flow_90 = np.percentile(flow_norm.cpu().numpy(), 90) + + # flow_10 = np.percentile(flow_norm.cpu().numpy(), 10) + flow_90 = np.percentile(flow_norm.cpu().numpy(), 90) + flow_95 = np.percentile(flow_norm.cpu().numpy(), 95) + + if cur_idx == 5: # if still small flow then drop + if flow_95 < 20.0: + # no motion in the frame. skip + print('flow is tiny :(') + return None + + if cur_idx == tgt_idx_guess-1: # if still small flow then drop + if flow_95 < 50.0: + # no motion in the frame. skip + print('flow is tiny :(') + return None + + if flow is None: # means first iter + if flow_90 < 1.0: + # no motion in the frame. 
skip + return None + flow = agg_cur_flow + + if flow_90 <= 300: # maybe should increase this part + # update idx + tgt_idx = cur_idx + flow = agg_cur_flow + else: + break + final_time = time.time() + print('time guessing idx', final_time - init_time) + + _, flow_warping_mask = processing_utils.forward_warp(frames_t[0], frames_t[tgt_idx], flow, grid=None, alpha_mask=None) + flow_warping_mask = flow_warping_mask.squeeze().numpy() > 0.5 + + if np.mean(flow_warping_mask) < 0.6: + return + + + src_array = frames_t[0].moveaxis(0, -1).cpu().numpy() * 1.0 + init_time = time.time() + depth = get_depth_from_array(frames_t[0]) + finish_time = time.time() + print('time getting depth', finish_time - init_time) + # flow, pairwise_flows = aggregate_frames(frames_t) + # agg_flow = median_filter(flow) + + src_array_uint = src_array * 255.0 + src_array_uint = src_array_uint.astype(np.uint8) + segments = processing_utils.mask_generator.generate(src_array_uint) + + size = src_array.shape[1] + grid_np = get_grid(size).astype(np.float16) / size # 512 x 512 x 2get + grid_t = torch.tensor(grid_np).moveaxis(-1, 0) # 512 x 512 x 2 + + + collage, canvas_alpha, lost_alpha = collage_warp(src_array, flow.squeeze(), depth, segments, grid_array=grid_np) + lost_alpha_t = torch.tensor(lost_alpha).squeeze().unsqueeze(0) + warping_alpha = (lost_alpha_t < 0.5).float() + + rgb_grid_splatted, actual_warped_mask = processing_utils.forward_warp(frames_t[0], frames_t[tgt_idx], flow, grid=grid_t, alpha_mask=warping_alpha) + + + # basic blending now + # print('rgb grid splatted', rgb_grid_splatted.shape) + warped_src = (rgb_grid_splatted * actual_warped_mask).moveaxis(0, -1).cpu().numpy() + canvas_alpha_mask = canvas_alpha == 0.0 + collage_mask = canvas_alpha.squeeze() + actual_warped_mask.squeeze().cpu().numpy() + collage_mask = collage_mask > 0.5 + + composite_grid = warped_src * canvas_alpha_mask + collage + rgb_grid_splatted_np = rgb_grid_splatted.moveaxis(0, -1).cpu().numpy() + + return frames_t[0], frames_t[tgt_idx], rgb_grid_splatted_np, composite_grid, flow_warping_mask, collage_mask + +def collage_warp(rgb_array, flow, depth, segments, grid_array): + avg_depths = [] + avg_flows = [] + + # src_array = src_array.moveaxis(-1, 0).cpu().numpy() #np.array(Image.open(src_path).convert('RGB')) / 255.0 + src_array = np.concatenate([rgb_array, grid_array], axis=-1) + canvas = np.zeros_like(src_array) + canvas_alpha = np.zeros_like(canvas[...,-1:]).astype(float) + lost_regions = np.zeros_like(canvas[...,-1:]).astype(float) + z_buffer = np.ones_like(depth)[..., None] * -1.0 + unsqueezed_depth = depth[..., None] + + affine_transforms = [] + + filtered_segments = [] + for segment in segments: + if segment['area'] > 300: + filtered_segments.append(segment) + + for segment in filtered_segments: + seg_mask = segment['segmentation'] + avg_flow = torch.mean(flow[:, seg_mask],dim=1) + avg_flows.append(avg_flow) + # median depth (conversion from disparity) + avg_depth = torch.median(1.0 / (depth[seg_mask] + 1e-6)) + avg_depths.append(avg_depth) + + all_y, all_x = np.nonzero(segment['segmentation']) + rand_indices = np.random.randint(0, len(all_y), size=50) + rand_x = [all_x[i] for i in rand_indices] + rand_y = [all_y[i] for i in rand_indices] + + src_pairs = [(x, y) for x, y in zip(rand_x, rand_y)] + # tgt_pairs = [(x + w, y) for x, y in src_pairs] + tgt_pairs = [] + # print('estimating affine') # TODO this can be faster + for i in range(len(src_pairs)): + x, y = src_pairs[i] + dx, dy = flow[:, y, x] + tgt_pairs.append((x+dx, y+dy)) + + # 
affine_trans, inliers = cv2.estimateAffine2D(np.array(src_pairs).astype(np.float32), np.array(tgt_pairs).astype(np.float32)) + affine_trans, inliers = cv2.estimateAffinePartial2D(np.array(src_pairs).astype(np.float32), np.array(tgt_pairs).astype(np.float32)) + # print('num inliers', np.sum(inliers)) + # # print('num inliers', np.sum(inliers)) + affine_transforms.append(affine_trans) + + depth_sorted_indices = np.arange(len(avg_depths)) + depth_sorted_indices = sorted(depth_sorted_indices, key=lambda x: avg_depths[x]) + # sorted_masks = [] + # print('warping stuff') + for idx in depth_sorted_indices: + # sorted_masks.append(mask[idx]) + alpha_mask = filtered_segments[idx]['segmentation'][..., None] * (lost_regions < 0.5).astype(float) + src_rgba = np.concatenate([src_array, alpha_mask, unsqueezed_depth], axis=-1) + warp_dst = cv2.warpAffine(src_rgba, affine_transforms[idx], (src_array.shape[1], src_array.shape[0])) + warped_mask = warp_dst[..., -2:-1] # this is warped alpha + warped_depth = warp_dst[..., -1:] + warped_rgb = warp_dst[...,:-2] + + good_z_region = warped_depth > z_buffer + + warped_mask = np.logical_and(warped_mask > 0.5, good_z_region).astype(float) + + kernel = np.ones((3,3), float) + # print('og masked shape', warped_mask.shape) + # warped_mask = cv2.erode(warped_mask,(5,5))[..., None] + # print('eroded masked shape', warped_mask.shape) + canvas_alpha += cv2.erode(warped_mask,kernel)[..., None] + + lost_regions += alpha_mask + canvas = canvas * (1.0 - warped_mask) + warped_mask * warped_rgb # TODO check if need to dialate here + z_buffer = z_buffer * (1.0 - warped_mask) + warped_mask * warped_depth # TODO check if need to dialate here # print('max lost region', np.max(lost_regions)) + return canvas, canvas_alpha, lost_regions + +def get_depth_from_array(img_t): + img_arr = img_t.moveaxis(0, -1).cpu().numpy() * 1.0 + # print(img_arr.shape) + img_arr *= 255.0 + img_arr = img_arr.astype(np.uint8) + input_batch = processing_utils.depth_transform(img_arr).cuda() + + with torch.no_grad(): + prediction = processing_utils.midas(input_batch) + + prediction = torch.nn.functional.interpolate( + prediction.unsqueeze(1), + size=img_arr.shape[:2], + mode="bicubic", + align_corners=False, + ).squeeze() + + output = prediction.cpu() + return output + + +# %% + +def main(): + print('starting main') + video_folder = './example_videos' + save_dir = pathlib.Path('./processed_data') + process_video_folder(video_folder, save_dir) + +def process_video_folder(video_folder, save_dir): + all_counter = 0 + success_counter = 0 + + # save_folder = pathlib.Path('/dev/shm/processed') + # save_dir = save_folder / foldername #pathlib.Path('/sensei-fs/users/halzayer/collage2photo/testing_partitioning_dilate_extreme') + os.makedirs(save_dir, exist_ok=True) + + dataset = MomentsDataset(videos_folder=video_folder, num_frames=20, samples_per_video=5) + batch_size = 4 + dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True) + + with torch.no_grad(): + for i, batch in tqdm.tqdm(enumerate(dataloader), total=len(dataset)//batch_size): + frames_to_visualize = batch["frames"] + bs = frames_to_visualize.shape[0] + + for j in range(bs): + frames = frames_to_visualize[j] + caption = batch["caption"][j] + + collage_init_time = time.time() + out = collage_from_frames(frames) + collage_finish_time = time.time() + print('collage processing time', collage_finish_time - collage_init_time) + all_counter += 1 + if out is not None: + src_image, tgt_image, splatted, collage, flow_mask, collage_mask = out + + 
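# splatted and collage stack RGB in channels 0-2 and the warped (y, x) coordinate
# grid (from get_grid, normalized by the frame size) in channels 3-4; split them
# apart before saving.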
splatted_rgb = splatted[...,:3] + splatted_grid = splatted[...,3:].astype(np.float16) + + collage_rgb = collage[...,:3] + collage_grid = collage[...,3:].astype(np.float16) + success_counter += 1 + else: + continue + + id_str = f'{success_counter:08d}' + + src_path = str(save_dir / f'src_{id_str}.png') + tgt_path = str(save_dir / f'tgt_{id_str}.png') + flow_warped_path = str(save_dir / f'flow_warped_{id_str}.png') + composite_path = str(save_dir / f'composite_{id_str}.png') + flow_mask_path = str(save_dir / f'flow_mask_{id_str}.png') + composite_mask_path = str(save_dir / f'composite_mask_{id_str}.png') + + flow_grid_path = str(save_dir / f'flow_warped_grid_{id_str}.npy') + composite_grid_path = str(save_dir / f'composite_grid_{id_str}.npy') + + save_image(src_image, src_path) + save_image(tgt_image, tgt_path) + + collage_pil = Image.fromarray((collage_rgb * 255).astype(np.uint8)) + collage_pil.save(composite_path) + + splatted_pil = Image.fromarray((splatted_rgb * 255).astype(np.uint8)) + splatted_pil.save(flow_warped_path) + + flow_mask_pil = Image.fromarray((flow_mask.astype(float) * 255).astype(np.uint8)) + flow_mask_pil.save(flow_mask_path) + + composite_mask_pil = Image.fromarray((collage_mask.astype(float) * 255).astype(np.uint8)) + composite_mask_pil.save(composite_mask_path) + + splatted_grid_t = torch.tensor(splatted_grid).moveaxis(-1, 0) + splatted_grid_resized = torchvision.transforms.functional.resize(splatted_grid_t, (64,64)) + + collage_grid_t = torch.tensor(collage_grid).moveaxis(-1, 0) + collage_grid_resized = torchvision.transforms.functional.resize(collage_grid_t, (64,64)) + np.save(flow_grid_path, splatted_grid_resized.cpu().numpy()) + np.save(composite_grid_path, collage_grid_resized.cpu().numpy()) + + del out + del splatted_grid + del collage_grid + del frames + + del frames_to_visualize + +#%% + +if __name__ == '__main__': + try: + main() + except Exception as e: + print(e) + print('process failed') + diff --git a/depth_anything_v2/processing_utils.py b/depth_anything_v2/processing_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5b096bfa15521f8c542793f2024f60a48696f0f2 --- /dev/null +++ b/depth_anything_v2/processing_utils.py @@ -0,0 +1,318 @@ +import torch +import cv2 +import numpy as np +import sys +import torchvision +from PIL import Image +from torchvision.models.optical_flow import Raft_Large_Weights +from torchvision.models.optical_flow import raft_large +from segment_anything import sam_model_registry, SamAutomaticMaskGenerator, SamPredictor +import matplotlib.pyplot as plt +import torchvision.transforms.functional as F +sys.path.append('./softmax-splatting') +import softsplat + +sam_checkpoint = "./sam_vit_h_4b8939.pth" +model_type = "vit_h" + +device = "cuda" + +sam = sam_model_registry[model_type](checkpoint=sam_checkpoint) +sam.to(device=device) +# mask_generator = SamAutomaticMaskGenerator(sam, +# crop_overlap_ratio=0.05, +# box_nms_thresh=0.2, +# points_per_side=32, +# pred_iou_thresh=0.86, +# stability_score_thresh=0.8, + +# min_mask_region_area=100,) +# mask_generator = SamAutomaticMaskGenerator(sam) +mask_generator = SamAutomaticMaskGenerator(sam, + # box_nms_thresh=0.5, + # crop_overlap_ratio=0.75, + # min_mask_region_area=200, + ) + +def get_mask(img_path): + image = cv2.imread(img_path) + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + masks = mask_generator.generate(image) + return masks + +def get_mask_from_array(arr): + return mask_generator.generate(arr) + +# depth model + +import cv2 +import torch +import 
urllib.request + +import matplotlib.pyplot as plt + +# potentially downgrade this. just need rough depths. benchmark this +# model_type = "DPT_Large" # MiDaS v3 - Large (highest accuracy, slowest inference speed) +# #model_type = "DPT_Hybrid" # MiDaS v3 - Hybrid (medium accuracy, medium inference speed) +# #model_type = "MiDaS_small" # MiDaS v2.1 - Small (lowest accuracy, highest inference speed) +# +# # midas = torch.hub.load("intel-isl/MiDaS", model_type) +# midas = torch.hub.load("/sensei-fs/users/halzayer/collage2photo/model_cache/intel-isl_MiDaS_master", model_type, source='local') +# +# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") +# midas.to(device) +# midas.eval() +# +# # midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms") +# midas_transforms = torch.hub.load("/sensei-fs/users/halzayer/collage2photo/model_cache/intel-isl_MiDaS_master", "transforms", source='local') +# +# if model_type == "DPT_Large" or model_type == "DPT_Hybrid": +# depth_transform = midas_transforms.dpt_transform +# else: +# depth_transform = midas_transforms.small_transform +from dpt import DepthAnythingV2 + +model_configs = { + 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]}, + 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]}, + 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}, + 'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]} +} + +depth_anything = DepthAnythingV2(**model_configs['vitl']) +depth_anything.load_state_dict(torch.load(f'/home/aiops/wangzh/Depth-Anything-V2/checkpoints/depth_anything_v2_vitl.pth', map_location='cpu')) +depth_anything = depth_anything.to(device).eval() + +# img_path = '/sensei-fs/users/halzayer/valid/JPEGImages/45597680/00005.jpg' +def get_depth(img_path): + img = cv2.imread(img_path) + + with torch.no_grad(): + depth = depth_anything.infer_image(img, 518) + + depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0 + depth = depth.cpu().numpy().astype(np.uint8) + + prediction = torch.nn.functional.interpolate( + depth.unsqueeze(1), + size=img.shape[:2], + mode="bicubic", + align_corners=False, + ).squeeze() + + output = prediction.cpu() + return output + +def get_depth_from_array(img): + input_batch = img.to(device) + + with torch.no_grad(): + depth = depth_anything.infer_image(input_batch, 518) + + depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0 + depth = depth.cpu().numpy().astype(np.uint8) + + prediction = torch.nn.functional.interpolate( + depth.unsqueeze(1), + size=img.shape[:2], + mode="bicubic", + align_corners=False, + ).squeeze() + + output = prediction.cpu() + return output + + +def load_image(img_path): + img1_names = [img_path] + + img1_pil = [Image.open(fn) for fn in img1_names] + img1_frames = [torchvision.transforms.functional.pil_to_tensor(fn) for fn in img1_pil] + + img1_batch = torch.stack(img1_frames) + + return img1_batch + +weights = Raft_Large_Weights.DEFAULT +transforms = weights.transforms() + +device = "cuda" if torch.cuda.is_available() else "cpu" + +model = raft_large(weights=Raft_Large_Weights.DEFAULT, progress=False).to(device) +model = model.eval() + +print('created model') + +def preprocess(img1_batch, img2_batch, size=[520,960], transform_batch=True): + img1_batch = F.resize(img1_batch, size=size, antialias=False) + img2_batch = F.resize(img2_batch, size=size, antialias=False) + if transform_batch: + return transforms(img1_batch, 
img2_batch) + else: + return img1_batch, img2_batch + +def compute_flow(img_path_1, img_path_2): + img1_batch_og, img2_batch_og = load_image(img_path_1), load_image(img_path_2) + B, C, H, W = img1_batch_og.shape + + img1_batch, img2_batch = preprocess(img1_batch_og, img2_batch_og, transform_batch=False) + img1_batch_t, img2_batch_t = transforms(img1_batch, img2_batch) + + # If you can, run this example on a GPU, it will be a lot faster. + with torch.no_grad(): + list_of_flows = model(img1_batch_t.to(device), img2_batch_t.to(device)) + predicted_flows = list_of_flows[-1] + # flows.append(predicted_flows) + + resized_flow = F.resize(predicted_flows, size=(H, W), antialias=False) + + _, _, flow_H, flow_W = predicted_flows.shape + + resized_flow[:,0] *= (W / flow_W) + resized_flow[:,1] *= (H / flow_H) + + return resized_flow.detach().cpu().squeeze() + +def compute_flow_from_tensors(img1_batch_og, img2_batch_og): + if len(img1_batch_og.shape) < 4: + img1_batch_og = img1_batch_og.unsqueeze(0) + if len(img2_batch_og.shape) < 4: + img2_batch_og = img2_batch_og.unsqueeze(0) + + B, C, H, W = img1_batch_og.shape + img1_batch, img2_batch = preprocess(img1_batch_og, img2_batch_og, transform_batch=False) + img1_batch_t, img2_batch_t = transforms(img1_batch, img2_batch) + + # If you can, run this example on a GPU, it will be a lot faster. + with torch.no_grad(): + list_of_flows = model(img1_batch_t.to(device), img2_batch_t.to(device)) + predicted_flows = list_of_flows[-1] + # flows.append(predicted_flows) + + resized_flow = F.resize(predicted_flows, size=(H, W), antialias=False) + + _, _, flow_H, flow_W = predicted_flows.shape + + resized_flow[:,0] *= (W / flow_W) + resized_flow[:,1] *= (H / flow_H) + + return resized_flow.detach().cpu().squeeze() + + + +# import run +backwarp_tenGrid = {} + +def backwarp(tenIn, tenFlow): + if str(tenFlow.shape) not in backwarp_tenGrid: + tenHor = torch.linspace(start=-1.0, end=1.0, steps=tenFlow.shape[3], dtype=tenFlow.dtype, device=tenFlow.device).view(1, 1, 1, -1).repeat(1, 1, tenFlow.shape[2], 1) + tenVer = torch.linspace(start=-1.0, end=1.0, steps=tenFlow.shape[2], dtype=tenFlow.dtype, device=tenFlow.device).view(1, 1, -1, 1).repeat(1, 1, 1, tenFlow.shape[3]) + + backwarp_tenGrid[str(tenFlow.shape)] = torch.cat([tenHor, tenVer], 1).cuda() + # end + + tenFlow = torch.cat([tenFlow[:, 0:1, :, :] / ((tenIn.shape[3] - 1.0) / 2.0), tenFlow[:, 1:2, :, :] / ((tenIn.shape[2] - 1.0) / 2.0)], 1) + + return torch.nn.functional.grid_sample(input=tenIn, grid=(backwarp_tenGrid[str(tenFlow.shape)] + tenFlow).permute(0, 2, 3, 1), mode='bilinear', padding_mode='zeros', align_corners=True) + +torch.backends.cudnn.enabled = True # make sure to use cudnn for computational performance + +########################################################## +def forward_splt(src, tgt, flow, partial=False): + tenTwo = tgt.unsqueeze(0).cuda() #torch.FloatTensor(numpy.ascontiguousarray(cv2.imread(filename='./images/one.png', flags=-1).transpose(2, 0, 1)[None, :, :, :].astype(numpy.float32) * (1.0 / 255.0))).cuda() + tenOne = src.unsqueeze(0).cuda() #torch.FloatTensor(numpy.ascontiguousarray(cv2.imread(filename='./images/two.png', flags=-1).transpose(2, 0, 1)[None, :, :, :].astype(numpy.float32) * (1.0 / 255.0))).cuda() + tenFlow = flow.unsqueeze(0).cuda() #torch.FloatTensor(numpy.ascontiguousarray(run.read_flo('./images/flow.flo').transpose(2, 0, 1)[None, :, :, :])).cuda() + + if not partial: + tenMetric = torch.nn.functional.l1_loss(input=tenOne, target=backwarp(tenIn=tenTwo, tenFlow=tenFlow), 
reduction='none').mean([1], True) + else: + tenMetric = torch.nn.functional.l1_loss(input=tenOne[:,:3], target=backwarp(tenIn=tenTwo[:,:3], tenFlow=tenFlow[:,:3]), reduction='none').mean([1], True) + # for intTime, fltTime in enumerate(np.linspace(0.0, 1.0, 11).tolist()): + tenSoftmax = softsplat.softsplat(tenIn=tenOne, tenFlow=tenFlow , tenMetric=(-20.0 * tenMetric).clip(-20.0, 20.0), strMode='soft') # -20.0 is a hyperparameter, called 'alpha' in the paper, that could be learned using a torch.Parameter + + return tenSoftmax.cpu() + + +def aggregate_frames(frames, pairwise_flows=None, agg_flow=None): + if pairwise_flows is None: + # store pairwise flows + pairwise_flows = [] + + if agg_flow is None: + start_idx = 0 + else: + start_idx = len(pairwise_flows) + + og_image = frames[start_idx] + prev_frame = og_image + + for i in range(start_idx, len(frames)-1): + tgt_frame = frames[i+1] + + if i < len(pairwise_flows): + flow = pairwise_flows[i] + else: + flow = compute_flow_from_tensors(prev_frame, tgt_frame) + pairwise_flows.append(flow.clone()) + + _, H, W = flow.shape + B=1 + + xx = torch.arange(0, W).view(1,-1).repeat(H,1) + + yy = torch.arange(0, H).view(-1,1).repeat(1,W) + + xx = xx.view(1,1,H,W).repeat(B,1,1,1) + + yy = yy.view(1,1,H,W).repeat(B,1,1,1) + + grid = torch.cat((xx,yy),1).float() + + flow = flow.unsqueeze(0) + if agg_flow is None: + agg_flow = torch.zeros_like(flow) + + vgrid = grid + agg_flow + vgrid[:,0,:,:] = 2.0*vgrid[:,0,:,:].clone() / max(W-1,1) - 1 + + vgrid[:,1,:,:] = 2.0*vgrid[:,1,:,:].clone() / max(H-1,1) - 1 + + flow_out = torch.nn.functional.grid_sample(flow, vgrid.permute(0,2,3,1), 'nearest') + + agg_flow += flow_out + + + # mask = forward_splt(torch.ones_like(og_image), torch.ones_like(og_image), agg_flow.squeeze()).squeeze() + # blur_t = torchvision.transforms.GaussianBlur(kernel_size=(25,25), sigma=5.0) + # warping_mask = (blur_t(mask)[0:1] > 0.8) + # masks.append(warping_mask) + prev_frame = tgt_frame + + return agg_flow, pairwise_flows #og_splatted_img, agg_flow, actual_warped_mask + + +def forward_warp(src_frame, tgt_frame, flow, grid=None, alpha_mask=None): + if alpha_mask is None: + alpha_mask = torch.ones_like(src_frame[:1]) + + if grid is not None: + src_list = [src_frame, grid, alpha_mask] + tgt_list = [tgt_frame, grid, alpha_mask] + else: + src_list = [src_frame, alpha_mask] + tgt_list = [tgt_frame, alpha_mask] + + og_image_padded = torch.concat(src_list, dim=0) + tgt_frame_padded = torch.concat(tgt_list, dim=0) + + og_splatted_img = forward_splt(og_image_padded, tgt_frame_padded, flow.squeeze(), partial=True).squeeze() + # print('og splatted image shape') + # grid_transformed = og_splatted_img[3:-1] + # print('grid transformed shape', grid_transformed) + + # grid *= grid_size + # grid_transformed *= grid_size + actual_warped_mask = og_splatted_img[-1:] + splatted_rgb_grid = og_splatted_img[:-1] + + return splatted_rgb_grid, actual_warped_mask \ No newline at end of file diff --git a/depth_anything_v2/softmax-splatting/README.md b/depth_anything_v2/softmax-splatting/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b01d39145603de4279a6719bea71bdce8211a517 --- /dev/null +++ b/depth_anything_v2/softmax-splatting/README.md @@ -0,0 +1,90 @@ +# softmax-splatting +This is a reference implementation of the softmax splatting operator, which has been proposed in Softmax Splatting for Video Frame Interpolation [1], using PyTorch. Softmax splatting is a well-motivated approach for differentiable forward warping. 
It uses a translational invariant importance metric to disambiguate cases where multiple source pixels map to the same target pixel. Should you be making use of our work, please cite our paper [1]. + +Paper + +For our previous work on SepConv, see: https://github.com/sniklaus/revisiting-sepconv + +## setup +The softmax splatting is implemented in CUDA using CuPy, which is why CuPy is a required dependency. It can be installed using `pip install cupy` or alternatively using one of the provided [binary packages](https://docs.cupy.dev/en/stable/install.html#installing-cupy) as outlined in the CuPy repository. + +If you plan to process videos, then please also make sure to have `pip install moviepy` installed. + +## usage +To run it on your own pair of frames, use the following command. + +``` +python run.py --model lf --one ./images/one.png --two ./images/two.png --out ./out.png +``` + +To run in on a video, use the following command. + +``` +python run.py --model lf --video ./videos/car-turn.mp4 --out ./out.mp4 +``` + +For a quick benchmark using examples from the Middlebury benchmark for optical flow, run `python benchmark_middlebury.py`. You can use it to easily verify that the provided implementation runs as expected. + +## warping +We provide a small script to replicate the third figure of our paper [1]. You can simply run the following to obtain the comparison between summation splatting, average splatting, linear splatting, and softmax splatting. + +The example script is using OpenCV to load and display images, as well as to read the provided optical flow file. An easy way to install OpenCV for Python is using the `pip install opencv-contrib-python` package. + +``` +import cv2 +import numpy +import torch + +import run + +import softsplat # the custom softmax splatting layer + +########################################################## + +torch.set_grad_enabled(False) # make sure to not compute gradients for computational performance + +torch.backends.cudnn.enabled = True # make sure to use cudnn for computational performance + +########################################################## + +tenOne = torch.FloatTensor(numpy.ascontiguousarray(cv2.imread(filename='./images/one.png', flags=-1).transpose(2, 0, 1)[None, :, :, :].astype(numpy.float32) * (1.0 / 255.0))).cuda() +tenTwo = torch.FloatTensor(numpy.ascontiguousarray(cv2.imread(filename='./images/two.png', flags=-1).transpose(2, 0, 1)[None, :, :, :].astype(numpy.float32) * (1.0 / 255.0))).cuda() +tenFlow = torch.FloatTensor(numpy.ascontiguousarray(run.read_flo('./images/flow.flo').transpose(2, 0, 1)[None, :, :, :])).cuda() + +tenMetric = torch.nn.functional.l1_loss(input=tenOne, target=run.backwarp(tenIn=tenTwo, tenFlow=tenFlow), reduction='none').mean([1], True) + +for intTime, fltTime in enumerate(numpy.linspace(0.0, 1.0, 11).tolist()): + tenSummation = softsplat.softsplat(tenIn=tenOne, tenFlow=tenFlow * fltTime, tenMetric=None, strMode='sum') + tenAverage = softsplat.softsplat(tenIn=tenOne, tenFlow=tenFlow * fltTime, tenMetric=None, strMode='avg') + tenLinear = softsplat.softsplat(tenIn=tenOne, tenFlow=tenFlow * fltTime, tenMetric=(0.3 - tenMetric).clip(0.001, 1.0), strMode='linear') # finding a good linearly metric is difficult, and it is not invariant to translations + tenSoftmax = softsplat.softsplat(tenIn=tenOne, tenFlow=tenFlow * fltTime, tenMetric=(-20.0 * tenMetric).clip(-20.0, 20.0), strMode='soft') # -20.0 is a hyperparameter, called 'alpha' in the paper, that could be learned using a torch.Parameter + + 
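# the four calls above contrast the splatting modes: 'sum' adds up overlapping contributions, 'avg' averages them, 'linear' weights them by the clipped importance metric, and 'soft' weights them by exp(metric), which is the softmax splatting proposed in the paper + 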
cv2.imshow(winname='summation', mat=tenSummation[0, :, :, :].cpu().numpy().transpose(1, 2, 0)) + cv2.imshow(winname='average', mat=tenAverage[0, :, :, :].cpu().numpy().transpose(1, 2, 0)) + cv2.imshow(winname='linear', mat=tenLinear[0, :, :, :].cpu().numpy().transpose(1, 2, 0)) + cv2.imshow(winname='softmax', mat=tenSoftmax[0, :, :, :].cpu().numpy().transpose(1, 2, 0)) + cv2.waitKey(delay=0) +# end +``` + +## xiph +In our paper, we propose to use 4K video clips from Xiph to evaluate video frame interpolation on high-resolution footage. Please see the supplementary `benchmark_xiph.py` on how to reproduce the shown metrics. + +## video +Video + +## license +The provided implementation is strictly for academic purposes only. Should you be interested in using our technology for any commercial use, please feel free to contact us. + +## references +``` +[1] @inproceedings{Niklaus_CVPR_2020, + author = {Simon Niklaus and Feng Liu}, + title = {Softmax Splatting for Video Frame Interpolation}, + booktitle = {IEEE Conference on Computer Vision and Pattern Recognition}, + year = {2020} + } +``` + +## acknowledgment +The video above uses materials under a Creative Common license as detailed at the end. \ No newline at end of file diff --git a/depth_anything_v2/softmax-splatting/__pycache__/softsplat.cpython-310.pyc b/depth_anything_v2/softmax-splatting/__pycache__/softsplat.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9dbd899d1cc3d5f599603d88d57ebfb442f9396c Binary files /dev/null and b/depth_anything_v2/softmax-splatting/__pycache__/softsplat.cpython-310.pyc differ diff --git a/depth_anything_v2/softmax-splatting/benchmark_middlebury.py b/depth_anything_v2/softmax-splatting/benchmark_middlebury.py new file mode 100644 index 0000000000000000000000000000000000000000..33375451c336071c7071c696701dfb8528ebe25d --- /dev/null +++ b/depth_anything_v2/softmax-splatting/benchmark_middlebury.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python + +import glob +import numpy +import PIL +import PIL.Image +import skimage +import skimage.metrics +import torch + +import run + +########################################################## + +run.args_strModel = 'l1' + +########################################################## + +if __name__ == '__main__': + fltPsnr = [] + fltSsim = [] + + for strTruth in sorted(glob.glob('./middlebury/*/frame10i11.png')): + tenOne = torch.FloatTensor(numpy.ascontiguousarray(numpy.array(PIL.Image.open(strTruth.replace('frame10i11', 'frame10')))[:, :, ::-1].transpose(2, 0, 1).astype(numpy.float32) * (1.0 / 255.0))) + tenTwo = torch.FloatTensor(numpy.ascontiguousarray(numpy.array(PIL.Image.open(strTruth.replace('frame10i11', 'frame11')))[:, :, ::-1].transpose(2, 0, 1).astype(numpy.float32) * (1.0 / 255.0))) + + npyEstimate = (run.estimate(tenOne, tenTwo, [0.5])[0].clip(0.0, 1.0).numpy().transpose(1, 2, 0) * 255.0).round().astype(numpy.uint8) + + fltPsnr.append(skimage.metrics.peak_signal_noise_ratio(image_true=numpy.array(PIL.Image.open(strTruth))[:, :, ::-1], image_test=npyEstimate, data_range=255)) + fltSsim.append(skimage.metrics.structural_similarity(im1=numpy.array(PIL.Image.open(strTruth))[:, :, ::-1], im2=npyEstimate, data_range=255, channel_axis=2)) + # end + + print('computed average psnr', numpy.mean(fltPsnr)) + print('computed average ssim', numpy.mean(fltSsim)) +# end \ No newline at end of file diff --git a/depth_anything_v2/softmax-splatting/benchmark_xiph.py b/depth_anything_v2/softmax-splatting/benchmark_xiph.py new file mode 100644 index 
0000000000000000000000000000000000000000..b024c889cf645618c8b5c7caa82a604944c9c101 --- /dev/null +++ b/depth_anything_v2/softmax-splatting/benchmark_xiph.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python + +import cv2 +import glob +import numpy +import os +import skimage +import skimage.metrics +import sys +import torch + +import run + +########################################################## + +run.args_strModel = 'l1' + +########################################################## + +os.makedirs(name='./netflix', exist_ok=True) + +if len(glob.glob('./netflix/BoxingPractice-*.png')) != 100: + os.system('ffmpeg -i https://media.xiph.org/video/derf/ElFuente/Netflix_BoxingPractice_4096x2160_60fps_10bit_420.y4m -pix_fmt rgb24 -vframes 100 ./netflix/BoxingPractice-%03d.png') +# end + +if len(glob.glob('./netflix/Crosswalk-*.png')) != 100: + os.system('ffmpeg -i https://media.xiph.org/video/derf/ElFuente/Netflix_Crosswalk_4096x2160_60fps_10bit_420.y4m -pix_fmt rgb24 -vframes 100 ./netflix/Crosswalk-%03d.png') +# end + +if len(glob.glob('./netflix/DrivingPOV-*.png')) != 100: + os.system('ffmpeg -i https://media.xiph.org/video/derf/Chimera/Netflix_DrivingPOV_4096x2160_60fps_10bit_420.y4m -pix_fmt rgb24 -vframes 100 ./netflix/DrivingPOV-%03d.png') +# end + +if len(glob.glob('./netflix/FoodMarket-*.png')) != 100: + os.system('ffmpeg -i https://media.xiph.org/video/derf/ElFuente/Netflix_FoodMarket_4096x2160_60fps_10bit_420.y4m -pix_fmt rgb24 -vframes 100 ./netflix/FoodMarket-%03d.png') +# end + +if len(glob.glob('./netflix/FoodMarket2-*.png')) != 100: + os.system('ffmpeg -i https://media.xiph.org/video/derf/ElFuente/Netflix_FoodMarket2_4096x2160_60fps_10bit_420.y4m -pix_fmt rgb24 -vframes 100 ./netflix/FoodMarket2-%03d.png') +# end + +if len(glob.glob('./netflix/RitualDance-*.png')) != 100: + os.system('ffmpeg -i https://media.xiph.org/video/derf/ElFuente/Netflix_RitualDance_4096x2160_60fps_10bit_420.y4m -pix_fmt rgb24 -vframes 100 ./netflix/RitualDance-%03d.png') +# end + +if len(glob.glob('./netflix/SquareAndTimelapse-*.png')) != 100: + os.system('ffmpeg -i https://media.xiph.org/video/derf/ElFuente/Netflix_SquareAndTimelapse_4096x2160_60fps_10bit_420.y4m -pix_fmt rgb24 -vframes 100 ./netflix/SquareAndTimelapse-%03d.png') +# end + +if len(glob.glob('./netflix/Tango-*.png')) != 100: + os.system('ffmpeg -i https://media.xiph.org/video/derf/ElFuente/Netflix_Tango_4096x2160_60fps_10bit_420.y4m -pix_fmt rgb24 -vframes 100 ./netflix/Tango-%03d.png') +# end + +########################################################## + +for strCategory in ['resized', 'cropped']: + fltPsnr = [] + fltSsim = [] + + for strFile in ['BoxingPractice', 'Crosswalk', 'DrivingPOV', 'FoodMarket', 'FoodMarket2', 'RitualDance', 'SquareAndTimelapse', 'Tango']: + for intFrame in range(2, 99, 2): + npyOne = cv2.imread(filename='./netflix/' + strFile + '-' + str(intFrame - 1).zfill(3) + '.png', flags=-1) + npyTwo = cv2.imread(filename='./netflix/' + strFile + '-' + str(intFrame + 1).zfill(3) + '.png', flags=-1) + npyTruth = cv2.imread(filename='./netflix/' + strFile + '-' + str(intFrame).zfill(3) + '.png', flags=-1) + + if strCategory == 'resized': + npyOne = cv2.resize(src=npyOne, dsize=(2048, 1080), fx=0.0, fy=0.0, interpolation=cv2.INTER_AREA) + npyTwo = cv2.resize(src=npyTwo, dsize=(2048, 1080), fx=0.0, fy=0.0, interpolation=cv2.INTER_AREA) + npyTruth = cv2.resize(src=npyTruth, dsize=(2048, 1080), fx=0.0, fy=0.0, interpolation=cv2.INTER_AREA) + + elif strCategory == 'cropped': + npyOne = npyOne[540:-540, 1024:-1024, :] + npyTwo = 
npyTwo[540:-540, 1024:-1024, :] + npyTruth = npyTruth[540:-540, 1024:-1024, :] + + # end + + tenOne = torch.FloatTensor(numpy.ascontiguousarray(npyOne.transpose(2, 0, 1).astype(numpy.float32) * (1.0 / 255.0))) + tenTwo = torch.FloatTensor(numpy.ascontiguousarray(npyTwo.transpose(2, 0, 1).astype(numpy.float32) * (1.0 / 255.0))) + + npyEstimate = (run.estimate(tenOne, tenTwo, [0.5])[0].clip(0.0, 1.0).numpy().transpose(1, 2, 0) * 255.0).round().astype(numpy.uint8) + + fltPsnr.append(skimage.metrics.peak_signal_noise_ratio(image_true=npyTruth, image_test=npyEstimate, data_range=255)) + fltSsim.append(skimage.metrics.structural_similarity(im1=npyTruth, im2=npyEstimate, data_range=255, channel_axis=2)) + # end + # end + + print('category', strCategory) + print('computed average psnr', numpy.mean(fltPsnr)) + print('computed average ssim', numpy.mean(fltSsim)) +# end diff --git a/depth_anything_v2/softmax-splatting/correlation/README.md b/depth_anything_v2/softmax-splatting/correlation/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a8e0ca529d50b7e09d521cc288daae7771514188 --- /dev/null +++ b/depth_anything_v2/softmax-splatting/correlation/README.md @@ -0,0 +1 @@ +This is an adaptation of the FlowNet2 implementation in order to compute cost volumes. Should you be making use of this work, please make sure to adhere to the licensing terms of the original authors. Should you be making use or modify this particular implementation, please acknowledge it appropriately. \ No newline at end of file diff --git a/depth_anything_v2/softmax-splatting/correlation/correlation.py b/depth_anything_v2/softmax-splatting/correlation/correlation.py new file mode 100644 index 0000000000000000000000000000000000000000..1dec3c03796f0bfea0659cdcf08aa4c24134b04c --- /dev/null +++ b/depth_anything_v2/softmax-splatting/correlation/correlation.py @@ -0,0 +1,400 @@ +#!/usr/bin/env python + +import cupy +import os +import re +import torch + +kernel_Correlation_rearrange = ''' + extern "C" __global__ void kernel_Correlation_rearrange( + const int n, + const float* input, + float* output + ) { + int intIndex = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (intIndex >= n) { + return; + } + + int intSample = blockIdx.z; + int intChannel = blockIdx.y; + + float fltValue = input[(((intSample * SIZE_1(input)) + intChannel) * SIZE_2(input) * SIZE_3(input)) + intIndex]; + + __syncthreads(); + + int intPaddedY = (intIndex / SIZE_3(input)) + 4; + int intPaddedX = (intIndex % SIZE_3(input)) + 4; + int intRearrange = ((SIZE_3(input) + 8) * intPaddedY) + intPaddedX; + + output[(((intSample * SIZE_1(output) * SIZE_2(output)) + intRearrange) * SIZE_1(input)) + intChannel] = fltValue; + } +''' + +kernel_Correlation_updateOutput = ''' + extern "C" __global__ void kernel_Correlation_updateOutput( + const int n, + const float* rbot0, + const float* rbot1, + float* top + ) { + extern __shared__ char patch_data_char[]; + + float *patch_data = (float *)patch_data_char; + + // First (upper left) position of kernel upper-left corner in current center position of neighborhood in image 1 + int x1 = blockIdx.x + 4; + int y1 = blockIdx.y + 4; + int item = blockIdx.z; + int ch_off = threadIdx.x; + + // Load 3D patch into shared shared memory + for (int j = 0; j < 1; j++) { // HEIGHT + for (int i = 0; i < 1; i++) { // WIDTH + int ji_off = (j + i) * SIZE_3(rbot0); + for (int ch = ch_off; ch < SIZE_3(rbot0); ch += 32) { // CHANNELS + int idx1 = ((item * SIZE_1(rbot0) + y1+j) * SIZE_2(rbot0) + x1+i) * SIZE_3(rbot0) + ch; + int 
idxPatchData = ji_off + ch; + patch_data[idxPatchData] = rbot0[idx1]; + } + } + } + + __syncthreads(); + + __shared__ float sum[32]; + + // Compute correlation + for (int top_channel = 0; top_channel < SIZE_1(top); top_channel++) { + sum[ch_off] = 0; + + int s2o = top_channel % 9 - 4; + int s2p = top_channel / 9 - 4; + + for (int j = 0; j < 1; j++) { // HEIGHT + for (int i = 0; i < 1; i++) { // WIDTH + int ji_off = (j + i) * SIZE_3(rbot0); + for (int ch = ch_off; ch < SIZE_3(rbot0); ch += 32) { // CHANNELS + int x2 = x1 + s2o; + int y2 = y1 + s2p; + + int idxPatchData = ji_off + ch; + int idx2 = ((item * SIZE_1(rbot0) + y2+j) * SIZE_2(rbot0) + x2+i) * SIZE_3(rbot0) + ch; + + sum[ch_off] += patch_data[idxPatchData] * rbot1[idx2]; + } + } + } + + __syncthreads(); + + if (ch_off == 0) { + float total_sum = 0; + for (int idx = 0; idx < 32; idx++) { + total_sum += sum[idx]; + } + const int sumelems = SIZE_3(rbot0); + const int index = ((top_channel*SIZE_2(top) + blockIdx.y)*SIZE_3(top))+blockIdx.x; + top[index + item*SIZE_1(top)*SIZE_2(top)*SIZE_3(top)] = total_sum / (float)sumelems; + } + } + } +''' + +kernel_Correlation_updateGradOne = ''' + #define ROUND_OFF 50000 + + extern "C" __global__ void kernel_Correlation_updateGradOne( + const int n, + const int intSample, + const float* rbot0, + const float* rbot1, + const float* gradOutput, + float* gradOne, + float* gradTwo + ) { for (int intIndex = (blockIdx.x * blockDim.x) + threadIdx.x; intIndex < n; intIndex += blockDim.x * gridDim.x) { + int n = intIndex % SIZE_1(gradOne); // channels + int l = (intIndex / SIZE_1(gradOne)) % SIZE_3(gradOne) + 4; // w-pos + int m = (intIndex / SIZE_1(gradOne) / SIZE_3(gradOne)) % SIZE_2(gradOne) + 4; // h-pos + + // round_off is a trick to enable integer division with ceil, even for negative numbers + // We use a large offset, for the inner part not to become negative. 
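+ // Note: with a stride of one the +ROUND_OFF / -ROUND_OFF pair cancels out algebraically; it appears to be retained from the strided FlowNet2 kernel, where the offset makes truncating integer division behave like ceil/floor for negative coordinates.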
+ const int round_off = ROUND_OFF; + const int round_off_s1 = round_off; + + // We add round_off before_s1 the int division and subtract round_off after it, to ensure the formula matches ceil behavior: + int xmin = (l - 4 + round_off_s1 - 1) + 1 - round_off; // ceil (l - 4) + int ymin = (m - 4 + round_off_s1 - 1) + 1 - round_off; // ceil (l - 4) + + // Same here: + int xmax = (l - 4 + round_off_s1) - round_off; // floor (l - 4) + int ymax = (m - 4 + round_off_s1) - round_off; // floor (m - 4) + + float sum = 0; + if (xmax>=0 && ymax>=0 && (xmin<=SIZE_3(gradOutput)-1) && (ymin<=SIZE_2(gradOutput)-1)) { + xmin = max(0,xmin); + xmax = min(SIZE_3(gradOutput)-1,xmax); + + ymin = max(0,ymin); + ymax = min(SIZE_2(gradOutput)-1,ymax); + + for (int p = -4; p <= 4; p++) { + for (int o = -4; o <= 4; o++) { + // Get rbot1 data: + int s2o = o; + int s2p = p; + int idxbot1 = ((intSample * SIZE_1(rbot0) + (m+s2p)) * SIZE_2(rbot0) + (l+s2o)) * SIZE_3(rbot0) + n; + float bot1tmp = rbot1[idxbot1]; // rbot1[l+s2o,m+s2p,n] + + // Index offset for gradOutput in following loops: + int op = (p+4) * 9 + (o+4); // index[o,p] + int idxopoffset = (intSample * SIZE_1(gradOutput) + op); + + for (int y = ymin; y <= ymax; y++) { + for (int x = xmin; x <= xmax; x++) { + int idxgradOutput = (idxopoffset * SIZE_2(gradOutput) + y) * SIZE_3(gradOutput) + x; // gradOutput[x,y,o,p] + sum += gradOutput[idxgradOutput] * bot1tmp; + } + } + } + } + } + const int sumelems = SIZE_1(gradOne); + const int bot0index = ((n * SIZE_2(gradOne)) + (m-4)) * SIZE_3(gradOne) + (l-4); + gradOne[bot0index + intSample*SIZE_1(gradOne)*SIZE_2(gradOne)*SIZE_3(gradOne)] = sum / (float)sumelems; + } } +''' + +kernel_Correlation_updateGradTwo = ''' + #define ROUND_OFF 50000 + + extern "C" __global__ void kernel_Correlation_updateGradTwo( + const int n, + const int intSample, + const float* rbot0, + const float* rbot1, + const float* gradOutput, + float* gradOne, + float* gradTwo + ) { for (int intIndex = (blockIdx.x * blockDim.x) + threadIdx.x; intIndex < n; intIndex += blockDim.x * gridDim.x) { + int n = intIndex % SIZE_1(gradTwo); // channels + int l = (intIndex / SIZE_1(gradTwo)) % SIZE_3(gradTwo) + 4; // w-pos + int m = (intIndex / SIZE_1(gradTwo) / SIZE_3(gradTwo)) % SIZE_2(gradTwo) + 4; // h-pos + + // round_off is a trick to enable integer division with ceil, even for negative numbers + // We use a large offset, for the inner part not to become negative. 
+ const int round_off = ROUND_OFF; + const int round_off_s1 = round_off; + + float sum = 0; + for (int p = -4; p <= 4; p++) { + for (int o = -4; o <= 4; o++) { + int s2o = o; + int s2p = p; + + //Get X,Y ranges and clamp + // We add round_off before_s1 the int division and subtract round_off after it, to ensure the formula matches ceil behavior: + int xmin = (l - 4 - s2o + round_off_s1 - 1) + 1 - round_off; // ceil (l - 4 - s2o) + int ymin = (m - 4 - s2p + round_off_s1 - 1) + 1 - round_off; // ceil (l - 4 - s2o) + + // Same here: + int xmax = (l - 4 - s2o + round_off_s1) - round_off; // floor (l - 4 - s2o) + int ymax = (m - 4 - s2p + round_off_s1) - round_off; // floor (m - 4 - s2p) + + if (xmax>=0 && ymax>=0 && (xmin<=SIZE_3(gradOutput)-1) && (ymin<=SIZE_2(gradOutput)-1)) { + xmin = max(0,xmin); + xmax = min(SIZE_3(gradOutput)-1,xmax); + + ymin = max(0,ymin); + ymax = min(SIZE_2(gradOutput)-1,ymax); + + // Get rbot0 data: + int idxbot0 = ((intSample * SIZE_1(rbot0) + (m-s2p)) * SIZE_2(rbot0) + (l-s2o)) * SIZE_3(rbot0) + n; + float bot0tmp = rbot0[idxbot0]; // rbot1[l+s2o,m+s2p,n] + + // Index offset for gradOutput in following loops: + int op = (p+4) * 9 + (o+4); // index[o,p] + int idxopoffset = (intSample * SIZE_1(gradOutput) + op); + + for (int y = ymin; y <= ymax; y++) { + for (int x = xmin; x <= xmax; x++) { + int idxgradOutput = (idxopoffset * SIZE_2(gradOutput) + y) * SIZE_3(gradOutput) + x; // gradOutput[x,y,o,p] + sum += gradOutput[idxgradOutput] * bot0tmp; + } + } + } + } + } + const int sumelems = SIZE_1(gradTwo); + const int bot1index = ((n * SIZE_2(gradTwo)) + (m-4)) * SIZE_3(gradTwo) + (l-4); + gradTwo[bot1index + intSample*SIZE_1(gradTwo)*SIZE_2(gradTwo)*SIZE_3(gradTwo)] = sum / (float)sumelems; + } } +''' + +def cupy_kernel(strFunction, objVariables): + strKernel = globals()[strFunction] + + while True: + objMatch = re.search('(SIZE_)([0-4])(\()([^\)]*)(\))', strKernel) + + if objMatch is None: + break + # end + + intArg = int(objMatch.group(2)) + + strTensor = objMatch.group(4) + intSizes = objVariables[strTensor].size() + + strKernel = strKernel.replace(objMatch.group(), str(intSizes[intArg] if torch.is_tensor(intSizes[intArg]) == False else intSizes[intArg].item())) + + while True: + objMatch = re.search('(VALUE_)([0-4])(\()([^\)]+)(\))', strKernel) + + if objMatch is None: + break + # end + + intArgs = int(objMatch.group(2)) + strArgs = objMatch.group(4).split(',') + + strTensor = strArgs[0] + intStrides = objVariables[strTensor].stride() + strIndex = [ '((' + strArgs[intArg + 1].replace('{', '(').replace('}', ')').strip() + ')*' + str(intStrides[intArg] if torch.is_tensor(intStrides[intArg]) == False else intStrides[intArg].item()) + ')' for intArg in range(intArgs) ] + + strKernel = strKernel.replace(objMatch.group(0), strTensor + '[' + str.join('+', strIndex) + ']') + # end + + return strKernel +# end + +@cupy.memoize(for_each_device=True) +def cupy_launch(strFunction, strKernel): + if 'CUDA_HOME' not in os.environ: + os.environ['CUDA_HOME'] = cupy.cuda.get_cuda_path() + # end + + return cupy.RawKernel(strKernel, strFunction, tuple(['-I ' + os.environ['CUDA_HOME'], '-I ' + os.environ['CUDA_HOME'] + '/include'])) +# end + +class _FunctionCorrelation(torch.autograd.Function): + @staticmethod + def forward(self, one, two): + rbot0 = one.new_zeros([ one.shape[0], one.shape[2] + 8, one.shape[3] + 8, one.shape[1] ]) + rbot1 = one.new_zeros([ one.shape[0], one.shape[2] + 8, one.shape[3] + 8, one.shape[1] ]) + + one = one.contiguous(); assert(one.is_cuda == True) + two = 
two.contiguous(); assert(two.is_cuda == True) + + output = one.new_zeros([ one.shape[0], 81, one.shape[2], one.shape[3] ]) + + if one.is_cuda == True: + n = one.shape[2] * one.shape[3] + cupy_launch('kernel_Correlation_rearrange', cupy_kernel('kernel_Correlation_rearrange', { + 'input': one, + 'output': rbot0 + }))( + grid=tuple([ int((n + 16 - 1) / 16), one.shape[1], one.shape[0] ]), + block=tuple([ 16, 1, 1 ]), + args=[ cupy.int32(n), one.data_ptr(), rbot0.data_ptr() ] + ) + + n = two.shape[2] * two.shape[3] + cupy_launch('kernel_Correlation_rearrange', cupy_kernel('kernel_Correlation_rearrange', { + 'input': two, + 'output': rbot1 + }))( + grid=tuple([ int((n + 16 - 1) / 16), two.shape[1], two.shape[0] ]), + block=tuple([ 16, 1, 1 ]), + args=[ cupy.int32(n), two.data_ptr(), rbot1.data_ptr() ] + ) + + n = output.shape[1] * output.shape[2] * output.shape[3] + cupy_launch('kernel_Correlation_updateOutput', cupy_kernel('kernel_Correlation_updateOutput', { + 'rbot0': rbot0, + 'rbot1': rbot1, + 'top': output + }))( + grid=tuple([ output.shape[3], output.shape[2], output.shape[0] ]), + block=tuple([ 32, 1, 1 ]), + shared_mem=one.shape[1] * 4, + args=[ cupy.int32(n), rbot0.data_ptr(), rbot1.data_ptr(), output.data_ptr() ] + ) + + elif one.is_cuda == False: + raise NotImplementedError() + + # end + + self.save_for_backward(one, two, rbot0, rbot1) + + return output + # end + + @staticmethod + def backward(self, gradOutput): + one, two, rbot0, rbot1 = self.saved_tensors + + gradOutput = gradOutput.contiguous(); assert(gradOutput.is_cuda == True) + + gradOne = one.new_zeros([ one.shape[0], one.shape[1], one.shape[2], one.shape[3] ]) if self.needs_input_grad[0] == True else None + gradTwo = one.new_zeros([ one.shape[0], one.shape[1], one.shape[2], one.shape[3] ]) if self.needs_input_grad[1] == True else None + + if one.is_cuda == True: + if gradOne is not None: + for intSample in range(one.shape[0]): + n = one.shape[1] * one.shape[2] * one.shape[3] + cupy_launch('kernel_Correlation_updateGradOne', cupy_kernel('kernel_Correlation_updateGradOne', { + 'rbot0': rbot0, + 'rbot1': rbot1, + 'gradOutput': gradOutput, + 'gradOne': gradOne, + 'gradTwo': None + }))( + grid=tuple([ int((n + 512 - 1) / 512), 1, 1 ]), + block=tuple([ 512, 1, 1 ]), + args=[ cupy.int32(n), intSample, rbot0.data_ptr(), rbot1.data_ptr(), gradOutput.data_ptr(), gradOne.data_ptr(), None ] + ) + # end + # end + + if gradTwo is not None: + for intSample in range(one.shape[0]): + n = one.shape[1] * one.shape[2] * one.shape[3] + cupy_launch('kernel_Correlation_updateGradTwo', cupy_kernel('kernel_Correlation_updateGradTwo', { + 'rbot0': rbot0, + 'rbot1': rbot1, + 'gradOutput': gradOutput, + 'gradOne': None, + 'gradTwo': gradTwo + }))( + grid=tuple([ int((n + 512 - 1) / 512), 1, 1 ]), + block=tuple([ 512, 1, 1 ]), + args=[ cupy.int32(n), intSample, rbot0.data_ptr(), rbot1.data_ptr(), gradOutput.data_ptr(), None, gradTwo.data_ptr() ] + ) + # end + # end + + elif one.is_cuda == False: + raise NotImplementedError() + + # end + + return gradOne, gradTwo + # end +# end + +def FunctionCorrelation(tenOne, tenTwo): + return _FunctionCorrelation.apply(tenOne, tenTwo) +# end + +class ModuleCorrelation(torch.nn.Module): + def __init__(self): + super().__init__() + # end + + def forward(self, tenOne, tenTwo): + return _FunctionCorrelation.apply(tenOne, tenTwo) + # end +# end diff --git a/depth_anything_v2/softmax-splatting/images/README.md b/depth_anything_v2/softmax-splatting/images/README.md new file mode 100644 index 
0000000000000000000000000000000000000000..7f6d609da095387ad2d338da5ae1e39d6bab11f5 --- /dev/null +++ b/depth_anything_v2/softmax-splatting/images/README.md @@ -0,0 +1 @@ +The used example originates from the DAVIS challenge: https://davischallenge.org/ \ No newline at end of file diff --git a/depth_anything_v2/softmax-splatting/images/flow.flo b/depth_anything_v2/softmax-splatting/images/flow.flo new file mode 100644 index 0000000000000000000000000000000000000000..0bf7072ea25fca4af75bd9198c6405321004f6b5 Binary files /dev/null and b/depth_anything_v2/softmax-splatting/images/flow.flo differ diff --git a/depth_anything_v2/softmax-splatting/images/one.png b/depth_anything_v2/softmax-splatting/images/one.png new file mode 100644 index 0000000000000000000000000000000000000000..f9ae108c0d5c73151b295b798088478a5872711c Binary files /dev/null and b/depth_anything_v2/softmax-splatting/images/one.png differ diff --git a/depth_anything_v2/softmax-splatting/images/two.png b/depth_anything_v2/softmax-splatting/images/two.png new file mode 100644 index 0000000000000000000000000000000000000000..a38586f38e8f3285e0a9d4959a047924d3f7e3b6 Binary files /dev/null and b/depth_anything_v2/softmax-splatting/images/two.png differ diff --git a/depth_anything_v2/softmax-splatting/middlebury/Beanbags/frame10.png b/depth_anything_v2/softmax-splatting/middlebury/Beanbags/frame10.png new file mode 100644 index 0000000000000000000000000000000000000000..1f972cbae0d781395ec9912f78e09eebc5bad4e8 Binary files /dev/null and b/depth_anything_v2/softmax-splatting/middlebury/Beanbags/frame10.png differ diff --git a/depth_anything_v2/softmax-splatting/middlebury/Beanbags/frame10i11.png b/depth_anything_v2/softmax-splatting/middlebury/Beanbags/frame10i11.png new file mode 100644 index 0000000000000000000000000000000000000000..f93de7949421b2c81208efc88e79cfdec2cd69fb Binary files /dev/null and b/depth_anything_v2/softmax-splatting/middlebury/Beanbags/frame10i11.png differ diff --git a/depth_anything_v2/softmax-splatting/middlebury/Beanbags/frame11.png b/depth_anything_v2/softmax-splatting/middlebury/Beanbags/frame11.png new file mode 100644 index 0000000000000000000000000000000000000000..f90de6dd265455fcf1b0eb8ea8885cf1224e49c0 Binary files /dev/null and b/depth_anything_v2/softmax-splatting/middlebury/Beanbags/frame11.png differ diff --git a/depth_anything_v2/softmax-splatting/middlebury/Dimetrodon/frame10.png b/depth_anything_v2/softmax-splatting/middlebury/Dimetrodon/frame10.png new file mode 100644 index 0000000000000000000000000000000000000000..cb21d695603cc00f9397c86f223588698330bbac Binary files /dev/null and b/depth_anything_v2/softmax-splatting/middlebury/Dimetrodon/frame10.png differ diff --git a/depth_anything_v2/softmax-splatting/middlebury/Dimetrodon/frame10i11.png b/depth_anything_v2/softmax-splatting/middlebury/Dimetrodon/frame10i11.png new file mode 100644 index 0000000000000000000000000000000000000000..d563dfcfa461aca179cf6765fe08cfcd0d2df475 Binary files /dev/null and b/depth_anything_v2/softmax-splatting/middlebury/Dimetrodon/frame10i11.png differ diff --git a/depth_anything_v2/softmax-splatting/middlebury/Dimetrodon/frame11.png b/depth_anything_v2/softmax-splatting/middlebury/Dimetrodon/frame11.png new file mode 100644 index 0000000000000000000000000000000000000000..a7ad396a8e7472d1caedfd348ce96643b49d4aad Binary files /dev/null and b/depth_anything_v2/softmax-splatting/middlebury/Dimetrodon/frame11.png differ diff --git a/depth_anything_v2/softmax-splatting/middlebury/DogDance/frame10.png 
b/depth_anything_v2/softmax-splatting/middlebury/DogDance/frame10.png new file mode 100644 index 0000000000000000000000000000000000000000..42d50dc751219deb373c58efa55121ed2a717741 Binary files /dev/null and b/depth_anything_v2/softmax-splatting/middlebury/DogDance/frame10.png differ diff --git a/depth_anything_v2/softmax-splatting/middlebury/DogDance/frame10i11.png b/depth_anything_v2/softmax-splatting/middlebury/DogDance/frame10i11.png new file mode 100644 index 0000000000000000000000000000000000000000..323924ca657de0cd30387fa8ba25838b1f54fc91 Binary files /dev/null and b/depth_anything_v2/softmax-splatting/middlebury/DogDance/frame10i11.png differ diff --git a/depth_anything_v2/softmax-splatting/middlebury/DogDance/frame11.png b/depth_anything_v2/softmax-splatting/middlebury/DogDance/frame11.png new file mode 100644 index 0000000000000000000000000000000000000000..8f9f850ea8e94d2923262d128e24f432d425e903 Binary files /dev/null and b/depth_anything_v2/softmax-splatting/middlebury/DogDance/frame11.png differ diff --git a/depth_anything_v2/softmax-splatting/middlebury/Grove2/frame10.png b/depth_anything_v2/softmax-splatting/middlebury/Grove2/frame10.png new file mode 100644 index 0000000000000000000000000000000000000000..35b06ea069b5527040bba11bc8bab7d6ad46a23f Binary files /dev/null and b/depth_anything_v2/softmax-splatting/middlebury/Grove2/frame10.png differ diff --git a/depth_anything_v2/softmax-splatting/middlebury/Grove2/frame10i11.png b/depth_anything_v2/softmax-splatting/middlebury/Grove2/frame10i11.png new file mode 100644 index 0000000000000000000000000000000000000000..7af54d5c555bea2bf2bb10cb07fb2a98fe351152 Binary files /dev/null and b/depth_anything_v2/softmax-splatting/middlebury/Grove2/frame10i11.png differ diff --git a/depth_anything_v2/softmax-splatting/middlebury/Grove2/frame11.png b/depth_anything_v2/softmax-splatting/middlebury/Grove2/frame11.png new file mode 100644 index 0000000000000000000000000000000000000000..5b735f27e51f56e9b8bbfd5a56acefec4dffdb62 Binary files /dev/null and b/depth_anything_v2/softmax-splatting/middlebury/Grove2/frame11.png differ diff --git a/depth_anything_v2/softmax-splatting/middlebury/Grove3/frame10.png b/depth_anything_v2/softmax-splatting/middlebury/Grove3/frame10.png new file mode 100644 index 0000000000000000000000000000000000000000..2e996bb7e73597f460270b11edce3bbdbf757b41 Binary files /dev/null and b/depth_anything_v2/softmax-splatting/middlebury/Grove3/frame10.png differ diff --git a/depth_anything_v2/softmax-splatting/middlebury/Grove3/frame10i11.png b/depth_anything_v2/softmax-splatting/middlebury/Grove3/frame10i11.png new file mode 100644 index 0000000000000000000000000000000000000000..0c6a8e14cf218540fce473878f2982523708ee0b Binary files /dev/null and b/depth_anything_v2/softmax-splatting/middlebury/Grove3/frame10i11.png differ diff --git a/depth_anything_v2/softmax-splatting/middlebury/Grove3/frame11.png b/depth_anything_v2/softmax-splatting/middlebury/Grove3/frame11.png new file mode 100644 index 0000000000000000000000000000000000000000..930d0741e860e648ada9bf8c5eee0b744f5f1e4f Binary files /dev/null and b/depth_anything_v2/softmax-splatting/middlebury/Grove3/frame11.png differ diff --git a/depth_anything_v2/softmax-splatting/middlebury/Hydrangea/frame10.png b/depth_anything_v2/softmax-splatting/middlebury/Hydrangea/frame10.png new file mode 100644 index 0000000000000000000000000000000000000000..966b942e26642b4f23842d413881a19854944032 Binary files /dev/null and 
b/depth_anything_v2/softmax-splatting/middlebury/Hydrangea/frame10.png differ diff --git a/depth_anything_v2/softmax-splatting/middlebury/Hydrangea/frame10i11.png b/depth_anything_v2/softmax-splatting/middlebury/Hydrangea/frame10i11.png new file mode 100644 index 0000000000000000000000000000000000000000..768d6e4798630a1210518652c7a3d90769e1d0e5 Binary files /dev/null and b/depth_anything_v2/softmax-splatting/middlebury/Hydrangea/frame10i11.png differ diff --git a/depth_anything_v2/softmax-splatting/middlebury/Hydrangea/frame11.png b/depth_anything_v2/softmax-splatting/middlebury/Hydrangea/frame11.png new file mode 100644 index 0000000000000000000000000000000000000000..dca24764407d4ac47031f702580006e1fab1c512 Binary files /dev/null and b/depth_anything_v2/softmax-splatting/middlebury/Hydrangea/frame11.png differ diff --git a/depth_anything_v2/softmax-splatting/middlebury/MiniCooper/frame10.png b/depth_anything_v2/softmax-splatting/middlebury/MiniCooper/frame10.png new file mode 100644 index 0000000000000000000000000000000000000000..d39fb28feaba40e8f47931ce68550d10524e0fa7 Binary files /dev/null and b/depth_anything_v2/softmax-splatting/middlebury/MiniCooper/frame10.png differ diff --git a/depth_anything_v2/softmax-splatting/middlebury/MiniCooper/frame10i11.png b/depth_anything_v2/softmax-splatting/middlebury/MiniCooper/frame10i11.png new file mode 100644 index 0000000000000000000000000000000000000000..3f31b3930d0dccf5a00862e8c256aac0e03e13b6 Binary files /dev/null and b/depth_anything_v2/softmax-splatting/middlebury/MiniCooper/frame10i11.png differ diff --git a/depth_anything_v2/softmax-splatting/middlebury/MiniCooper/frame11.png b/depth_anything_v2/softmax-splatting/middlebury/MiniCooper/frame11.png new file mode 100644 index 0000000000000000000000000000000000000000..a80f3f42371837f988e8b323c9bd29031aaeb425 Binary files /dev/null and b/depth_anything_v2/softmax-splatting/middlebury/MiniCooper/frame11.png differ diff --git a/depth_anything_v2/softmax-splatting/middlebury/README.md b/depth_anything_v2/softmax-splatting/middlebury/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d9d7fd336f9f04a3427ee5d43c2a8edd627dc38f --- /dev/null +++ b/depth_anything_v2/softmax-splatting/middlebury/README.md @@ -0,0 +1 @@ +The images in this folder originate from the Middlebury benchmark for Optical Flow: http://vision.middlebury.edu/flow \ No newline at end of file diff --git a/depth_anything_v2/softmax-splatting/middlebury/RubberWhale/frame10.png b/depth_anything_v2/softmax-splatting/middlebury/RubberWhale/frame10.png new file mode 100644 index 0000000000000000000000000000000000000000..e2b08dba290ef18233cddfbcb0e3b6fb03ded39d Binary files /dev/null and b/depth_anything_v2/softmax-splatting/middlebury/RubberWhale/frame10.png differ diff --git a/depth_anything_v2/softmax-splatting/middlebury/RubberWhale/frame10i11.png b/depth_anything_v2/softmax-splatting/middlebury/RubberWhale/frame10i11.png new file mode 100644 index 0000000000000000000000000000000000000000..82c387551ef6334300fdc71fe44bf808c3e09650 Binary files /dev/null and b/depth_anything_v2/softmax-splatting/middlebury/RubberWhale/frame10i11.png differ diff --git a/depth_anything_v2/softmax-splatting/middlebury/RubberWhale/frame11.png b/depth_anything_v2/softmax-splatting/middlebury/RubberWhale/frame11.png new file mode 100644 index 0000000000000000000000000000000000000000..f365370a4c7c8c7c3066afb64d1033c6469dc4f7 Binary files /dev/null and b/depth_anything_v2/softmax-splatting/middlebury/RubberWhale/frame11.png differ 
diff --git a/depth_anything_v2/softmax-splatting/middlebury/Urban2/frame10.png b/depth_anything_v2/softmax-splatting/middlebury/Urban2/frame10.png new file mode 100644 index 0000000000000000000000000000000000000000..42abc03f0eb3f906ddb23380ec6c14be2aedf6b2 Binary files /dev/null and b/depth_anything_v2/softmax-splatting/middlebury/Urban2/frame10.png differ diff --git a/depth_anything_v2/softmax-splatting/middlebury/Urban2/frame10i11.png b/depth_anything_v2/softmax-splatting/middlebury/Urban2/frame10i11.png new file mode 100644 index 0000000000000000000000000000000000000000..16904afd68ac309627500561a07db733586a1fe3 Binary files /dev/null and b/depth_anything_v2/softmax-splatting/middlebury/Urban2/frame10i11.png differ diff --git a/depth_anything_v2/softmax-splatting/middlebury/Urban2/frame11.png b/depth_anything_v2/softmax-splatting/middlebury/Urban2/frame11.png new file mode 100644 index 0000000000000000000000000000000000000000..b01c3eaecef6945bb889676993eca073b0f0eb1b Binary files /dev/null and b/depth_anything_v2/softmax-splatting/middlebury/Urban2/frame11.png differ diff --git a/depth_anything_v2/softmax-splatting/middlebury/Urban3/frame10.png b/depth_anything_v2/softmax-splatting/middlebury/Urban3/frame10.png new file mode 100644 index 0000000000000000000000000000000000000000..7386b3614a2d7115e017521e83d2e8da2b82d08d Binary files /dev/null and b/depth_anything_v2/softmax-splatting/middlebury/Urban3/frame10.png differ diff --git a/depth_anything_v2/softmax-splatting/middlebury/Urban3/frame10i11.png b/depth_anything_v2/softmax-splatting/middlebury/Urban3/frame10i11.png new file mode 100644 index 0000000000000000000000000000000000000000..87751bd44ff1ad27a6cb50db03621280f5adea34 Binary files /dev/null and b/depth_anything_v2/softmax-splatting/middlebury/Urban3/frame10i11.png differ diff --git a/depth_anything_v2/softmax-splatting/middlebury/Urban3/frame11.png b/depth_anything_v2/softmax-splatting/middlebury/Urban3/frame11.png new file mode 100644 index 0000000000000000000000000000000000000000..41eed9c95c47ed30c34842a277d79d5ba3dde344 Binary files /dev/null and b/depth_anything_v2/softmax-splatting/middlebury/Urban3/frame11.png differ diff --git a/depth_anything_v2/softmax-splatting/middlebury/Venus/frame10.png b/depth_anything_v2/softmax-splatting/middlebury/Venus/frame10.png new file mode 100644 index 0000000000000000000000000000000000000000..3876ff8d402c6157baa243f271d61b43f895c698 Binary files /dev/null and b/depth_anything_v2/softmax-splatting/middlebury/Venus/frame10.png differ diff --git a/depth_anything_v2/softmax-splatting/middlebury/Venus/frame10i11.png b/depth_anything_v2/softmax-splatting/middlebury/Venus/frame10i11.png new file mode 100644 index 0000000000000000000000000000000000000000..45d5957d7e9b9212cead8a6f6563dcf2ec4fa410 Binary files /dev/null and b/depth_anything_v2/softmax-splatting/middlebury/Venus/frame10i11.png differ diff --git a/depth_anything_v2/softmax-splatting/middlebury/Venus/frame11.png b/depth_anything_v2/softmax-splatting/middlebury/Venus/frame11.png new file mode 100644 index 0000000000000000000000000000000000000000..80910c4199290af90542de883df38886f46340b4 Binary files /dev/null and b/depth_anything_v2/softmax-splatting/middlebury/Venus/frame11.png differ diff --git a/depth_anything_v2/softmax-splatting/middlebury/Walking/frame10.png b/depth_anything_v2/softmax-splatting/middlebury/Walking/frame10.png new file mode 100644 index 0000000000000000000000000000000000000000..f7bf4dda36e481c5f4cc7445372cfd3fc6e69afd Binary files /dev/null and 
b/depth_anything_v2/softmax-splatting/middlebury/Walking/frame10.png differ diff --git a/depth_anything_v2/softmax-splatting/middlebury/Walking/frame10i11.png b/depth_anything_v2/softmax-splatting/middlebury/Walking/frame10i11.png new file mode 100644 index 0000000000000000000000000000000000000000..525bb3312c3125255cb5763e31bc5a84a40b9d96 Binary files /dev/null and b/depth_anything_v2/softmax-splatting/middlebury/Walking/frame10i11.png differ diff --git a/depth_anything_v2/softmax-splatting/middlebury/Walking/frame11.png b/depth_anything_v2/softmax-splatting/middlebury/Walking/frame11.png new file mode 100644 index 0000000000000000000000000000000000000000..fb0a10b3a4b2770a750a8d24f09671db1b64e5e5 Binary files /dev/null and b/depth_anything_v2/softmax-splatting/middlebury/Walking/frame11.png differ diff --git a/depth_anything_v2/softmax-splatting/requirements.txt b/depth_anything_v2/softmax-splatting/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..47537b4922f41ef6820c45834281edf21ab52b6e --- /dev/null +++ b/depth_anything_v2/softmax-splatting/requirements.txt @@ -0,0 +1,4 @@ +cupy>=5.0.0 +numpy>=1.15.0 +opencv-contrib-python>=3.4.0 +torch>=1.6.0 \ No newline at end of file diff --git a/depth_anything_v2/softmax-splatting/run.py b/depth_anything_v2/softmax-splatting/run.py new file mode 100644 index 0000000000000000000000000000000000000000..436dbe63ff5176af1b0284b54c760062e4914b53 --- /dev/null +++ b/depth_anything_v2/softmax-splatting/run.py @@ -0,0 +1,648 @@ +#!/usr/bin/env python + +import getopt +import math +import numpy +import PIL +import PIL.Image +import sys +import torch +import typing + +import softsplat # the custom softmax splatting layer + +try: + from .correlation import correlation # the custom cost volume layer +except: + sys.path.insert(0, './correlation'); import correlation # you should consider upgrading python +# end + +########################################################## + +torch.set_grad_enabled(False) # make sure to not compute gradients for computational performance + +torch.backends.cudnn.enabled = True # make sure to use cudnn for computational performance + +########################################################## + +args_strModel = 'lf' +args_strOne = './images/one.png' +args_strTwo = './images/two.png' +args_strVideo = './videos/car-turn.mp4' +args_strOut = './out.png' + +for strOption, strArg in getopt.getopt(sys.argv[1:], '', [ + 'model=', + 'one=', + 'two=', + 'video=', + 'out=', +])[0]: + if strOption == '--model' and strArg != '': args_strModel = strArg # which model to use + if strOption == '--one' and strArg != '': args_strOne = strArg # path to the first frame + if strOption == '--two' and strArg != '': args_strTwo = strArg # path to the second frame + if strOption == '--video' and strArg != '': args_strVideo = strArg # path to a video + if strOption == '--out' and strArg != '': args_strOut = strArg # path to where the output should be stored +# end + +########################################################## + +def read_flo(strFile): + with open(strFile, 'rb') as objFile: + strFlow = objFile.read() + # end + + assert(numpy.frombuffer(buffer=strFlow, dtype=numpy.float32, count=1, offset=0) == 202021.25) + + intWidth = numpy.frombuffer(buffer=strFlow, dtype=numpy.int32, count=1, offset=4)[0] + intHeight = numpy.frombuffer(buffer=strFlow, dtype=numpy.int32, count=1, offset=8)[0] + + return numpy.frombuffer(buffer=strFlow, dtype=numpy.float32, count=intHeight * intWidth * 2, offset=12).reshape(intHeight, 
intWidth, 2) +# end + +########################################################## + +backwarp_tenGrid = {} + +def backwarp(tenIn, tenFlow): + if str(tenFlow.shape) not in backwarp_tenGrid: + tenHor = torch.linspace(start=-1.0, end=1.0, steps=tenFlow.shape[3], dtype=tenFlow.dtype, device=tenFlow.device).view(1, 1, 1, -1).repeat(1, 1, tenFlow.shape[2], 1) + tenVer = torch.linspace(start=-1.0, end=1.0, steps=tenFlow.shape[2], dtype=tenFlow.dtype, device=tenFlow.device).view(1, 1, -1, 1).repeat(1, 1, 1, tenFlow.shape[3]) + + backwarp_tenGrid[str(tenFlow.shape)] = torch.cat([tenHor, tenVer], 1).cuda() + # end + + tenFlow = torch.cat([tenFlow[:, 0:1, :, :] / ((tenIn.shape[3] - 1.0) / 2.0), tenFlow[:, 1:2, :, :] / ((tenIn.shape[2] - 1.0) / 2.0)], 1) + + return torch.nn.functional.grid_sample(input=tenIn, grid=(backwarp_tenGrid[str(tenFlow.shape)] + tenFlow).permute(0, 2, 3, 1), mode='bilinear', padding_mode='zeros', align_corners=True) +# end + +########################################################## + +class Flow(torch.nn.Module): + def __init__(self): + super().__init__() + + class Extractor(torch.nn.Module): + def __init__(self): + super().__init__() + + self.netFirst = torch.nn.Sequential( + torch.nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=2, padding=1), + torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), + torch.nn.Conv2d(in_channels=16, out_channels=16, kernel_size=3, stride=1, padding=1), + torch.nn.LeakyReLU(inplace=False, negative_slope=0.1) + ) + + self.netSecond = torch.nn.Sequential( + torch.nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=2, padding=1), + torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), + torch.nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=1, padding=1), + torch.nn.LeakyReLU(inplace=False, negative_slope=0.1) + ) + + self.netThird = torch.nn.Sequential( + torch.nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=2, padding=1), + torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), + torch.nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1), + torch.nn.LeakyReLU(inplace=False, negative_slope=0.1) + ) + + self.netFourth = torch.nn.Sequential( + torch.nn.Conv2d(in_channels=64, out_channels=96, kernel_size=3, stride=2, padding=1), + torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), + torch.nn.Conv2d(in_channels=96, out_channels=96, kernel_size=3, stride=1, padding=1), + torch.nn.LeakyReLU(inplace=False, negative_slope=0.1) + ) + + self.netFifth = torch.nn.Sequential( + torch.nn.Conv2d(in_channels=96, out_channels=128, kernel_size=3, stride=2, padding=1), + torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), + torch.nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1), + torch.nn.LeakyReLU(inplace=False, negative_slope=0.1) + ) + + self.netSixth = torch.nn.Sequential( + torch.nn.Conv2d(in_channels=128, out_channels=192, kernel_size=3, stride=2, padding=1), + torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), + torch.nn.Conv2d(in_channels=192, out_channels=192, kernel_size=3, stride=1, padding=1), + torch.nn.LeakyReLU(inplace=False, negative_slope=0.1) + ) + # end + + def forward(self, tenInput): + tenFirst = self.netFirst(tenInput) + tenSecond = self.netSecond(tenFirst) + tenThird = self.netThird(tenSecond) + tenFourth = self.netFourth(tenThird) + tenFifth = self.netFifth(tenFourth) + tenSixth = self.netSixth(tenFifth) + + return [tenFirst, tenSecond, tenThird, tenFourth, tenFifth, tenSixth] + # end + # end + + class 
Decoder(torch.nn.Module): + def __init__(self, intChannels): + super().__init__() + + self.netMain = torch.nn.Sequential( + torch.nn.Conv2d(in_channels=intChannels, out_channels=128, kernel_size=3, stride=1, padding=1), + torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), + torch.nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1), + torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), + torch.nn.Conv2d(in_channels=128, out_channels=96, kernel_size=3, stride=1, padding=1), + torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), + torch.nn.Conv2d(in_channels=96, out_channels=64, kernel_size=3, stride=1, padding=1), + torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), + torch.nn.Conv2d(in_channels=64, out_channels=32, kernel_size=3, stride=1, padding=1), + torch.nn.LeakyReLU(inplace=False, negative_slope=0.1), + torch.nn.Conv2d(in_channels=32, out_channels=2, kernel_size=3, stride=1, padding=1) + ) + # end + + def forward(self, tenOne, tenTwo, objPrevious): + intWidth = tenOne.shape[3] and tenTwo.shape[3] + intHeight = tenOne.shape[2] and tenTwo.shape[2] + + tenMain = None + + if objPrevious is None: + tenVolume = correlation.FunctionCorrelation(tenOne=tenOne, tenTwo=tenTwo) + + tenMain = torch.cat([tenOne, tenVolume], 1) + + elif objPrevious is not None: + tenForward = torch.nn.functional.interpolate(input=objPrevious['tenForward'], size=(intHeight, intWidth), mode='bilinear', align_corners=False) / float(objPrevious['tenForward'].shape[3]) * float(intWidth) + + tenVolume = correlation.FunctionCorrelation(tenOne=tenOne, tenTwo=backwarp(tenTwo, tenForward)) + + tenMain = torch.cat([tenOne, tenVolume, tenForward], 1) + + # end + + return { + 'tenForward': self.netMain(tenMain) + } + # end + # end + + self.netExtractor = Extractor() + + self.netFirst = Decoder(16 + 81 + 2) + self.netSecond = Decoder(32 + 81 + 2) + self.netThird = Decoder(64 + 81 + 2) + self.netFourth = Decoder(96 + 81 + 2) + self.netFifth = Decoder(128 + 81 + 2) + self.netSixth = Decoder(192 + 81) + # end + + def forward(self, tenOne, tenTwo): + intWidth = tenOne.shape[3] and tenTwo.shape[3] + intHeight = tenOne.shape[2] and tenTwo.shape[2] + + tenOne = self.netExtractor(tenOne) + tenTwo = self.netExtractor(tenTwo) + + objForward = None + objBackward = None + + objForward = self.netSixth(tenOne[-1], tenTwo[-1], objForward) + objBackward = self.netSixth(tenTwo[-1], tenOne[-1], objBackward) + + objForward = self.netFifth(tenOne[-2], tenTwo[-2], objForward) + objBackward = self.netFifth(tenTwo[-2], tenOne[-2], objBackward) + + objForward = self.netFourth(tenOne[-3], tenTwo[-3], objForward) + objBackward = self.netFourth(tenTwo[-3], tenOne[-3], objBackward) + + objForward = self.netThird(tenOne[-4], tenTwo[-4], objForward) + objBackward = self.netThird(tenTwo[-4], tenOne[-4], objBackward) + + objForward = self.netSecond(tenOne[-5], tenTwo[-5], objForward) + objBackward = self.netSecond(tenTwo[-5], tenOne[-5], objBackward) + + objForward = self.netFirst(tenOne[-6], tenTwo[-6], objForward) + objBackward = self.netFirst(tenTwo[-6], tenOne[-6], objBackward) + + return { + 'tenForward': torch.nn.functional.interpolate(input=objForward['tenForward'], size=(intHeight, intWidth), mode='bilinear', align_corners=False) * (float(intWidth) / float(objForward['tenForward'].shape[3])), + 'tenBackward': torch.nn.functional.interpolate(input=objBackward['tenForward'], size=(intHeight, intWidth), mode='bilinear', align_corners=False) * (float(intWidth) / float(objBackward['tenForward'].shape[3])) + } + # end 
+# end + +########################################################## + +class Synthesis(torch.nn.Module): + def __init__(self): + super().__init__() + + class Basic(torch.nn.Module): + def __init__(self, strType, intChannels, boolSkip): + super().__init__() + + if strType == 'relu-conv-relu-conv': + self.netMain = torch.nn.Sequential( + torch.nn.PReLU(num_parameters=intChannels[0], init=0.25), + torch.nn.Conv2d(in_channels=intChannels[0], out_channels=intChannels[1], kernel_size=3, stride=1, padding=1, bias=False), + torch.nn.PReLU(num_parameters=intChannels[1], init=0.25), + torch.nn.Conv2d(in_channels=intChannels[1], out_channels=intChannels[2], kernel_size=3, stride=1, padding=1, bias=False) + ) + + elif strType == 'conv-relu-conv': + self.netMain = torch.nn.Sequential( + torch.nn.Conv2d(in_channels=intChannels[0], out_channels=intChannels[1], kernel_size=3, stride=1, padding=1, bias=False), + torch.nn.PReLU(num_parameters=intChannels[1], init=0.25), + torch.nn.Conv2d(in_channels=intChannels[1], out_channels=intChannels[2], kernel_size=3, stride=1, padding=1, bias=False) + ) + + # end + + self.boolSkip = boolSkip + + if boolSkip == True: + if intChannels[0] == intChannels[2]: + self.netShortcut = None + + elif intChannels[0] != intChannels[2]: + self.netShortcut = torch.nn.Conv2d(in_channels=intChannels[0], out_channels=intChannels[2], kernel_size=1, stride=1, padding=0, bias=False) + + # end + # end + # end + + def forward(self, tenInput): + if self.boolSkip == False: + return self.netMain(tenInput) + # end + + if self.netShortcut is None: + return self.netMain(tenInput) + tenInput + + elif self.netShortcut is not None: + return self.netMain(tenInput) + self.netShortcut(tenInput) + + # end + # end + # end + + class Downsample(torch.nn.Module): + def __init__(self, intChannels): + super().__init__() + + self.netMain = torch.nn.Sequential( + torch.nn.PReLU(num_parameters=intChannels[0], init=0.25), + torch.nn.Conv2d(in_channels=intChannels[0], out_channels=intChannels[1], kernel_size=3, stride=2, padding=1, bias=False), + torch.nn.PReLU(num_parameters=intChannels[1], init=0.25), + torch.nn.Conv2d(in_channels=intChannels[1], out_channels=intChannels[2], kernel_size=3, stride=1, padding=1, bias=False) + ) + # end + + def forward(self, tenInput): + return self.netMain(tenInput) + # end + # end + + class Upsample(torch.nn.Module): + def __init__(self, intChannels): + super().__init__() + + self.netMain = torch.nn.Sequential( + torch.nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False), + torch.nn.PReLU(num_parameters=intChannels[0], init=0.25), + torch.nn.Conv2d(in_channels=intChannels[0], out_channels=intChannels[1], kernel_size=3, stride=1, padding=1, bias=False), + torch.nn.PReLU(num_parameters=intChannels[1], init=0.25), + torch.nn.Conv2d(in_channels=intChannels[1], out_channels=intChannels[2], kernel_size=3, stride=1, padding=1, bias=False) + ) + # end + + def forward(self, tenInput): + return self.netMain(tenInput) + # end + # end + + class Encode(torch.nn.Module): + def __init__(self): + super().__init__() + + self.netOne = torch.nn.Sequential( + torch.nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, stride=1, padding=1, bias=False), + torch.nn.PReLU(num_parameters=32, init=0.25), + torch.nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=1, padding=1, bias=False), + torch.nn.PReLU(num_parameters=32, init=0.25) + ) + + self.netTwo = torch.nn.Sequential( + torch.nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=2, padding=1, 
bias=False), + torch.nn.PReLU(num_parameters=64, init=0.25), + torch.nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1, bias=False), + torch.nn.PReLU(num_parameters=64, init=0.25) + ) + + self.netThr = torch.nn.Sequential( + torch.nn.Conv2d(in_channels=64, out_channels=96, kernel_size=3, stride=2, padding=1, bias=False), + torch.nn.PReLU(num_parameters=96, init=0.25), + torch.nn.Conv2d(in_channels=96, out_channels=96, kernel_size=3, stride=1, padding=1, bias=False), + torch.nn.PReLU(num_parameters=96, init=0.25) + ) + # end + + def forward(self, tenInput): + tenOutput = [] + + tenOutput.append(self.netOne(tenInput)) + tenOutput.append(self.netTwo(tenOutput[-1])) + tenOutput.append(self.netThr(tenOutput[-1])) + + return [torch.cat([tenInput, tenOutput[0]], 1)] + tenOutput[1:] + # end + # end + + class Softmetric(torch.nn.Module): + def __init__(self): + super().__init__() + + self.netInput = torch.nn.Conv2d(in_channels=3, out_channels=12, kernel_size=3, stride=1, padding=1, bias=False) + self.netError = torch.nn.Conv2d(in_channels=1, out_channels=4, kernel_size=3, stride=1, padding=1, bias=False) + + for intRow, intFeatures in [(0, 16), (1, 32), (2, 64), (3, 96)]: + self.add_module(str(intRow) + 'x0' + ' - ' + str(intRow) + 'x1', Basic('relu-conv-relu-conv', [intFeatures, intFeatures, intFeatures], True)) + # end + + for intCol in [0]: + self.add_module('0x' + str(intCol) + ' - ' + '1x' + str(intCol), Downsample([16, 32, 32])) + self.add_module('1x' + str(intCol) + ' - ' + '2x' + str(intCol), Downsample([32, 64, 64])) + self.add_module('2x' + str(intCol) + ' - ' + '3x' + str(intCol), Downsample([64, 96, 96])) + # end + + for intCol in [1]: + self.add_module('3x' + str(intCol) + ' - ' + '2x' + str(intCol), Upsample([96, 64, 64])) + self.add_module('2x' + str(intCol) + ' - ' + '1x' + str(intCol), Upsample([64, 32, 32])) + self.add_module('1x' + str(intCol) + ' - ' + '0x' + str(intCol), Upsample([32, 16, 16])) + # end + + self.netOutput = Basic('conv-relu-conv', [16, 16, 1], True) + # end + + def forward(self, tenEncone, tenEnctwo, tenFlow): + tenColumn = [None, None, None, None] + + tenColumn[0] = torch.cat([self.netInput(tenEncone[0][:, 0:3, :, :]), self.netError(torch.nn.functional.l1_loss(input=tenEncone[0], target=backwarp(tenEnctwo[0], tenFlow), reduction='none').mean([1], True))], 1) + tenColumn[1] = self._modules['0x0 - 1x0'](tenColumn[0]) + tenColumn[2] = self._modules['1x0 - 2x0'](tenColumn[1]) + tenColumn[3] = self._modules['2x0 - 3x0'](tenColumn[2]) + + intColumn = 1 + for intRow in range(len(tenColumn) -1, -1, -1): + tenColumn[intRow] = self._modules[str(intRow) + 'x' + str(intColumn - 1) + ' - ' + str(intRow) + 'x' + str(intColumn)](tenColumn[intRow]) + if intRow != len(tenColumn) - 1: + tenUp = self._modules[str(intRow + 1) + 'x' + str(intColumn) + ' - ' + str(intRow) + 'x' + str(intColumn)](tenColumn[intRow + 1]) + + if tenUp.shape[2] != tenColumn[intRow].shape[2]: tenUp = torch.nn.functional.pad(input=tenUp, pad=[0, 0, 0, -1], mode='constant', value=0.0) + if tenUp.shape[3] != tenColumn[intRow].shape[3]: tenUp = torch.nn.functional.pad(input=tenUp, pad=[0, -1, 0, 0], mode='constant', value=0.0) + + tenColumn[intRow] = tenColumn[intRow] + tenUp + # end + # end + + return self.netOutput(tenColumn[0]) + # end + # end + + class Warp(torch.nn.Module): + def __init__(self): + super().__init__() + + self.netOne = Basic('conv-relu-conv', [3 + 3 + 32 + 32 + 1 + 1, 32, 32], True) + self.netTwo = Basic('conv-relu-conv', [0 + 0 + 64 + 64 + 1 + 1, 64, 64], True) 
+ self.netThr = Basic('conv-relu-conv', [0 + 0 + 96 + 96 + 1 + 1, 96, 96], True) + # end + + def forward(self, tenEncone, tenEnctwo, tenMetricone, tenMetrictwo, tenForward, tenBackward): + tenOutput = [] + + for intLevel in range(3): + if intLevel != 0: + tenMetricone = torch.nn.functional.interpolate(input=tenMetricone, size=(tenEncone[intLevel].shape[2], tenEncone[intLevel].shape[3]), mode='bilinear', align_corners=False) + tenMetrictwo = torch.nn.functional.interpolate(input=tenMetrictwo, size=(tenEnctwo[intLevel].shape[2], tenEnctwo[intLevel].shape[3]), mode='bilinear', align_corners=False) + + tenForward = torch.nn.functional.interpolate(input=tenForward, size=(tenEncone[intLevel].shape[2], tenEncone[intLevel].shape[3]), mode='bilinear', align_corners=False) * (float(tenEncone[intLevel].shape[3]) / float(tenForward.shape[3])) + tenBackward = torch.nn.functional.interpolate(input=tenBackward, size=(tenEnctwo[intLevel].shape[2], tenEnctwo[intLevel].shape[3]), mode='bilinear', align_corners=False) * (float(tenEnctwo[intLevel].shape[3]) / float(tenBackward.shape[3])) + # end + + tenOutput.append([self.netOne, self.netTwo, self.netThr][intLevel](torch.cat([ + softsplat.softsplat(tenIn=torch.cat([tenEncone[intLevel], tenMetricone], 1), tenFlow=tenForward, tenMetric=tenMetricone.neg().clip(-20.0, 20.0), strMode='soft'), + softsplat.softsplat(tenIn=torch.cat([tenEnctwo[intLevel], tenMetrictwo], 1), tenFlow=tenBackward, tenMetric=tenMetrictwo.neg().clip(-20.0, 20.0), strMode='soft') + ], 1))) + # end + + return tenOutput + # end + # end + + self.netEncode = Encode() + + self.netSoftmetric = Softmetric() + + self.netWarp = Warp() + + for intRow, intFeatures in [(0, 32), (1, 64), (2, 96)]: + self.add_module(str(intRow) + 'x0' + ' - ' + str(intRow) + 'x1', Basic('relu-conv-relu-conv', [intFeatures, intFeatures, intFeatures], True)) + self.add_module(str(intRow) + 'x1' + ' - ' + str(intRow) + 'x2', Basic('relu-conv-relu-conv', [intFeatures, intFeatures, intFeatures], True)) + self.add_module(str(intRow) + 'x2' + ' - ' + str(intRow) + 'x3', Basic('relu-conv-relu-conv', [intFeatures, intFeatures, intFeatures], True)) + self.add_module(str(intRow) + 'x3' + ' - ' + str(intRow) + 'x4', Basic('relu-conv-relu-conv', [intFeatures, intFeatures, intFeatures], True)) + self.add_module(str(intRow) + 'x4' + ' - ' + str(intRow) + 'x5', Basic('relu-conv-relu-conv', [intFeatures, intFeatures, intFeatures], True)) + # end + + for intCol in [0, 1, 2]: + self.add_module('0x' + str(intCol) + ' - ' + '1x' + str(intCol), Downsample([32, 64, 64])) + self.add_module('1x' + str(intCol) + ' - ' + '2x' + str(intCol), Downsample([64, 96, 96])) + # end + + for intCol in [3, 4, 5]: + self.add_module('2x' + str(intCol) + ' - ' + '1x' + str(intCol), Upsample([96, 64, 64])) + self.add_module('1x' + str(intCol) + ' - ' + '0x' + str(intCol), Upsample([64, 32, 32])) + # end + + self.netOutput = Basic('conv-relu-conv', [32, 32, 3], True) + # end + + def forward(self, tenOne, tenTwo, tenForward, tenBackward, fltTime): + tenEncone = self.netEncode(tenOne) + tenEnctwo = self.netEncode(tenTwo) + + tenMetricone = self.netSoftmetric(tenEncone, tenEnctwo, tenForward) * 2.0 * fltTime + tenMetrictwo = self.netSoftmetric(tenEnctwo, tenEncone, tenBackward) * 2.0 * (1.0 - fltTime) + + tenForward = tenForward * fltTime + tenBackward = tenBackward * (1.0 - fltTime) + + tenWarp = self.netWarp(tenEncone, tenEnctwo, tenMetricone, tenMetrictwo, tenForward, tenBackward) + + tenColumn = [None, None, None] + + tenColumn[0] = tenWarp[0] + tenColumn[1] = 
tenWarp[1] + self._modules['0x0 - 1x0'](tenColumn[0]) + tenColumn[2] = tenWarp[2] + self._modules['1x0 - 2x0'](tenColumn[1]) + + intColumn = 1 + for intRow in range(len(tenColumn)): + tenColumn[intRow] = self._modules[str(intRow) + 'x' + str(intColumn - 1) + ' - ' + str(intRow) + 'x' + str(intColumn)](tenColumn[intRow]) + if intRow != 0: + tenColumn[intRow] = tenColumn[intRow] + self._modules[str(intRow - 1) + 'x' + str(intColumn) + ' - ' + str(intRow) + 'x' + str(intColumn)](tenColumn[intRow - 1]) + # end + # end + + intColumn = 2 + for intRow in range(len(tenColumn)): + tenColumn[intRow] = self._modules[str(intRow) + 'x' + str(intColumn - 1) + ' - ' + str(intRow) + 'x' + str(intColumn)](tenColumn[intRow]) + if intRow != 0: + tenColumn[intRow] = tenColumn[intRow] + self._modules[str(intRow - 1) + 'x' + str(intColumn) + ' - ' + str(intRow) + 'x' + str(intColumn)](tenColumn[intRow - 1]) + # end + # end + + intColumn = 3 + for intRow in range(len(tenColumn) -1, -1, -1): + tenColumn[intRow] = self._modules[str(intRow) + 'x' + str(intColumn - 1) + ' - ' + str(intRow) + 'x' + str(intColumn)](tenColumn[intRow]) + if intRow != len(tenColumn) - 1: + tenUp = self._modules[str(intRow + 1) + 'x' + str(intColumn) + ' - ' + str(intRow) + 'x' + str(intColumn)](tenColumn[intRow + 1]) + + if tenUp.shape[2] != tenColumn[intRow].shape[2]: tenUp = torch.nn.functional.pad(input=tenUp, pad=[0, 0, 0, -1], mode='constant', value=0.0) + if tenUp.shape[3] != tenColumn[intRow].shape[3]: tenUp = torch.nn.functional.pad(input=tenUp, pad=[0, -1, 0, 0], mode='constant', value=0.0) + + tenColumn[intRow] = tenColumn[intRow] + tenUp + # end + # end + + intColumn = 4 + for intRow in range(len(tenColumn) -1, -1, -1): + tenColumn[intRow] = self._modules[str(intRow) + 'x' + str(intColumn - 1) + ' - ' + str(intRow) + 'x' + str(intColumn)](tenColumn[intRow]) + if intRow != len(tenColumn) - 1: + tenUp = self._modules[str(intRow + 1) + 'x' + str(intColumn) + ' - ' + str(intRow) + 'x' + str(intColumn)](tenColumn[intRow + 1]) + + if tenUp.shape[2] != tenColumn[intRow].shape[2]: tenUp = torch.nn.functional.pad(input=tenUp, pad=[0, 0, 0, -1], mode='constant', value=0.0) + if tenUp.shape[3] != tenColumn[intRow].shape[3]: tenUp = torch.nn.functional.pad(input=tenUp, pad=[0, -1, 0, 0], mode='constant', value=0.0) + + tenColumn[intRow] = tenColumn[intRow] + tenUp + # end + # end + + intColumn = 5 + for intRow in range(len(tenColumn) -1, -1, -1): + tenColumn[intRow] = self._modules[str(intRow) + 'x' + str(intColumn - 1) + ' - ' + str(intRow) + 'x' + str(intColumn)](tenColumn[intRow]) + if intRow != len(tenColumn) - 1: + tenUp = self._modules[str(intRow + 1) + 'x' + str(intColumn) + ' - ' + str(intRow) + 'x' + str(intColumn)](tenColumn[intRow + 1]) + + if tenUp.shape[2] != tenColumn[intRow].shape[2]: tenUp = torch.nn.functional.pad(input=tenUp, pad=[0, 0, 0, -1], mode='constant', value=0.0) + if tenUp.shape[3] != tenColumn[intRow].shape[3]: tenUp = torch.nn.functional.pad(input=tenUp, pad=[0, -1, 0, 0], mode='constant', value=0.0) + + tenColumn[intRow] = tenColumn[intRow] + tenUp + # end + # end + + return self.netOutput(tenColumn[0]) + # end +# end + +########################################################## + +class Network(torch.nn.Module): + def __init__(self): + super().__init__() + + self.netFlow = Flow() + + self.netSynthesis = Synthesis() + + self.load_state_dict({strKey.replace('module', 'net'): tenWeight for strKey, tenWeight in torch.hub.load_state_dict_from_url(url='http://content.sniklaus.com/softsplat/network-' + 
args_strModel + '.pytorch', file_name='softsplat-' + args_strModel).items()}) + # end + + def forward(self, tenOne, tenTwo, fltTimes): + with torch.set_grad_enabled(False): + tenStats = [tenOne, tenTwo] + tenMean = sum([tenIn.mean([1, 2, 3], True) for tenIn in tenStats]) / len(tenStats) + tenStd = (sum([tenIn.std([1, 2, 3], False, True).square() + (tenMean - tenIn.mean([1, 2, 3], True)).square() for tenIn in tenStats]) / len(tenStats)).sqrt() + tenOne = ((tenOne - tenMean) / (tenStd + 0.0000001)).detach() + tenTwo = ((tenTwo - tenMean) / (tenStd + 0.0000001)).detach() + # end + + objFlow = self.netFlow(tenOne, tenTwo) + + tenImages = [self.netSynthesis(tenOne, tenTwo, objFlow['tenForward'], objFlow['tenBackward'], fltTime) for fltTime in fltTimes] + + return [(tenImage * tenStd) + tenMean for tenImage in tenImages] + # end +# end + +netNetwork = None + +########################################################## + +def estimate(tenOne, tenTwo, fltTimes): + global netNetwork + + if netNetwork is None: + netNetwork = Network().cuda().eval() + # end + + assert(tenOne.shape[1] == tenTwo.shape[1]) + assert(tenOne.shape[2] == tenTwo.shape[2]) + + intWidth = tenOne.shape[2] + intHeight = tenOne.shape[1] + + tenPreprocessedOne = tenOne.cuda().view(1, 3, intHeight, intWidth) + tenPreprocessedTwo = tenTwo.cuda().view(1, 3, intHeight, intWidth) + + intPadr = (2 - (intWidth % 2)) % 2 + intPadb = (2 - (intHeight % 2)) % 2 + + tenPreprocessedOne = torch.nn.functional.pad(input=tenPreprocessedOne, pad=[0, intPadr, 0, intPadb], mode='replicate') + tenPreprocessedTwo = torch.nn.functional.pad(input=tenPreprocessedTwo, pad=[0, intPadr, 0, intPadb], mode='replicate') + + return [tenImage[0, :, :intHeight, :intWidth].cpu() for tenImage in netNetwork(tenPreprocessedOne, tenPreprocessedTwo, fltTimes)] +# end + +########################################################## + +if __name__ == '__main__': + if args_strOut.split('.')[-1] in ['bmp', 'jpg', 'jpeg', 'png']: + tenOne = torch.FloatTensor(numpy.ascontiguousarray(numpy.array(PIL.Image.open(args_strOne))[:, :, ::-1].transpose(2, 0, 1).astype(numpy.float32) * (1.0 / 255.0))) + tenTwo = torch.FloatTensor(numpy.ascontiguousarray(numpy.array(PIL.Image.open(args_strTwo))[:, :, ::-1].transpose(2, 0, 1).astype(numpy.float32) * (1.0 / 255.0))) + + tenOutput = estimate(tenOne, tenTwo, [0.5])[0] + + PIL.Image.fromarray((tenOutput.clip(0.0, 1.0).numpy().transpose(1, 2, 0)[:, :, ::-1] * 255.0).astype(numpy.uint8)).save(args_strOut) + + elif args_strOut.split('.')[-1] in ['avi', 'mp4', 'webm', 'wmv']: + import moviepy + import moviepy.editor + import moviepy.video.io.ffmpeg_writer + + objVideoreader = moviepy.editor.VideoFileClip(filename=args_strVideo) + + intWidth = objVideoreader.w + intHeight = objVideoreader.h + + tenFrames = [None, None, None, None, None] + + with moviepy.video.io.ffmpeg_writer.FFMPEG_VideoWriter(filename=args_strOut, size=(intWidth, intHeight), fps=objVideoreader.fps) as objVideowriter: + for npyFrame in objVideoreader.iter_frames(): + tenFrames[4] = torch.FloatTensor(numpy.ascontiguousarray(npyFrame[:, :, ::-1].transpose(2, 0, 1).astype(numpy.float32) * (1.0 / 255.0))) + + if tenFrames[0] is not None: + tenFrames[1:4] = estimate(tenFrames[0], tenFrames[4], [0.25, 0.5, 0.75]) + + objVideowriter.write_frame((tenFrames[0].clip(0.0, 1.0).numpy().transpose(1, 2, 0)[:, :, ::-1] * 255.0).astype(numpy.uint8)) + objVideowriter.write_frame((tenFrames[1].clip(0.0, 1.0).numpy().transpose(1, 2, 0)[:, :, ::-1] * 255.0).astype(numpy.uint8)) + 
objVideowriter.write_frame((tenFrames[2].clip(0.0, 1.0).numpy().transpose(1, 2, 0)[:, :, ::-1] * 255.0).astype(numpy.uint8)) + objVideowriter.write_frame((tenFrames[3].clip(0.0, 1.0).numpy().transpose(1, 2, 0)[:, :, ::-1] * 255.0).astype(numpy.uint8)) + # end + + tenFrames[0] = torch.FloatTensor(numpy.ascontiguousarray(npyFrame[:, :, ::-1].transpose(2, 0, 1).astype(numpy.float32) * (1.0 / 255.0))) + # end + # end + + # end +# end \ No newline at end of file diff --git a/depth_anything_v2/softmax-splatting/softsplat.py b/depth_anything_v2/softmax-splatting/softsplat.py new file mode 100644 index 0000000000000000000000000000000000000000..fe146b2232197dda63c92a9c4b256d4833774d58 --- /dev/null +++ b/depth_anything_v2/softmax-splatting/softsplat.py @@ -0,0 +1,529 @@ +#!/usr/bin/env python + +import collections +import cupy +import os +import re +import torch +import typing + + +########################################################## + + +objCudacache = {} + + +def cuda_int32(intIn:int): + return cupy.int32(intIn) +# end + + +def cuda_float32(fltIn:float): + return cupy.float32(fltIn) +# end + + +def cuda_kernel(strFunction:str, strKernel:str, objVariables:typing.Dict): + if 'device' not in objCudacache: + objCudacache['device'] = torch.cuda.get_device_name() + # end + + strKey = strFunction + + for strVariable in objVariables: + objValue = objVariables[strVariable] + + strKey += strVariable + + if objValue is None: + continue + + elif type(objValue) == int: + strKey += str(objValue) + + elif type(objValue) == float: + strKey += str(objValue) + + elif type(objValue) == bool: + strKey += str(objValue) + + elif type(objValue) == str: + strKey += objValue + + elif type(objValue) == torch.Tensor: + strKey += str(objValue.dtype) + strKey += str(objValue.shape) + strKey += str(objValue.stride()) + + elif True: + print(strVariable, type(objValue)) + assert(False) + + # end + # end + + strKey += objCudacache['device'] + + if strKey not in objCudacache: + for strVariable in objVariables: + objValue = objVariables[strVariable] + + if objValue is None: + continue + + elif type(objValue) == int: + strKernel = strKernel.replace('{{' + strVariable + '}}', str(objValue)) + + elif type(objValue) == float: + strKernel = strKernel.replace('{{' + strVariable + '}}', str(objValue)) + + elif type(objValue) == bool: + strKernel = strKernel.replace('{{' + strVariable + '}}', str(objValue)) + + elif type(objValue) == str: + strKernel = strKernel.replace('{{' + strVariable + '}}', objValue) + + elif type(objValue) == torch.Tensor and objValue.dtype == torch.uint8: + strKernel = strKernel.replace('{{type}}', 'unsigned char') + + elif type(objValue) == torch.Tensor and objValue.dtype == torch.float16: + strKernel = strKernel.replace('{{type}}', 'half') + + elif type(objValue) == torch.Tensor and objValue.dtype == torch.float32: + strKernel = strKernel.replace('{{type}}', 'float') + + elif type(objValue) == torch.Tensor and objValue.dtype == torch.float64: + strKernel = strKernel.replace('{{type}}', 'double') + + elif type(objValue) == torch.Tensor and objValue.dtype == torch.int32: + strKernel = strKernel.replace('{{type}}', 'int') + + elif type(objValue) == torch.Tensor and objValue.dtype == torch.int64: + strKernel = strKernel.replace('{{type}}', 'long') + + elif type(objValue) == torch.Tensor: + print(strVariable, objValue.dtype) + assert(False) + + elif True: + print(strVariable, type(objValue)) + assert(False) + + # end + # end + + while True: + objMatch = re.search('(SIZE_)([0-4])(\()([^\)]*)(\))', strKernel) + + 
if objMatch is None: + break + # end + + intArg = int(objMatch.group(2)) + + strTensor = objMatch.group(4) + intSizes = objVariables[strTensor].size() + + strKernel = strKernel.replace(objMatch.group(), str(intSizes[intArg] if torch.is_tensor(intSizes[intArg]) == False else intSizes[intArg].item())) + # end + + while True: + objMatch = re.search('(OFFSET_)([0-4])(\()', strKernel) + + if objMatch is None: + break + # end + + intStart = objMatch.span()[1] + intStop = objMatch.span()[1] + intParentheses = 1 + + while True: + intParentheses += 1 if strKernel[intStop] == '(' else 0 + intParentheses -= 1 if strKernel[intStop] == ')' else 0 + + if intParentheses == 0: + break + # end + + intStop += 1 + # end + + intArgs = int(objMatch.group(2)) + strArgs = strKernel[intStart:intStop].split(',') + + assert(intArgs == len(strArgs) - 1) + + strTensor = strArgs[0] + intStrides = objVariables[strTensor].stride() + + strIndex = [] + + for intArg in range(intArgs): + strIndex.append('((' + strArgs[intArg + 1].replace('{', '(').replace('}', ')').strip() + ')*' + str(intStrides[intArg] if torch.is_tensor(intStrides[intArg]) == False else intStrides[intArg].item()) + ')') + # end + + strKernel = strKernel.replace('OFFSET_' + str(intArgs) + '(' + strKernel[intStart:intStop] + ')', '(' + str.join('+', strIndex) + ')') + # end + + while True: + objMatch = re.search('(VALUE_)([0-4])(\()', strKernel) + + if objMatch is None: + break + # end + + intStart = objMatch.span()[1] + intStop = objMatch.span()[1] + intParentheses = 1 + + while True: + intParentheses += 1 if strKernel[intStop] == '(' else 0 + intParentheses -= 1 if strKernel[intStop] == ')' else 0 + + if intParentheses == 0: + break + # end + + intStop += 1 + # end + + intArgs = int(objMatch.group(2)) + strArgs = strKernel[intStart:intStop].split(',') + + assert(intArgs == len(strArgs) - 1) + + strTensor = strArgs[0] + intStrides = objVariables[strTensor].stride() + + strIndex = [] + + for intArg in range(intArgs): + strIndex.append('((' + strArgs[intArg + 1].replace('{', '(').replace('}', ')').strip() + ')*' + str(intStrides[intArg] if torch.is_tensor(intStrides[intArg]) == False else intStrides[intArg].item()) + ')') + # end + + strKernel = strKernel.replace('VALUE_' + str(intArgs) + '(' + strKernel[intStart:intStop] + ')', strTensor + '[' + str.join('+', strIndex) + ']') + # end + + objCudacache[strKey] = { + 'strFunction': strFunction, + 'strKernel': strKernel + } + # end + + return strKey +# end + + +@cupy.memoize(for_each_device=True) +def cuda_launch(strKey:str): + if 'CUDA_HOME' not in os.environ: + os.environ['CUDA_HOME'] = cupy.cuda.get_cuda_path() + # end + + return cupy.RawKernel(objCudacache[strKey]['strKernel'], objCudacache[strKey]['strFunction'], tuple(['-I ' + os.environ['CUDA_HOME'], '-I ' + os.environ['CUDA_HOME'] + '/include'])) +# end + + +########################################################## + + +def softsplat(tenIn:torch.Tensor, tenFlow:torch.Tensor, tenMetric:torch.Tensor, strMode:str): + assert(strMode.split('-')[0] in ['sum', 'avg', 'linear', 'soft']) + + if strMode == 'sum': assert(tenMetric is None) + if strMode == 'avg': assert(tenMetric is None) + if strMode.split('-')[0] == 'linear': assert(tenMetric is not None) + if strMode.split('-')[0] == 'soft': assert(tenMetric is not None) + + if strMode == 'avg': + tenIn = torch.cat([tenIn, tenIn.new_ones([tenIn.shape[0], 1, tenIn.shape[2], tenIn.shape[3]])], 1) + + elif strMode.split('-')[0] == 'linear': + tenIn = torch.cat([tenIn * tenMetric, tenMetric], 1) + + elif 
strMode.split('-')[0] == 'soft': + tenIn = torch.cat([tenIn * tenMetric.exp(), tenMetric.exp()], 1) + + # end + + tenOut = softsplat_func.apply(tenIn, tenFlow) + + if strMode.split('-')[0] in ['avg', 'linear', 'soft']: + tenNormalize = tenOut[:, -1:, :, :] + + if len(strMode.split('-')) == 1: + tenNormalize = tenNormalize + 0.0000001 + + elif strMode.split('-')[1] == 'addeps': + tenNormalize = tenNormalize + 0.0000001 + + elif strMode.split('-')[1] == 'zeroeps': + tenNormalize[tenNormalize == 0.0] = 1.0 + + elif strMode.split('-')[1] == 'clipeps': + tenNormalize = tenNormalize.clip(0.0000001, None) + + # end + + tenOut = tenOut[:, :-1, :, :] / tenNormalize + # end + + return tenOut +# end + + +class softsplat_func(torch.autograd.Function): + @staticmethod + @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32) + def forward(self, tenIn, tenFlow): + tenOut = tenIn.new_zeros([tenIn.shape[0], tenIn.shape[1], tenIn.shape[2], tenIn.shape[3]]) + + if tenIn.is_cuda == True: + cuda_launch(cuda_kernel('softsplat_out', ''' + extern "C" __global__ void __launch_bounds__(512) softsplat_out( + const int n, + const {{type}}* __restrict__ tenIn, + const {{type}}* __restrict__ tenFlow, + {{type}}* __restrict__ tenOut + ) { for (int intIndex = (blockIdx.x * blockDim.x) + threadIdx.x; intIndex < n; intIndex += blockDim.x * gridDim.x) { + const int intN = ( intIndex / SIZE_3(tenOut) / SIZE_2(tenOut) / SIZE_1(tenOut) ) % SIZE_0(tenOut); + const int intC = ( intIndex / SIZE_3(tenOut) / SIZE_2(tenOut) ) % SIZE_1(tenOut); + const int intY = ( intIndex / SIZE_3(tenOut) ) % SIZE_2(tenOut); + const int intX = ( intIndex ) % SIZE_3(tenOut); + + assert(SIZE_1(tenFlow) == 2); + + {{type}} fltX = ({{type}}) (intX) + VALUE_4(tenFlow, intN, 0, intY, intX); + {{type}} fltY = ({{type}}) (intY) + VALUE_4(tenFlow, intN, 1, intY, intX); + + if (isfinite(fltX) == false) { return; } + if (isfinite(fltY) == false) { return; } + + {{type}} fltIn = VALUE_4(tenIn, intN, intC, intY, intX); + + int intNorthwestX = (int) (floor(fltX)); + int intNorthwestY = (int) (floor(fltY)); + int intNortheastX = intNorthwestX + 1; + int intNortheastY = intNorthwestY; + int intSouthwestX = intNorthwestX; + int intSouthwestY = intNorthwestY + 1; + int intSoutheastX = intNorthwestX + 1; + int intSoutheastY = intNorthwestY + 1; + + {{type}} fltNorthwest = (({{type}}) (intSoutheastX) - fltX) * (({{type}}) (intSoutheastY) - fltY); + {{type}} fltNortheast = (fltX - ({{type}}) (intSouthwestX)) * (({{type}}) (intSouthwestY) - fltY); + {{type}} fltSouthwest = (({{type}}) (intNortheastX) - fltX) * (fltY - ({{type}}) (intNortheastY)); + {{type}} fltSoutheast = (fltX - ({{type}}) (intNorthwestX)) * (fltY - ({{type}}) (intNorthwestY)); + + if ((intNorthwestX >= 0) && (intNorthwestX < SIZE_3(tenOut)) && (intNorthwestY >= 0) && (intNorthwestY < SIZE_2(tenOut))) { + atomicAdd(&tenOut[OFFSET_4(tenOut, intN, intC, intNorthwestY, intNorthwestX)], fltIn * fltNorthwest); + } + + if ((intNortheastX >= 0) && (intNortheastX < SIZE_3(tenOut)) && (intNortheastY >= 0) && (intNortheastY < SIZE_2(tenOut))) { + atomicAdd(&tenOut[OFFSET_4(tenOut, intN, intC, intNortheastY, intNortheastX)], fltIn * fltNortheast); + } + + if ((intSouthwestX >= 0) && (intSouthwestX < SIZE_3(tenOut)) && (intSouthwestY >= 0) && (intSouthwestY < SIZE_2(tenOut))) { + atomicAdd(&tenOut[OFFSET_4(tenOut, intN, intC, intSouthwestY, intSouthwestX)], fltIn * fltSouthwest); + } + + if ((intSoutheastX >= 0) && (intSoutheastX < SIZE_3(tenOut)) && (intSoutheastY >= 0) && (intSoutheastY < SIZE_2(tenOut))) { + 
atomicAdd(&tenOut[OFFSET_4(tenOut, intN, intC, intSoutheastY, intSoutheastX)], fltIn * fltSoutheast); + } + } } + ''', { + 'tenIn': tenIn, + 'tenFlow': tenFlow, + 'tenOut': tenOut + }))( + grid=tuple([int((tenOut.nelement() + 512 - 1) / 512), 1, 1]), + block=tuple([512, 1, 1]), + args=[cuda_int32(tenOut.nelement()), tenIn.data_ptr(), tenFlow.data_ptr(), tenOut.data_ptr()], + stream=collections.namedtuple('Stream', 'ptr')(torch.cuda.current_stream().cuda_stream) + ) + + elif tenIn.is_cuda != True: + assert(False) + + # end + + self.save_for_backward(tenIn, tenFlow) + + return tenOut + # end + + @staticmethod + @torch.cuda.amp.custom_bwd + def backward(self, tenOutgrad): + tenIn, tenFlow = self.saved_tensors + + tenOutgrad = tenOutgrad.contiguous(); assert(tenOutgrad.is_cuda == True) + + tenIngrad = tenIn.new_zeros([tenIn.shape[0], tenIn.shape[1], tenIn.shape[2], tenIn.shape[3]]) if self.needs_input_grad[0] == True else None + tenFlowgrad = tenFlow.new_zeros([tenFlow.shape[0], tenFlow.shape[1], tenFlow.shape[2], tenFlow.shape[3]]) if self.needs_input_grad[1] == True else None + + if tenIngrad is not None: + cuda_launch(cuda_kernel('softsplat_ingrad', ''' + extern "C" __global__ void __launch_bounds__(512) softsplat_ingrad( + const int n, + const {{type}}* __restrict__ tenIn, + const {{type}}* __restrict__ tenFlow, + const {{type}}* __restrict__ tenOutgrad, + {{type}}* __restrict__ tenIngrad, + {{type}}* __restrict__ tenFlowgrad + ) { for (int intIndex = (blockIdx.x * blockDim.x) + threadIdx.x; intIndex < n; intIndex += blockDim.x * gridDim.x) { + const int intN = ( intIndex / SIZE_3(tenIngrad) / SIZE_2(tenIngrad) / SIZE_1(tenIngrad) ) % SIZE_0(tenIngrad); + const int intC = ( intIndex / SIZE_3(tenIngrad) / SIZE_2(tenIngrad) ) % SIZE_1(tenIngrad); + const int intY = ( intIndex / SIZE_3(tenIngrad) ) % SIZE_2(tenIngrad); + const int intX = ( intIndex ) % SIZE_3(tenIngrad); + + assert(SIZE_1(tenFlow) == 2); + + {{type}} fltIngrad = 0.0f; + + {{type}} fltX = ({{type}}) (intX) + VALUE_4(tenFlow, intN, 0, intY, intX); + {{type}} fltY = ({{type}}) (intY) + VALUE_4(tenFlow, intN, 1, intY, intX); + + if (isfinite(fltX) == false) { return; } + if (isfinite(fltY) == false) { return; } + + int intNorthwestX = (int) (floor(fltX)); + int intNorthwestY = (int) (floor(fltY)); + int intNortheastX = intNorthwestX + 1; + int intNortheastY = intNorthwestY; + int intSouthwestX = intNorthwestX; + int intSouthwestY = intNorthwestY + 1; + int intSoutheastX = intNorthwestX + 1; + int intSoutheastY = intNorthwestY + 1; + + {{type}} fltNorthwest = (({{type}}) (intSoutheastX) - fltX) * (({{type}}) (intSoutheastY) - fltY); + {{type}} fltNortheast = (fltX - ({{type}}) (intSouthwestX)) * (({{type}}) (intSouthwestY) - fltY); + {{type}} fltSouthwest = (({{type}}) (intNortheastX) - fltX) * (fltY - ({{type}}) (intNortheastY)); + {{type}} fltSoutheast = (fltX - ({{type}}) (intNorthwestX)) * (fltY - ({{type}}) (intNorthwestY)); + + if ((intNorthwestX >= 0) && (intNorthwestX < SIZE_3(tenOutgrad)) && (intNorthwestY >= 0) && (intNorthwestY < SIZE_2(tenOutgrad))) { + fltIngrad += VALUE_4(tenOutgrad, intN, intC, intNorthwestY, intNorthwestX) * fltNorthwest; + } + + if ((intNortheastX >= 0) && (intNortheastX < SIZE_3(tenOutgrad)) && (intNortheastY >= 0) && (intNortheastY < SIZE_2(tenOutgrad))) { + fltIngrad += VALUE_4(tenOutgrad, intN, intC, intNortheastY, intNortheastX) * fltNortheast; + } + + if ((intSouthwestX >= 0) && (intSouthwestX < SIZE_3(tenOutgrad)) && (intSouthwestY >= 0) && (intSouthwestY < SIZE_2(tenOutgrad))) { + 
fltIngrad += VALUE_4(tenOutgrad, intN, intC, intSouthwestY, intSouthwestX) * fltSouthwest; + } + + if ((intSoutheastX >= 0) && (intSoutheastX < SIZE_3(tenOutgrad)) && (intSoutheastY >= 0) && (intSoutheastY < SIZE_2(tenOutgrad))) { + fltIngrad += VALUE_4(tenOutgrad, intN, intC, intSoutheastY, intSoutheastX) * fltSoutheast; + } + + tenIngrad[intIndex] = fltIngrad; + } } + ''', { + 'tenIn': tenIn, + 'tenFlow': tenFlow, + 'tenOutgrad': tenOutgrad, + 'tenIngrad': tenIngrad, + 'tenFlowgrad': tenFlowgrad + }))( + grid=tuple([int((tenIngrad.nelement() + 512 - 1) / 512), 1, 1]), + block=tuple([512, 1, 1]), + args=[cuda_int32(tenIngrad.nelement()), tenIn.data_ptr(), tenFlow.data_ptr(), tenOutgrad.data_ptr(), tenIngrad.data_ptr(), None], + stream=collections.namedtuple('Stream', 'ptr')(torch.cuda.current_stream().cuda_stream) + ) + # end + + if tenFlowgrad is not None: + cuda_launch(cuda_kernel('softsplat_flowgrad', ''' + extern "C" __global__ void __launch_bounds__(512) softsplat_flowgrad( + const int n, + const {{type}}* __restrict__ tenIn, + const {{type}}* __restrict__ tenFlow, + const {{type}}* __restrict__ tenOutgrad, + {{type}}* __restrict__ tenIngrad, + {{type}}* __restrict__ tenFlowgrad + ) { for (int intIndex = (blockIdx.x * blockDim.x) + threadIdx.x; intIndex < n; intIndex += blockDim.x * gridDim.x) { + const int intN = ( intIndex / SIZE_3(tenFlowgrad) / SIZE_2(tenFlowgrad) / SIZE_1(tenFlowgrad) ) % SIZE_0(tenFlowgrad); + const int intC = ( intIndex / SIZE_3(tenFlowgrad) / SIZE_2(tenFlowgrad) ) % SIZE_1(tenFlowgrad); + const int intY = ( intIndex / SIZE_3(tenFlowgrad) ) % SIZE_2(tenFlowgrad); + const int intX = ( intIndex ) % SIZE_3(tenFlowgrad); + + assert(SIZE_1(tenFlow) == 2); + + {{type}} fltFlowgrad = 0.0f; + + {{type}} fltX = ({{type}}) (intX) + VALUE_4(tenFlow, intN, 0, intY, intX); + {{type}} fltY = ({{type}}) (intY) + VALUE_4(tenFlow, intN, 1, intY, intX); + + if (isfinite(fltX) == false) { return; } + if (isfinite(fltY) == false) { return; } + + int intNorthwestX = (int) (floor(fltX)); + int intNorthwestY = (int) (floor(fltY)); + int intNortheastX = intNorthwestX + 1; + int intNortheastY = intNorthwestY; + int intSouthwestX = intNorthwestX; + int intSouthwestY = intNorthwestY + 1; + int intSoutheastX = intNorthwestX + 1; + int intSoutheastY = intNorthwestY + 1; + + {{type}} fltNorthwest = 0.0f; + {{type}} fltNortheast = 0.0f; + {{type}} fltSouthwest = 0.0f; + {{type}} fltSoutheast = 0.0f; + + if (intC == 0) { + fltNorthwest = (({{type}}) (-1.0f)) * (({{type}}) (intSoutheastY) - fltY); + fltNortheast = (({{type}}) (+1.0f)) * (({{type}}) (intSouthwestY) - fltY); + fltSouthwest = (({{type}}) (-1.0f)) * (fltY - ({{type}}) (intNortheastY)); + fltSoutheast = (({{type}}) (+1.0f)) * (fltY - ({{type}}) (intNorthwestY)); + + } else if (intC == 1) { + fltNorthwest = (({{type}}) (intSoutheastX) - fltX) * (({{type}}) (-1.0f)); + fltNortheast = (fltX - ({{type}}) (intSouthwestX)) * (({{type}}) (-1.0f)); + fltSouthwest = (({{type}}) (intNortheastX) - fltX) * (({{type}}) (+1.0f)); + fltSoutheast = (fltX - ({{type}}) (intNorthwestX)) * (({{type}}) (+1.0f)); + + } + + for (int intChannel = 0; intChannel < SIZE_1(tenOutgrad); intChannel += 1) { + {{type}} fltIn = VALUE_4(tenIn, intN, intChannel, intY, intX); + + if ((intNorthwestX >= 0) && (intNorthwestX < SIZE_3(tenOutgrad)) && (intNorthwestY >= 0) && (intNorthwestY < SIZE_2(tenOutgrad))) { + fltFlowgrad += VALUE_4(tenOutgrad, intN, intChannel, intNorthwestY, intNorthwestX) * fltIn * fltNorthwest; + } + + if ((intNortheastX >= 0) && (intNortheastX 
< SIZE_3(tenOutgrad)) && (intNortheastY >= 0) && (intNortheastY < SIZE_2(tenOutgrad))) { + fltFlowgrad += VALUE_4(tenOutgrad, intN, intChannel, intNortheastY, intNortheastX) * fltIn * fltNortheast; + } + + if ((intSouthwestX >= 0) && (intSouthwestX < SIZE_3(tenOutgrad)) && (intSouthwestY >= 0) && (intSouthwestY < SIZE_2(tenOutgrad))) { + fltFlowgrad += VALUE_4(tenOutgrad, intN, intChannel, intSouthwestY, intSouthwestX) * fltIn * fltSouthwest; + } + + if ((intSoutheastX >= 0) && (intSoutheastX < SIZE_3(tenOutgrad)) && (intSoutheastY >= 0) && (intSoutheastY < SIZE_2(tenOutgrad))) { + fltFlowgrad += VALUE_4(tenOutgrad, intN, intChannel, intSoutheastY, intSoutheastX) * fltIn * fltSoutheast; + } + } + + tenFlowgrad[intIndex] = fltFlowgrad; + } } + ''', { + 'tenIn': tenIn, + 'tenFlow': tenFlow, + 'tenOutgrad': tenOutgrad, + 'tenIngrad': tenIngrad, + 'tenFlowgrad': tenFlowgrad + }))( + grid=tuple([int((tenFlowgrad.nelement() + 512 - 1) / 512), 1, 1]), + block=tuple([512, 1, 1]), + args=[cuda_int32(tenFlowgrad.nelement()), tenIn.data_ptr(), tenFlow.data_ptr(), tenOutgrad.data_ptr(), None, tenFlowgrad.data_ptr()], + stream=collections.namedtuple('Stream', 'ptr')(torch.cuda.current_stream().cuda_stream) + ) + # end + + return tenIngrad, tenFlowgrad + # end +# end diff --git a/depth_anything_v2/softmax-splatting/videos/README.md b/depth_anything_v2/softmax-splatting/videos/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7f6d609da095387ad2d338da5ae1e39d6bab11f5 --- /dev/null +++ b/depth_anything_v2/softmax-splatting/videos/README.md @@ -0,0 +1 @@ +The used example originates from the DAVIS challenge: https://davischallenge.org/ \ No newline at end of file diff --git a/depth_anything_v2/softmax-splatting/videos/car-turn.mp4 b/depth_anything_v2/softmax-splatting/videos/car-turn.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..151c5848ded82c5e1416b701538822ed6c6e466a Binary files /dev/null and b/depth_anything_v2/softmax-splatting/videos/car-turn.mp4 differ diff --git a/depth_anything_v2/util/__pycache__/blocks.cpython-310.pyc b/depth_anything_v2/util/__pycache__/blocks.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d519a0ce470979f4416691486fa6b85b6d0af32 Binary files /dev/null and b/depth_anything_v2/util/__pycache__/blocks.cpython-310.pyc differ diff --git a/depth_anything_v2/util/__pycache__/transform.cpython-310.pyc b/depth_anything_v2/util/__pycache__/transform.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b7cf772329cea699085a64c9eab5e5b7c391229c Binary files /dev/null and b/depth_anything_v2/util/__pycache__/transform.cpython-310.pyc differ diff --git a/depth_anything_v2/util/blocks.py b/depth_anything_v2/util/blocks.py new file mode 100644 index 0000000000000000000000000000000000000000..382ea183a40264056142afffc201c992a2b01d37 --- /dev/null +++ b/depth_anything_v2/util/blocks.py @@ -0,0 +1,148 @@ +import torch.nn as nn + + +def _make_scratch(in_shape, out_shape, groups=1, expand=False): + scratch = nn.Module() + + out_shape1 = out_shape + out_shape2 = out_shape + out_shape3 = out_shape + if len(in_shape) >= 4: + out_shape4 = out_shape + + if expand: + out_shape1 = out_shape + out_shape2 = out_shape * 2 + out_shape3 = out_shape * 4 + if len(in_shape) >= 4: + out_shape4 = out_shape * 8 + + scratch.layer1_rn = nn.Conv2d(in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups) + scratch.layer2_rn = nn.Conv2d(in_shape[1], out_shape2, 
kernel_size=3, stride=1, padding=1, bias=False, groups=groups) + scratch.layer3_rn = nn.Conv2d(in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups) + if len(in_shape) >= 4: + scratch.layer4_rn = nn.Conv2d(in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups) + + return scratch + + +class ResidualConvUnit(nn.Module): + """Residual convolution module. + """ + + def __init__(self, features, activation, bn): + """Init. + + Args: + features (int): number of features + """ + super().__init__() + + self.bn = bn + + self.groups=1 + + self.conv1 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups) + + self.conv2 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups) + + if self.bn == True: + self.bn1 = nn.BatchNorm2d(features) + self.bn2 = nn.BatchNorm2d(features) + + self.activation = activation + + self.skip_add = nn.quantized.FloatFunctional() + + def forward(self, x): + """Forward pass. + + Args: + x (tensor): input + + Returns: + tensor: output + """ + + out = self.activation(x) + out = self.conv1(out) + if self.bn == True: + out = self.bn1(out) + + out = self.activation(out) + out = self.conv2(out) + if self.bn == True: + out = self.bn2(out) + + if self.groups > 1: + out = self.conv_merge(out) + + return self.skip_add.add(out, x) + + +class FeatureFusionBlock(nn.Module): + """Feature fusion block. + """ + + def __init__( + self, + features, + activation, + deconv=False, + bn=False, + expand=False, + align_corners=True, + size=None + ): + """Init. + + Args: + features (int): number of features + """ + super(FeatureFusionBlock, self).__init__() + + self.deconv = deconv + self.align_corners = align_corners + + self.groups=1 + + self.expand = expand + out_features = features + if self.expand == True: + out_features = features // 2 + + self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1) + + self.resConfUnit1 = ResidualConvUnit(features, activation, bn) + self.resConfUnit2 = ResidualConvUnit(features, activation, bn) + + self.skip_add = nn.quantized.FloatFunctional() + + self.size=size + + def forward(self, *xs, size=None): + """Forward pass. + + Returns: + tensor: output + """ + output = xs[0] + + if len(xs) == 2: + res = self.resConfUnit1(xs[1]) + output = self.skip_add.add(output, res) + + output = self.resConfUnit2(output) + + if (size is None) and (self.size is None): + modifier = {"scale_factor": 2} + elif size is None: + modifier = {"size": self.size} + else: + modifier = {"size": size} + + output = nn.functional.interpolate(output, **modifier, mode="bilinear", align_corners=self.align_corners) + + output = self.out_conv(output) + + return output diff --git a/depth_anything_v2/util/transform.py b/depth_anything_v2/util/transform.py new file mode 100644 index 0000000000000000000000000000000000000000..b14aacd44ea086b01725a9ca68bb49eadcf37d73 --- /dev/null +++ b/depth_anything_v2/util/transform.py @@ -0,0 +1,158 @@ +import numpy as np +import cv2 + + +class Resize(object): + """Resize sample to given size (width, height). + """ + + def __init__( + self, + width, + height, + resize_target=True, + keep_aspect_ratio=False, + ensure_multiple_of=1, + resize_method="lower_bound", + image_interpolation_method=cv2.INTER_AREA, + ): + """Init. 
+ + Args: + width (int): desired output width + height (int): desired output height + resize_target (bool, optional): + True: Resize the full sample (image, mask, target). + False: Resize image only. + Defaults to True. + keep_aspect_ratio (bool, optional): + True: Keep the aspect ratio of the input sample. + Output sample might not have the given width and height, and + resize behaviour depends on the parameter 'resize_method'. + Defaults to False. + ensure_multiple_of (int, optional): + Output width and height are constrained to be a multiple of this parameter. + Defaults to 1. + resize_method (str, optional): + "lower_bound": Output will be at least as large as the given size. + "upper_bound": Output will be at most as large as the given size. (Output size might be smaller than given size.) + "minimal": Scale as little as possible. (Output size might be smaller than given size.) + Defaults to "lower_bound". + """ + self.__width = width + self.__height = height + + self.__resize_target = resize_target + self.__keep_aspect_ratio = keep_aspect_ratio + self.__multiple_of = ensure_multiple_of + self.__resize_method = resize_method + self.__image_interpolation_method = image_interpolation_method + + def constrain_to_multiple_of(self, x, min_val=0, max_val=None): + y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int) + + if max_val is not None and y > max_val: + y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int) + + if y < min_val: + y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int) + + return y + + def get_size(self, width, height): + # determine new height and width + scale_height = self.__height / height + scale_width = self.__width / width + + if self.__keep_aspect_ratio: + if self.__resize_method == "lower_bound": + # scale such that output size is lower bound + if scale_width > scale_height: + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + elif self.__resize_method == "upper_bound": + # scale such that output size is upper bound + if scale_width < scale_height: + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + elif self.__resize_method == "minimal": + # scale as little as possible + if abs(1 - scale_width) < abs(1 - scale_height): + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + else: + raise ValueError(f"resize_method {self.__resize_method} not implemented") + + if self.__resize_method == "lower_bound": + new_height = self.constrain_to_multiple_of(scale_height * height, min_val=self.__height) + new_width = self.constrain_to_multiple_of(scale_width * width, min_val=self.__width) + elif self.__resize_method == "upper_bound": + new_height = self.constrain_to_multiple_of(scale_height * height, max_val=self.__height) + new_width = self.constrain_to_multiple_of(scale_width * width, max_val=self.__width) + elif self.__resize_method == "minimal": + new_height = self.constrain_to_multiple_of(scale_height * height) + new_width = self.constrain_to_multiple_of(scale_width * width) + else: + raise ValueError(f"resize_method {self.__resize_method} not implemented") + + return (new_width, new_height) + + def __call__(self, sample): + width, height = self.get_size(sample["image"].shape[1], sample["image"].shape[0]) + + # resize sample + sample["image"] = cv2.resize(sample["image"], (width, height), interpolation=self.__image_interpolation_method) + + if self.__resize_target: + if "depth" in sample: + 
sample["depth"] = cv2.resize(sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST) + + if "mask" in sample: + sample["mask"] = cv2.resize(sample["mask"].astype(np.float32), (width, height), interpolation=cv2.INTER_NEAREST) + + return sample + + +class NormalizeImage(object): + """Normalize image by given mean and std. + """ + + def __init__(self, mean, std): + self.__mean = mean + self.__std = std + + def __call__(self, sample): + sample["image"] = (sample["image"] - self.__mean) / self.__std + + return sample + + +class PrepareForNet(object): + """Prepare sample for usage as network input. + """ + + def __init__(self): + pass + + def __call__(self, sample): + image = np.transpose(sample["image"], (2, 0, 1)) + sample["image"] = np.ascontiguousarray(image).astype(np.float32) + + if "depth" in sample: + depth = sample["depth"].astype(np.float32) + sample["depth"] = np.ascontiguousarray(depth) + + if "mask" in sample: + sample["mask"] = sample["mask"].astype(np.float32) + sample["mask"] = np.ascontiguousarray(sample["mask"]) + + return sample \ No newline at end of file
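For reference, a minimal sketch of how the Resize, NormalizeImage, and PrepareForNet transforms added above might be composed to preprocess a single image before running the depth model. The preprocess helper name, the 518 input size, the multiple-of-14 constraint, the cubic interpolation, and the ImageNet mean/std are illustrative assumptions, not values taken from this diff; it also assumes depth_anything_v2.util is importable as a package.

    # Illustrative preprocessing sketch; sizes, interpolation, and normalization
    # statistics below are assumptions for demonstration, not part of this diff.
    import cv2
    import torch

    from depth_anything_v2.util.transform import Resize, NormalizeImage, PrepareForNet

    def preprocess(image_path, input_size=518):
        # Load as BGR uint8, convert to RGB float in [0, 1].
        image = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB) / 255.0

        sample = {"image": image}
        for transform in (
            Resize(
                width=input_size,
                height=input_size,
                resize_target=False,
                keep_aspect_ratio=True,
                ensure_multiple_of=14,  # assumed ViT patch size
                resize_method="lower_bound",
                image_interpolation_method=cv2.INTER_CUBIC,
            ),
            NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            PrepareForNet(),
        ):
            sample = transform(sample)

        # PrepareForNet leaves a 3 x H' x W' float32 array; add a batch dimension.
        return torch.from_numpy(sample["image"]).unsqueeze(0)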