lixiang46 committed on
Commit
66fd925
1 Parent(s): 7132521

init faceid

This view is limited to 50 files because the commit contains too many changes.

Files changed (50)
  1. .gitattributes +2 -0
  2. README.md +3 -3
  3. annotator/canny/__init__.py +0 -6
  4. annotator/midas/LICENSE +0 -21
  5. annotator/midas/__init__.py +0 -35
  6. annotator/midas/api.py +0 -169
  7. annotator/midas/midas/__init__.py +0 -0
  8. annotator/midas/midas/base_model.py +0 -16
  9. annotator/midas/midas/blocks.py +0 -342
  10. annotator/midas/midas/dpt_depth.py +0 -109
  11. annotator/midas/midas/midas_net.py +0 -76
  12. annotator/midas/midas/midas_net_custom.py +0 -128
  13. annotator/midas/midas/transforms.py +0 -234
  14. annotator/midas/midas/vit.py +0 -491
  15. annotator/midas/utils.py +0 -189
  16. annotator/util.py +0 -129
  17. app.py +84 -155
  18. assets/title.md +3 -3
  19. basicsr/__init__.py +0 -11
  20. basicsr/archs/__init__.py +0 -24
  21. basicsr/archs/arch_util.py +0 -313
  22. basicsr/archs/basicvsr_arch.py +0 -336
  23. basicsr/archs/basicvsrpp_arch.py +0 -417
  24. basicsr/archs/dfdnet_arch.py +0 -169
  25. basicsr/archs/dfdnet_util.py +0 -162
  26. basicsr/archs/discriminator_arch.py +0 -150
  27. basicsr/archs/duf_arch.py +0 -276
  28. basicsr/archs/ecbsr_arch.py +0 -275
  29. basicsr/archs/edsr_arch.py +0 -61
  30. basicsr/archs/edvr_arch.py +0 -382
  31. basicsr/archs/hifacegan_arch.py +0 -260
  32. basicsr/archs/hifacegan_util.py +0 -255
  33. basicsr/archs/inception.py +0 -307
  34. basicsr/archs/rcan_arch.py +0 -135
  35. basicsr/archs/ridnet_arch.py +0 -180
  36. basicsr/archs/rrdbnet_arch.py +0 -119
  37. basicsr/archs/spynet_arch.py +0 -96
  38. basicsr/archs/srresnet_arch.py +0 -65
  39. basicsr/archs/srvgg_arch.py +0 -70
  40. basicsr/archs/stylegan2_arch.py +0 -799
  41. basicsr/archs/stylegan2_bilinear_arch.py +0 -614
  42. basicsr/archs/swinir_arch.py +0 -956
  43. basicsr/archs/tof_arch.py +0 -172
  44. basicsr/archs/vgg_arch.py +0 -161
  45. basicsr/data/__init__.py +0 -101
  46. basicsr/data/data_sampler.py +0 -48
  47. basicsr/data/data_util.py +0 -315
  48. basicsr/data/degradations.py +0 -764
  49. basicsr/data/ffhq_dataset.py +0 -80
  50. basicsr/data/meta_info/meta_info_DIV2K800sub_GT.txt +0 -0
.gitattributes CHANGED
@@ -37,3 +37,5 @@ image/bird.png filter=lfs diff=lfs merge=lfs -text
  image/dog.png filter=lfs diff=lfs merge=lfs -text
  image/woman_1.png filter=lfs diff=lfs merge=lfs -text
  image/woman_2.png filter=lfs diff=lfs merge=lfs -text
+ image/image1.png filter=lfs diff=lfs merge=lfs -text
+ image/image2.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,8 +1,8 @@
  ---
- title: Kolors-Controlnet
- emoji: 🏞️
+ title: Kolors-FaceID
+ emoji: 🥸
  colorFrom: purple
- colorTo: green
+ colorTo: yellow
  sdk: gradio
  sdk_version: 4.38.1
  app_file: app.py
annotator/canny/__init__.py DELETED
@@ -1,6 +0,0 @@
- import cv2
-
-
- class CannyDetector:
-     def __call__(self, img, low_threshold, high_threshold):
-         return cv2.Canny(img, low_threshold, high_threshold)
annotator/midas/LICENSE DELETED
@@ -1,21 +0,0 @@
- MIT License
-
- Copyright (c) 2019 Intel ISL (Intel Intelligent Systems Lab)
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
annotator/midas/__init__.py DELETED
@@ -1,35 +0,0 @@
- # Midas Depth Estimation
- # From https://github.com/isl-org/MiDaS
- # MIT LICENSE
-
- import cv2
- import numpy as np
- import torch
-
- from einops import rearrange
- from .api import MiDaSInference
-
-
- class MidasDetector:
-     def __init__(self):
-         self.model = MiDaSInference(model_type="dpt_hybrid").cuda()
-         self.rng = np.random.RandomState(0)
-
-     def __call__(self, input_image):
-         assert input_image.ndim == 3
-         image_depth = input_image
-         with torch.no_grad():
-             image_depth = torch.from_numpy(image_depth).float().cuda()
-             image_depth = image_depth / 127.5 - 1.0
-             image_depth = rearrange(image_depth, 'h w c -> 1 c h w')
-             depth = self.model(image_depth)[0]
-
-             depth -= torch.min(depth)
-             depth /= torch.max(depth)
-             depth = depth.cpu().numpy()
-             depth_image = (depth * 255.0).clip(0, 255).astype(np.uint8)
-
-             return depth_image
-
-
-
annotator/midas/api.py DELETED
@@ -1,169 +0,0 @@
- # based on https://github.com/isl-org/MiDaS
-
- import cv2
- import os
- import torch
- import torch.nn as nn
- from torchvision.transforms import Compose
-
- from .midas.dpt_depth import DPTDepthModel
- from .midas.midas_net import MidasNet
- from .midas.midas_net_custom import MidasNet_small
- from .midas.transforms import Resize, NormalizeImage, PrepareForNet
- from annotator.util import annotator_ckpts_path
-
-
- ISL_PATHS = {
-     "dpt_large": os.path.join(annotator_ckpts_path, "dpt_large-midas-2f21e586.pt"),
-     "dpt_hybrid": os.path.join(annotator_ckpts_path, "dpt_hybrid-midas-501f0c75.pt"),
-     "midas_v21": "",
-     "midas_v21_small": "",
- }
-
- remote_model_path = "https://huggingface.co/lllyasviel/Annotators/resolve/main/dpt_hybrid-midas-501f0c75.pt"
-
-
- def disabled_train(self, mode=True):
-     """Overwrite model.train with this function to make sure train/eval mode
-     does not change anymore."""
-     return self
-
-
- def load_midas_transform(model_type):
-     # https://github.com/isl-org/MiDaS/blob/master/run.py
-     # load transform only
-     if model_type == "dpt_large":  # DPT-Large
-         net_w, net_h = 384, 384
-         resize_mode = "minimal"
-         normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
-
-     elif model_type == "dpt_hybrid":  # DPT-Hybrid
-         net_w, net_h = 384, 384
-         resize_mode = "minimal"
-         normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
-
-     elif model_type == "midas_v21":
-         net_w, net_h = 384, 384
-         resize_mode = "upper_bound"
-         normalization = NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
-
-     elif model_type == "midas_v21_small":
-         net_w, net_h = 256, 256
-         resize_mode = "upper_bound"
-         normalization = NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
-
-     else:
-         assert False, f"model_type '{model_type}' not implemented, use: --model_type large"
-
-     transform = Compose(
-         [
-             Resize(
-                 net_w,
-                 net_h,
-                 resize_target=None,
-                 keep_aspect_ratio=True,
-                 ensure_multiple_of=32,
-                 resize_method=resize_mode,
-                 image_interpolation_method=cv2.INTER_CUBIC,
-             ),
-             normalization,
-             PrepareForNet(),
-         ]
-     )
-
-     return transform
-
-
- def load_model(model_type):
-     # https://github.com/isl-org/MiDaS/blob/master/run.py
-     # load network
-     model_path = ISL_PATHS[model_type]
-     if model_type == "dpt_large":  # DPT-Large
-         model = DPTDepthModel(
-             path=model_path,
-             backbone="vitl16_384",
-             non_negative=True,
-         )
-         net_w, net_h = 384, 384
-         resize_mode = "minimal"
-         normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
-
-     elif model_type == "dpt_hybrid":  # DPT-Hybrid
-         if not os.path.exists(model_path):
-             from basicsr.utils.download_util import load_file_from_url
-             load_file_from_url(remote_model_path, model_dir=annotator_ckpts_path)
-
-         model = DPTDepthModel(
-             path=model_path,
-             backbone="vitb_rn50_384",
-             non_negative=True,
-         )
-         net_w, net_h = 384, 384
-         resize_mode = "minimal"
-         normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
-
-     elif model_type == "midas_v21":
-         model = MidasNet(model_path, non_negative=True)
-         net_w, net_h = 384, 384
-         resize_mode = "upper_bound"
-         normalization = NormalizeImage(
-             mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
-         )
-
-     elif model_type == "midas_v21_small":
-         model = MidasNet_small(model_path, features=64, backbone="efficientnet_lite3", exportable=True,
-                                non_negative=True, blocks={'expand': True})
-         net_w, net_h = 256, 256
-         resize_mode = "upper_bound"
-         normalization = NormalizeImage(
-             mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
-         )
-
-     else:
-         print(f"model_type '{model_type}' not implemented, use: --model_type large")
-         assert False
-
-     transform = Compose(
-         [
-             Resize(
-                 net_w,
-                 net_h,
-                 resize_target=None,
-                 keep_aspect_ratio=True,
-                 ensure_multiple_of=32,
-                 resize_method=resize_mode,
-                 image_interpolation_method=cv2.INTER_CUBIC,
-             ),
-             normalization,
-             PrepareForNet(),
-         ]
-     )
-
-     return model.eval(), transform
-
-
- class MiDaSInference(nn.Module):
-     MODEL_TYPES_TORCH_HUB = [
-         "DPT_Large",
-         "DPT_Hybrid",
-         "MiDaS_small"
-     ]
-     MODEL_TYPES_ISL = [
-         "dpt_large",
-         "dpt_hybrid",
-         "midas_v21",
-         "midas_v21_small",
-     ]
-
-     def __init__(self, model_type):
-         super().__init__()
-         assert (model_type in self.MODEL_TYPES_ISL)
-         model, _ = load_model(model_type)
-         self.model = model
-         self.model.train = disabled_train
-
-     def forward(self, x):
-         with torch.no_grad():
-             prediction = self.model(x)
-         return prediction
-
annotator/midas/midas/__init__.py DELETED
File without changes
annotator/midas/midas/base_model.py DELETED
@@ -1,16 +0,0 @@
- import torch
-
-
- class BaseModel(torch.nn.Module):
-     def load(self, path):
-         """Load model from file.
-
-         Args:
-             path (str): file path
-         """
-         parameters = torch.load(path, map_location=torch.device('cpu'))
-
-         if "optimizer" in parameters:
-             parameters = parameters["model"]
-
-         self.load_state_dict(parameters)
annotator/midas/midas/blocks.py DELETED
@@ -1,342 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
-
4
- from .vit import (
5
- _make_pretrained_vitb_rn50_384,
6
- _make_pretrained_vitl16_384,
7
- _make_pretrained_vitb16_384,
8
- forward_vit,
9
- )
10
-
11
- def _make_encoder(backbone, features, use_pretrained, groups=1, expand=False, exportable=True, hooks=None, use_vit_only=False, use_readout="ignore",):
12
- if backbone == "vitl16_384":
13
- pretrained = _make_pretrained_vitl16_384(
14
- use_pretrained, hooks=hooks, use_readout=use_readout
15
- )
16
- scratch = _make_scratch(
17
- [256, 512, 1024, 1024], features, groups=groups, expand=expand
18
- ) # ViT-L/16 - 85.0% Top1 (backbone)
19
- elif backbone == "vitb_rn50_384":
20
- pretrained = _make_pretrained_vitb_rn50_384(
21
- use_pretrained,
22
- hooks=hooks,
23
- use_vit_only=use_vit_only,
24
- use_readout=use_readout,
25
- )
26
- scratch = _make_scratch(
27
- [256, 512, 768, 768], features, groups=groups, expand=expand
28
- ) # ViT-H/16 - 85.0% Top1 (backbone)
29
- elif backbone == "vitb16_384":
30
- pretrained = _make_pretrained_vitb16_384(
31
- use_pretrained, hooks=hooks, use_readout=use_readout
32
- )
33
- scratch = _make_scratch(
34
- [96, 192, 384, 768], features, groups=groups, expand=expand
35
- ) # ViT-B/16 - 84.6% Top1 (backbone)
36
- elif backbone == "resnext101_wsl":
37
- pretrained = _make_pretrained_resnext101_wsl(use_pretrained)
38
- scratch = _make_scratch([256, 512, 1024, 2048], features, groups=groups, expand=expand) # efficientnet_lite3
39
- elif backbone == "efficientnet_lite3":
40
- pretrained = _make_pretrained_efficientnet_lite3(use_pretrained, exportable=exportable)
41
- scratch = _make_scratch([32, 48, 136, 384], features, groups=groups, expand=expand) # efficientnet_lite3
42
- else:
43
- print(f"Backbone '{backbone}' not implemented")
44
- assert False
45
-
46
- return pretrained, scratch
47
-
48
-
49
- def _make_scratch(in_shape, out_shape, groups=1, expand=False):
50
- scratch = nn.Module()
51
-
52
- out_shape1 = out_shape
53
- out_shape2 = out_shape
54
- out_shape3 = out_shape
55
- out_shape4 = out_shape
56
- if expand==True:
57
- out_shape1 = out_shape
58
- out_shape2 = out_shape*2
59
- out_shape3 = out_shape*4
60
- out_shape4 = out_shape*8
61
-
62
- scratch.layer1_rn = nn.Conv2d(
63
- in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
64
- )
65
- scratch.layer2_rn = nn.Conv2d(
66
- in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
67
- )
68
- scratch.layer3_rn = nn.Conv2d(
69
- in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
70
- )
71
- scratch.layer4_rn = nn.Conv2d(
72
- in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
73
- )
74
-
75
- return scratch
76
-
77
-
78
- def _make_pretrained_efficientnet_lite3(use_pretrained, exportable=False):
79
- efficientnet = torch.hub.load(
80
- "rwightman/gen-efficientnet-pytorch",
81
- "tf_efficientnet_lite3",
82
- pretrained=use_pretrained,
83
- exportable=exportable
84
- )
85
- return _make_efficientnet_backbone(efficientnet)
86
-
87
-
88
- def _make_efficientnet_backbone(effnet):
89
- pretrained = nn.Module()
90
-
91
- pretrained.layer1 = nn.Sequential(
92
- effnet.conv_stem, effnet.bn1, effnet.act1, *effnet.blocks[0:2]
93
- )
94
- pretrained.layer2 = nn.Sequential(*effnet.blocks[2:3])
95
- pretrained.layer3 = nn.Sequential(*effnet.blocks[3:5])
96
- pretrained.layer4 = nn.Sequential(*effnet.blocks[5:9])
97
-
98
- return pretrained
99
-
100
-
101
- def _make_resnet_backbone(resnet):
102
- pretrained = nn.Module()
103
- pretrained.layer1 = nn.Sequential(
104
- resnet.conv1, resnet.bn1, resnet.relu, resnet.maxpool, resnet.layer1
105
- )
106
-
107
- pretrained.layer2 = resnet.layer2
108
- pretrained.layer3 = resnet.layer3
109
- pretrained.layer4 = resnet.layer4
110
-
111
- return pretrained
112
-
113
-
114
- def _make_pretrained_resnext101_wsl(use_pretrained):
115
- resnet = torch.hub.load("facebookresearch/WSL-Images", "resnext101_32x8d_wsl")
116
- return _make_resnet_backbone(resnet)
117
-
118
-
119
-
120
- class Interpolate(nn.Module):
121
- """Interpolation module.
122
- """
123
-
124
- def __init__(self, scale_factor, mode, align_corners=False):
125
- """Init.
126
-
127
- Args:
128
- scale_factor (float): scaling
129
- mode (str): interpolation mode
130
- """
131
- super(Interpolate, self).__init__()
132
-
133
- self.interp = nn.functional.interpolate
134
- self.scale_factor = scale_factor
135
- self.mode = mode
136
- self.align_corners = align_corners
137
-
138
- def forward(self, x):
139
- """Forward pass.
140
-
141
- Args:
142
- x (tensor): input
143
-
144
- Returns:
145
- tensor: interpolated data
146
- """
147
-
148
- x = self.interp(
149
- x, scale_factor=self.scale_factor, mode=self.mode, align_corners=self.align_corners
150
- )
151
-
152
- return x
153
-
154
-
155
- class ResidualConvUnit(nn.Module):
156
- """Residual convolution module.
157
- """
158
-
159
- def __init__(self, features):
160
- """Init.
161
-
162
- Args:
163
- features (int): number of features
164
- """
165
- super().__init__()
166
-
167
- self.conv1 = nn.Conv2d(
168
- features, features, kernel_size=3, stride=1, padding=1, bias=True
169
- )
170
-
171
- self.conv2 = nn.Conv2d(
172
- features, features, kernel_size=3, stride=1, padding=1, bias=True
173
- )
174
-
175
- self.relu = nn.ReLU(inplace=True)
176
-
177
- def forward(self, x):
178
- """Forward pass.
179
-
180
- Args:
181
- x (tensor): input
182
-
183
- Returns:
184
- tensor: output
185
- """
186
- out = self.relu(x)
187
- out = self.conv1(out)
188
- out = self.relu(out)
189
- out = self.conv2(out)
190
-
191
- return out + x
192
-
193
-
194
- class FeatureFusionBlock(nn.Module):
195
- """Feature fusion block.
196
- """
197
-
198
- def __init__(self, features):
199
- """Init.
200
-
201
- Args:
202
- features (int): number of features
203
- """
204
- super(FeatureFusionBlock, self).__init__()
205
-
206
- self.resConfUnit1 = ResidualConvUnit(features)
207
- self.resConfUnit2 = ResidualConvUnit(features)
208
-
209
- def forward(self, *xs):
210
- """Forward pass.
211
-
212
- Returns:
213
- tensor: output
214
- """
215
- output = xs[0]
216
-
217
- if len(xs) == 2:
218
- output += self.resConfUnit1(xs[1])
219
-
220
- output = self.resConfUnit2(output)
221
-
222
- output = nn.functional.interpolate(
223
- output, scale_factor=2, mode="bilinear", align_corners=True
224
- )
225
-
226
- return output
227
-
228
-
229
-
230
-
231
- class ResidualConvUnit_custom(nn.Module):
232
- """Residual convolution module.
233
- """
234
-
235
- def __init__(self, features, activation, bn):
236
- """Init.
237
-
238
- Args:
239
- features (int): number of features
240
- """
241
- super().__init__()
242
-
243
- self.bn = bn
244
-
245
- self.groups=1
246
-
247
- self.conv1 = nn.Conv2d(
248
- features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups
249
- )
250
-
251
- self.conv2 = nn.Conv2d(
252
- features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups
253
- )
254
-
255
- if self.bn==True:
256
- self.bn1 = nn.BatchNorm2d(features)
257
- self.bn2 = nn.BatchNorm2d(features)
258
-
259
- self.activation = activation
260
-
261
- self.skip_add = nn.quantized.FloatFunctional()
262
-
263
- def forward(self, x):
264
- """Forward pass.
265
-
266
- Args:
267
- x (tensor): input
268
-
269
- Returns:
270
- tensor: output
271
- """
272
-
273
- out = self.activation(x)
274
- out = self.conv1(out)
275
- if self.bn==True:
276
- out = self.bn1(out)
277
-
278
- out = self.activation(out)
279
- out = self.conv2(out)
280
- if self.bn==True:
281
- out = self.bn2(out)
282
-
283
- if self.groups > 1:
284
- out = self.conv_merge(out)
285
-
286
- return self.skip_add.add(out, x)
287
-
288
- # return out + x
289
-
290
-
291
- class FeatureFusionBlock_custom(nn.Module):
292
- """Feature fusion block.
293
- """
294
-
295
- def __init__(self, features, activation, deconv=False, bn=False, expand=False, align_corners=True):
296
- """Init.
297
-
298
- Args:
299
- features (int): number of features
300
- """
301
- super(FeatureFusionBlock_custom, self).__init__()
302
-
303
- self.deconv = deconv
304
- self.align_corners = align_corners
305
-
306
- self.groups=1
307
-
308
- self.expand = expand
309
- out_features = features
310
- if self.expand==True:
311
- out_features = features//2
312
-
313
- self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1)
314
-
315
- self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn)
316
- self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn)
317
-
318
- self.skip_add = nn.quantized.FloatFunctional()
319
-
320
- def forward(self, *xs):
321
- """Forward pass.
322
-
323
- Returns:
324
- tensor: output
325
- """
326
- output = xs[0]
327
-
328
- if len(xs) == 2:
329
- res = self.resConfUnit1(xs[1])
330
- output = self.skip_add.add(output, res)
331
- # output += res
332
-
333
- output = self.resConfUnit2(output)
334
-
335
- output = nn.functional.interpolate(
336
- output, scale_factor=2, mode="bilinear", align_corners=self.align_corners
337
- )
338
-
339
- output = self.out_conv(output)
340
-
341
- return output
342
-
annotator/midas/midas/dpt_depth.py DELETED
@@ -1,109 +0,0 @@
- import torch
- import torch.nn as nn
- import torch.nn.functional as F
-
- from .base_model import BaseModel
- from .blocks import (
-     FeatureFusionBlock,
-     FeatureFusionBlock_custom,
-     Interpolate,
-     _make_encoder,
-     forward_vit,
- )
-
-
- def _make_fusion_block(features, use_bn):
-     return FeatureFusionBlock_custom(
-         features,
-         nn.ReLU(False),
-         deconv=False,
-         bn=use_bn,
-         expand=False,
-         align_corners=True,
-     )
-
-
- class DPT(BaseModel):
-     def __init__(
-         self,
-         head,
-         features=256,
-         backbone="vitb_rn50_384",
-         readout="project",
-         channels_last=False,
-         use_bn=False,
-     ):
-
-         super(DPT, self).__init__()
-
-         self.channels_last = channels_last
-
-         hooks = {
-             "vitb_rn50_384": [0, 1, 8, 11],
-             "vitb16_384": [2, 5, 8, 11],
-             "vitl16_384": [5, 11, 17, 23],
-         }
-
-         # Instantiate backbone and reassemble blocks
-         self.pretrained, self.scratch = _make_encoder(
-             backbone,
-             features,
-             False,  # Set to true of you want to train from scratch, uses ImageNet weights
-             groups=1,
-             expand=False,
-             exportable=False,
-             hooks=hooks[backbone],
-             use_readout=readout,
-         )
-
-         self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
-         self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
-         self.scratch.refinenet3 = _make_fusion_block(features, use_bn)
-         self.scratch.refinenet4 = _make_fusion_block(features, use_bn)
-
-         self.scratch.output_conv = head
-
-
-     def forward(self, x):
-         if self.channels_last == True:
-             x.contiguous(memory_format=torch.channels_last)
-
-         layer_1, layer_2, layer_3, layer_4 = forward_vit(self.pretrained, x)
-
-         layer_1_rn = self.scratch.layer1_rn(layer_1)
-         layer_2_rn = self.scratch.layer2_rn(layer_2)
-         layer_3_rn = self.scratch.layer3_rn(layer_3)
-         layer_4_rn = self.scratch.layer4_rn(layer_4)
-
-         path_4 = self.scratch.refinenet4(layer_4_rn)
-         path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
-         path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
-         path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
-
-         out = self.scratch.output_conv(path_1)
-
-         return out
-
-
- class DPTDepthModel(DPT):
-     def __init__(self, path=None, non_negative=True, **kwargs):
-         features = kwargs["features"] if "features" in kwargs else 256
-
-         head = nn.Sequential(
-             nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1),
-             Interpolate(scale_factor=2, mode="bilinear", align_corners=True),
-             nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1),
-             nn.ReLU(True),
-             nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
-             nn.ReLU(True) if non_negative else nn.Identity(),
-             nn.Identity(),
-         )
-
-         super().__init__(head, **kwargs)
-
-         if path is not None:
-             self.load(path)
-
-     def forward(self, x):
-         return super().forward(x).squeeze(dim=1)
-
annotator/midas/midas/midas_net.py DELETED
@@ -1,76 +0,0 @@
- """MidashNet: Network for monocular depth estimation trained by mixing several datasets.
- This file contains code that is adapted from
- https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py
- """
- import torch
- import torch.nn as nn
-
- from .base_model import BaseModel
- from .blocks import FeatureFusionBlock, Interpolate, _make_encoder
-
-
- class MidasNet(BaseModel):
-     """Network for monocular depth estimation.
-     """
-
-     def __init__(self, path=None, features=256, non_negative=True):
-         """Init.
-
-         Args:
-             path (str, optional): Path to saved model. Defaults to None.
-             features (int, optional): Number of features. Defaults to 256.
-             backbone (str, optional): Backbone network for encoder. Defaults to resnet50
-         """
-         print("Loading weights: ", path)
-
-         super(MidasNet, self).__init__()
-
-         use_pretrained = False if path is None else True
-
-         self.pretrained, self.scratch = _make_encoder(backbone="resnext101_wsl", features=features, use_pretrained=use_pretrained)
-
-         self.scratch.refinenet4 = FeatureFusionBlock(features)
-         self.scratch.refinenet3 = FeatureFusionBlock(features)
-         self.scratch.refinenet2 = FeatureFusionBlock(features)
-         self.scratch.refinenet1 = FeatureFusionBlock(features)
-
-         self.scratch.output_conv = nn.Sequential(
-             nn.Conv2d(features, 128, kernel_size=3, stride=1, padding=1),
-             Interpolate(scale_factor=2, mode="bilinear"),
-             nn.Conv2d(128, 32, kernel_size=3, stride=1, padding=1),
-             nn.ReLU(True),
-             nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
-             nn.ReLU(True) if non_negative else nn.Identity(),
-         )
-
-         if path:
-             self.load(path)
-
-     def forward(self, x):
-         """Forward pass.
-
-         Args:
-             x (tensor): input data (image)
-
-         Returns:
-             tensor: depth
-         """
-
-         layer_1 = self.pretrained.layer1(x)
-         layer_2 = self.pretrained.layer2(layer_1)
-         layer_3 = self.pretrained.layer3(layer_2)
-         layer_4 = self.pretrained.layer4(layer_3)
-
-         layer_1_rn = self.scratch.layer1_rn(layer_1)
-         layer_2_rn = self.scratch.layer2_rn(layer_2)
-         layer_3_rn = self.scratch.layer3_rn(layer_3)
-         layer_4_rn = self.scratch.layer4_rn(layer_4)
-
-         path_4 = self.scratch.refinenet4(layer_4_rn)
-         path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
-         path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
-         path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
-
-         out = self.scratch.output_conv(path_1)
-
-         return torch.squeeze(out, dim=1)
annotator/midas/midas/midas_net_custom.py DELETED
@@ -1,128 +0,0 @@
1
- """MidashNet: Network for monocular depth estimation trained by mixing several datasets.
2
- This file contains code that is adapted from
3
- https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py
4
- """
5
- import torch
6
- import torch.nn as nn
7
-
8
- from .base_model import BaseModel
9
- from .blocks import FeatureFusionBlock, FeatureFusionBlock_custom, Interpolate, _make_encoder
10
-
11
-
12
- class MidasNet_small(BaseModel):
13
- """Network for monocular depth estimation.
14
- """
15
-
16
- def __init__(self, path=None, features=64, backbone="efficientnet_lite3", non_negative=True, exportable=True, channels_last=False, align_corners=True,
17
- blocks={'expand': True}):
18
- """Init.
19
-
20
- Args:
21
- path (str, optional): Path to saved model. Defaults to None.
22
- features (int, optional): Number of features. Defaults to 256.
23
- backbone (str, optional): Backbone network for encoder. Defaults to resnet50
24
- """
25
- print("Loading weights: ", path)
26
-
27
- super(MidasNet_small, self).__init__()
28
-
29
- use_pretrained = False if path else True
30
-
31
- self.channels_last = channels_last
32
- self.blocks = blocks
33
- self.backbone = backbone
34
-
35
- self.groups = 1
36
-
37
- features1=features
38
- features2=features
39
- features3=features
40
- features4=features
41
- self.expand = False
42
- if "expand" in self.blocks and self.blocks['expand'] == True:
43
- self.expand = True
44
- features1=features
45
- features2=features*2
46
- features3=features*4
47
- features4=features*8
48
-
49
- self.pretrained, self.scratch = _make_encoder(self.backbone, features, use_pretrained, groups=self.groups, expand=self.expand, exportable=exportable)
50
-
51
- self.scratch.activation = nn.ReLU(False)
52
-
53
- self.scratch.refinenet4 = FeatureFusionBlock_custom(features4, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners)
54
- self.scratch.refinenet3 = FeatureFusionBlock_custom(features3, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners)
55
- self.scratch.refinenet2 = FeatureFusionBlock_custom(features2, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners)
56
- self.scratch.refinenet1 = FeatureFusionBlock_custom(features1, self.scratch.activation, deconv=False, bn=False, align_corners=align_corners)
57
-
58
-
59
- self.scratch.output_conv = nn.Sequential(
60
- nn.Conv2d(features, features//2, kernel_size=3, stride=1, padding=1, groups=self.groups),
61
- Interpolate(scale_factor=2, mode="bilinear"),
62
- nn.Conv2d(features//2, 32, kernel_size=3, stride=1, padding=1),
63
- self.scratch.activation,
64
- nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
65
- nn.ReLU(True) if non_negative else nn.Identity(),
66
- nn.Identity(),
67
- )
68
-
69
- if path:
70
- self.load(path)
71
-
72
-
73
- def forward(self, x):
74
- """Forward pass.
75
-
76
- Args:
77
- x (tensor): input data (image)
78
-
79
- Returns:
80
- tensor: depth
81
- """
82
- if self.channels_last==True:
83
- print("self.channels_last = ", self.channels_last)
84
- x.contiguous(memory_format=torch.channels_last)
85
-
86
-
87
- layer_1 = self.pretrained.layer1(x)
88
- layer_2 = self.pretrained.layer2(layer_1)
89
- layer_3 = self.pretrained.layer3(layer_2)
90
- layer_4 = self.pretrained.layer4(layer_3)
91
-
92
- layer_1_rn = self.scratch.layer1_rn(layer_1)
93
- layer_2_rn = self.scratch.layer2_rn(layer_2)
94
- layer_3_rn = self.scratch.layer3_rn(layer_3)
95
- layer_4_rn = self.scratch.layer4_rn(layer_4)
96
-
97
-
98
- path_4 = self.scratch.refinenet4(layer_4_rn)
99
- path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
100
- path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
101
- path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
102
-
103
- out = self.scratch.output_conv(path_1)
104
-
105
- return torch.squeeze(out, dim=1)
106
-
107
-
108
-
109
- def fuse_model(m):
110
- prev_previous_type = nn.Identity()
111
- prev_previous_name = ''
112
- previous_type = nn.Identity()
113
- previous_name = ''
114
- for name, module in m.named_modules():
115
- if prev_previous_type == nn.Conv2d and previous_type == nn.BatchNorm2d and type(module) == nn.ReLU:
116
- # print("FUSED ", prev_previous_name, previous_name, name)
117
- torch.quantization.fuse_modules(m, [prev_previous_name, previous_name, name], inplace=True)
118
- elif prev_previous_type == nn.Conv2d and previous_type == nn.BatchNorm2d:
119
- # print("FUSED ", prev_previous_name, previous_name)
120
- torch.quantization.fuse_modules(m, [prev_previous_name, previous_name], inplace=True)
121
- # elif previous_type == nn.Conv2d and type(module) == nn.ReLU:
122
- # print("FUSED ", previous_name, name)
123
- # torch.quantization.fuse_modules(m, [previous_name, name], inplace=True)
124
-
125
- prev_previous_type = previous_type
126
- prev_previous_name = previous_name
127
- previous_type = type(module)
128
- previous_name = name
annotator/midas/midas/transforms.py DELETED
@@ -1,234 +0,0 @@
1
- import numpy as np
2
- import cv2
3
- import math
4
-
5
-
6
- def apply_min_size(sample, size, image_interpolation_method=cv2.INTER_AREA):
7
- """Rezise the sample to ensure the given size. Keeps aspect ratio.
8
-
9
- Args:
10
- sample (dict): sample
11
- size (tuple): image size
12
-
13
- Returns:
14
- tuple: new size
15
- """
16
- shape = list(sample["disparity"].shape)
17
-
18
- if shape[0] >= size[0] and shape[1] >= size[1]:
19
- return sample
20
-
21
- scale = [0, 0]
22
- scale[0] = size[0] / shape[0]
23
- scale[1] = size[1] / shape[1]
24
-
25
- scale = max(scale)
26
-
27
- shape[0] = math.ceil(scale * shape[0])
28
- shape[1] = math.ceil(scale * shape[1])
29
-
30
- # resize
31
- sample["image"] = cv2.resize(
32
- sample["image"], tuple(shape[::-1]), interpolation=image_interpolation_method
33
- )
34
-
35
- sample["disparity"] = cv2.resize(
36
- sample["disparity"], tuple(shape[::-1]), interpolation=cv2.INTER_NEAREST
37
- )
38
- sample["mask"] = cv2.resize(
39
- sample["mask"].astype(np.float32),
40
- tuple(shape[::-1]),
41
- interpolation=cv2.INTER_NEAREST,
42
- )
43
- sample["mask"] = sample["mask"].astype(bool)
44
-
45
- return tuple(shape)
46
-
47
-
48
- class Resize(object):
49
- """Resize sample to given size (width, height).
50
- """
51
-
52
- def __init__(
53
- self,
54
- width,
55
- height,
56
- resize_target=True,
57
- keep_aspect_ratio=False,
58
- ensure_multiple_of=1,
59
- resize_method="lower_bound",
60
- image_interpolation_method=cv2.INTER_AREA,
61
- ):
62
- """Init.
63
-
64
- Args:
65
- width (int): desired output width
66
- height (int): desired output height
67
- resize_target (bool, optional):
68
- True: Resize the full sample (image, mask, target).
69
- False: Resize image only.
70
- Defaults to True.
71
- keep_aspect_ratio (bool, optional):
72
- True: Keep the aspect ratio of the input sample.
73
- Output sample might not have the given width and height, and
74
- resize behaviour depends on the parameter 'resize_method'.
75
- Defaults to False.
76
- ensure_multiple_of (int, optional):
77
- Output width and height is constrained to be multiple of this parameter.
78
- Defaults to 1.
79
- resize_method (str, optional):
80
- "lower_bound": Output will be at least as large as the given size.
81
- "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.)
82
- "minimal": Scale as least as possible. (Output size might be smaller than given size.)
83
- Defaults to "lower_bound".
84
- """
85
- self.__width = width
86
- self.__height = height
87
-
88
- self.__resize_target = resize_target
89
- self.__keep_aspect_ratio = keep_aspect_ratio
90
- self.__multiple_of = ensure_multiple_of
91
- self.__resize_method = resize_method
92
- self.__image_interpolation_method = image_interpolation_method
93
-
94
- def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
95
- y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)
96
-
97
- if max_val is not None and y > max_val:
98
- y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int)
99
-
100
- if y < min_val:
101
- y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int)
102
-
103
- return y
104
-
105
- def get_size(self, width, height):
106
- # determine new height and width
107
- scale_height = self.__height / height
108
- scale_width = self.__width / width
109
-
110
- if self.__keep_aspect_ratio:
111
- if self.__resize_method == "lower_bound":
112
- # scale such that output size is lower bound
113
- if scale_width > scale_height:
114
- # fit width
115
- scale_height = scale_width
116
- else:
117
- # fit height
118
- scale_width = scale_height
119
- elif self.__resize_method == "upper_bound":
120
- # scale such that output size is upper bound
121
- if scale_width < scale_height:
122
- # fit width
123
- scale_height = scale_width
124
- else:
125
- # fit height
126
- scale_width = scale_height
127
- elif self.__resize_method == "minimal":
128
- # scale as least as possbile
129
- if abs(1 - scale_width) < abs(1 - scale_height):
130
- # fit width
131
- scale_height = scale_width
132
- else:
133
- # fit height
134
- scale_width = scale_height
135
- else:
136
- raise ValueError(
137
- f"resize_method {self.__resize_method} not implemented"
138
- )
139
-
140
- if self.__resize_method == "lower_bound":
141
- new_height = self.constrain_to_multiple_of(
142
- scale_height * height, min_val=self.__height
143
- )
144
- new_width = self.constrain_to_multiple_of(
145
- scale_width * width, min_val=self.__width
146
- )
147
- elif self.__resize_method == "upper_bound":
148
- new_height = self.constrain_to_multiple_of(
149
- scale_height * height, max_val=self.__height
150
- )
151
- new_width = self.constrain_to_multiple_of(
152
- scale_width * width, max_val=self.__width
153
- )
154
- elif self.__resize_method == "minimal":
155
- new_height = self.constrain_to_multiple_of(scale_height * height)
156
- new_width = self.constrain_to_multiple_of(scale_width * width)
157
- else:
158
- raise ValueError(f"resize_method {self.__resize_method} not implemented")
159
-
160
- return (new_width, new_height)
161
-
162
- def __call__(self, sample):
163
- width, height = self.get_size(
164
- sample["image"].shape[1], sample["image"].shape[0]
165
- )
166
-
167
- # resize sample
168
- sample["image"] = cv2.resize(
169
- sample["image"],
170
- (width, height),
171
- interpolation=self.__image_interpolation_method,
172
- )
173
-
174
- if self.__resize_target:
175
- if "disparity" in sample:
176
- sample["disparity"] = cv2.resize(
177
- sample["disparity"],
178
- (width, height),
179
- interpolation=cv2.INTER_NEAREST,
180
- )
181
-
182
- if "depth" in sample:
183
- sample["depth"] = cv2.resize(
184
- sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST
185
- )
186
-
187
- sample["mask"] = cv2.resize(
188
- sample["mask"].astype(np.float32),
189
- (width, height),
190
- interpolation=cv2.INTER_NEAREST,
191
- )
192
- sample["mask"] = sample["mask"].astype(bool)
193
-
194
- return sample
195
-
196
-
197
- class NormalizeImage(object):
198
- """Normlize image by given mean and std.
199
- """
200
-
201
- def __init__(self, mean, std):
202
- self.__mean = mean
203
- self.__std = std
204
-
205
- def __call__(self, sample):
206
- sample["image"] = (sample["image"] - self.__mean) / self.__std
207
-
208
- return sample
209
-
210
-
211
- class PrepareForNet(object):
212
- """Prepare sample for usage as network input.
213
- """
214
-
215
- def __init__(self):
216
- pass
217
-
218
- def __call__(self, sample):
219
- image = np.transpose(sample["image"], (2, 0, 1))
220
- sample["image"] = np.ascontiguousarray(image).astype(np.float32)
221
-
222
- if "mask" in sample:
223
- sample["mask"] = sample["mask"].astype(np.float32)
224
- sample["mask"] = np.ascontiguousarray(sample["mask"])
225
-
226
- if "disparity" in sample:
227
- disparity = sample["disparity"].astype(np.float32)
228
- sample["disparity"] = np.ascontiguousarray(disparity)
229
-
230
- if "depth" in sample:
231
- depth = sample["depth"].astype(np.float32)
232
- sample["depth"] = np.ascontiguousarray(depth)
233
-
234
- return sample
annotator/midas/midas/vit.py DELETED
@@ -1,491 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
- import timm
4
- import types
5
- import math
6
- import torch.nn.functional as F
7
-
8
-
9
- class Slice(nn.Module):
10
- def __init__(self, start_index=1):
11
- super(Slice, self).__init__()
12
- self.start_index = start_index
13
-
14
- def forward(self, x):
15
- return x[:, self.start_index :]
16
-
17
-
18
- class AddReadout(nn.Module):
19
- def __init__(self, start_index=1):
20
- super(AddReadout, self).__init__()
21
- self.start_index = start_index
22
-
23
- def forward(self, x):
24
- if self.start_index == 2:
25
- readout = (x[:, 0] + x[:, 1]) / 2
26
- else:
27
- readout = x[:, 0]
28
- return x[:, self.start_index :] + readout.unsqueeze(1)
29
-
30
-
31
- class ProjectReadout(nn.Module):
32
- def __init__(self, in_features, start_index=1):
33
- super(ProjectReadout, self).__init__()
34
- self.start_index = start_index
35
-
36
- self.project = nn.Sequential(nn.Linear(2 * in_features, in_features), nn.GELU())
37
-
38
- def forward(self, x):
39
- readout = x[:, 0].unsqueeze(1).expand_as(x[:, self.start_index :])
40
- features = torch.cat((x[:, self.start_index :], readout), -1)
41
-
42
- return self.project(features)
43
-
44
-
45
- class Transpose(nn.Module):
46
- def __init__(self, dim0, dim1):
47
- super(Transpose, self).__init__()
48
- self.dim0 = dim0
49
- self.dim1 = dim1
50
-
51
- def forward(self, x):
52
- x = x.transpose(self.dim0, self.dim1)
53
- return x
54
-
55
-
56
- def forward_vit(pretrained, x):
57
- b, c, h, w = x.shape
58
-
59
- glob = pretrained.model.forward_flex(x)
60
-
61
- layer_1 = pretrained.activations["1"]
62
- layer_2 = pretrained.activations["2"]
63
- layer_3 = pretrained.activations["3"]
64
- layer_4 = pretrained.activations["4"]
65
-
66
- layer_1 = pretrained.act_postprocess1[0:2](layer_1)
67
- layer_2 = pretrained.act_postprocess2[0:2](layer_2)
68
- layer_3 = pretrained.act_postprocess3[0:2](layer_3)
69
- layer_4 = pretrained.act_postprocess4[0:2](layer_4)
70
-
71
- unflatten = nn.Sequential(
72
- nn.Unflatten(
73
- 2,
74
- torch.Size(
75
- [
76
- h // pretrained.model.patch_size[1],
77
- w // pretrained.model.patch_size[0],
78
- ]
79
- ),
80
- )
81
- )
82
-
83
- if layer_1.ndim == 3:
84
- layer_1 = unflatten(layer_1)
85
- if layer_2.ndim == 3:
86
- layer_2 = unflatten(layer_2)
87
- if layer_3.ndim == 3:
88
- layer_3 = unflatten(layer_3)
89
- if layer_4.ndim == 3:
90
- layer_4 = unflatten(layer_4)
91
-
92
- layer_1 = pretrained.act_postprocess1[3 : len(pretrained.act_postprocess1)](layer_1)
93
- layer_2 = pretrained.act_postprocess2[3 : len(pretrained.act_postprocess2)](layer_2)
94
- layer_3 = pretrained.act_postprocess3[3 : len(pretrained.act_postprocess3)](layer_3)
95
- layer_4 = pretrained.act_postprocess4[3 : len(pretrained.act_postprocess4)](layer_4)
96
-
97
- return layer_1, layer_2, layer_3, layer_4
98
-
99
-
100
- def _resize_pos_embed(self, posemb, gs_h, gs_w):
101
- posemb_tok, posemb_grid = (
102
- posemb[:, : self.start_index],
103
- posemb[0, self.start_index :],
104
- )
105
-
106
- gs_old = int(math.sqrt(len(posemb_grid)))
107
-
108
- posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2)
109
- posemb_grid = F.interpolate(posemb_grid, size=(gs_h, gs_w), mode="bilinear")
110
- posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1)
111
-
112
- posemb = torch.cat([posemb_tok, posemb_grid], dim=1)
113
-
114
- return posemb
115
-
116
-
117
- def forward_flex(self, x):
118
- b, c, h, w = x.shape
119
-
120
- pos_embed = self._resize_pos_embed(
121
- self.pos_embed, h // self.patch_size[1], w // self.patch_size[0]
122
- )
123
-
124
- B = x.shape[0]
125
-
126
- if hasattr(self.patch_embed, "backbone"):
127
- x = self.patch_embed.backbone(x)
128
- if isinstance(x, (list, tuple)):
129
- x = x[-1] # last feature if backbone outputs list/tuple of features
130
-
131
- x = self.patch_embed.proj(x).flatten(2).transpose(1, 2)
132
-
133
- if getattr(self, "dist_token", None) is not None:
134
- cls_tokens = self.cls_token.expand(
135
- B, -1, -1
136
- ) # stole cls_tokens impl from Phil Wang, thanks
137
- dist_token = self.dist_token.expand(B, -1, -1)
138
- x = torch.cat((cls_tokens, dist_token, x), dim=1)
139
- else:
140
- cls_tokens = self.cls_token.expand(
141
- B, -1, -1
142
- ) # stole cls_tokens impl from Phil Wang, thanks
143
- x = torch.cat((cls_tokens, x), dim=1)
144
-
145
- x = x + pos_embed
146
- x = self.pos_drop(x)
147
-
148
- for blk in self.blocks:
149
- x = blk(x)
150
-
151
- x = self.norm(x)
152
-
153
- return x
154
-
155
-
156
- activations = {}
157
-
158
-
159
- def get_activation(name):
160
- def hook(model, input, output):
161
- activations[name] = output
162
-
163
- return hook
164
-
165
-
166
- def get_readout_oper(vit_features, features, use_readout, start_index=1):
167
- if use_readout == "ignore":
168
- readout_oper = [Slice(start_index)] * len(features)
169
- elif use_readout == "add":
170
- readout_oper = [AddReadout(start_index)] * len(features)
171
- elif use_readout == "project":
172
- readout_oper = [
173
- ProjectReadout(vit_features, start_index) for out_feat in features
174
- ]
175
- else:
176
- assert (
177
- False
178
- ), "wrong operation for readout token, use_readout can be 'ignore', 'add', or 'project'"
179
-
180
- return readout_oper
181
-
182
-
183
- def _make_vit_b16_backbone(
184
- model,
185
- features=[96, 192, 384, 768],
186
- size=[384, 384],
187
- hooks=[2, 5, 8, 11],
188
- vit_features=768,
189
- use_readout="ignore",
190
- start_index=1,
191
- ):
192
- pretrained = nn.Module()
193
-
194
- pretrained.model = model
195
- pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1"))
196
- pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2"))
197
- pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3"))
198
- pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4"))
199
-
200
- pretrained.activations = activations
201
-
202
- readout_oper = get_readout_oper(vit_features, features, use_readout, start_index)
203
-
204
- # 32, 48, 136, 384
205
- pretrained.act_postprocess1 = nn.Sequential(
206
- readout_oper[0],
207
- Transpose(1, 2),
208
- nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
209
- nn.Conv2d(
210
- in_channels=vit_features,
211
- out_channels=features[0],
212
- kernel_size=1,
213
- stride=1,
214
- padding=0,
215
- ),
216
- nn.ConvTranspose2d(
217
- in_channels=features[0],
218
- out_channels=features[0],
219
- kernel_size=4,
220
- stride=4,
221
- padding=0,
222
- bias=True,
223
- dilation=1,
224
- groups=1,
225
- ),
226
- )
227
-
228
- pretrained.act_postprocess2 = nn.Sequential(
229
- readout_oper[1],
230
- Transpose(1, 2),
231
- nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
232
- nn.Conv2d(
233
- in_channels=vit_features,
234
- out_channels=features[1],
235
- kernel_size=1,
236
- stride=1,
237
- padding=0,
238
- ),
239
- nn.ConvTranspose2d(
240
- in_channels=features[1],
241
- out_channels=features[1],
242
- kernel_size=2,
243
- stride=2,
244
- padding=0,
245
- bias=True,
246
- dilation=1,
247
- groups=1,
248
- ),
249
- )
250
-
251
- pretrained.act_postprocess3 = nn.Sequential(
252
- readout_oper[2],
253
- Transpose(1, 2),
254
- nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
255
- nn.Conv2d(
256
- in_channels=vit_features,
257
- out_channels=features[2],
258
- kernel_size=1,
259
- stride=1,
260
- padding=0,
261
- ),
262
- )
263
-
264
- pretrained.act_postprocess4 = nn.Sequential(
265
- readout_oper[3],
266
- Transpose(1, 2),
267
- nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
268
- nn.Conv2d(
269
- in_channels=vit_features,
270
- out_channels=features[3],
271
- kernel_size=1,
272
- stride=1,
273
- padding=0,
274
- ),
275
- nn.Conv2d(
276
- in_channels=features[3],
277
- out_channels=features[3],
278
- kernel_size=3,
279
- stride=2,
280
- padding=1,
281
- ),
282
- )
283
-
284
- pretrained.model.start_index = start_index
285
- pretrained.model.patch_size = [16, 16]
286
-
287
- # We inject this function into the VisionTransformer instances so that
288
- # we can use it with interpolated position embeddings without modifying the library source.
289
- pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model)
290
- pretrained.model._resize_pos_embed = types.MethodType(
291
- _resize_pos_embed, pretrained.model
292
- )
293
-
294
- return pretrained
295
-
296
-
297
- def _make_pretrained_vitl16_384(pretrained, use_readout="ignore", hooks=None):
298
- model = timm.create_model("vit_large_patch16_384", pretrained=pretrained)
299
-
300
- hooks = [5, 11, 17, 23] if hooks == None else hooks
301
- return _make_vit_b16_backbone(
302
- model,
303
- features=[256, 512, 1024, 1024],
304
- hooks=hooks,
305
- vit_features=1024,
306
- use_readout=use_readout,
307
- )
308
-
309
-
310
- def _make_pretrained_vitb16_384(pretrained, use_readout="ignore", hooks=None):
311
- model = timm.create_model("vit_base_patch16_384", pretrained=pretrained)
312
-
313
- hooks = [2, 5, 8, 11] if hooks == None else hooks
314
- return _make_vit_b16_backbone(
315
- model, features=[96, 192, 384, 768], hooks=hooks, use_readout=use_readout
316
- )
317
-
318
-
319
- def _make_pretrained_deitb16_384(pretrained, use_readout="ignore", hooks=None):
320
- model = timm.create_model("vit_deit_base_patch16_384", pretrained=pretrained)
321
-
322
- hooks = [2, 5, 8, 11] if hooks == None else hooks
323
- return _make_vit_b16_backbone(
324
- model, features=[96, 192, 384, 768], hooks=hooks, use_readout=use_readout
325
- )
326
-
327
-
328
- def _make_pretrained_deitb16_distil_384(pretrained, use_readout="ignore", hooks=None):
329
- model = timm.create_model(
330
- "vit_deit_base_distilled_patch16_384", pretrained=pretrained
331
- )
332
-
333
- hooks = [2, 5, 8, 11] if hooks == None else hooks
334
- return _make_vit_b16_backbone(
335
- model,
336
- features=[96, 192, 384, 768],
337
- hooks=hooks,
338
- use_readout=use_readout,
339
- start_index=2,
340
- )
341
-
342
-
343
- def _make_vit_b_rn50_backbone(
344
- model,
345
- features=[256, 512, 768, 768],
346
- size=[384, 384],
347
- hooks=[0, 1, 8, 11],
348
- vit_features=768,
349
- use_vit_only=False,
350
- use_readout="ignore",
351
- start_index=1,
352
- ):
353
- pretrained = nn.Module()
354
-
355
- pretrained.model = model
356
-
357
- if use_vit_only == True:
358
- pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1"))
359
- pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2"))
360
- else:
361
- pretrained.model.patch_embed.backbone.stages[0].register_forward_hook(
362
- get_activation("1")
363
- )
364
- pretrained.model.patch_embed.backbone.stages[1].register_forward_hook(
365
- get_activation("2")
366
- )
367
-
368
- pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3"))
369
- pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4"))
370
-
371
- pretrained.activations = activations
372
-
373
- readout_oper = get_readout_oper(vit_features, features, use_readout, start_index)
374
-
375
- if use_vit_only == True:
376
- pretrained.act_postprocess1 = nn.Sequential(
377
- readout_oper[0],
378
- Transpose(1, 2),
379
- nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
380
- nn.Conv2d(
381
- in_channels=vit_features,
382
- out_channels=features[0],
383
- kernel_size=1,
384
- stride=1,
385
- padding=0,
386
- ),
387
- nn.ConvTranspose2d(
388
- in_channels=features[0],
389
- out_channels=features[0],
390
- kernel_size=4,
391
- stride=4,
392
- padding=0,
393
- bias=True,
394
- dilation=1,
395
- groups=1,
396
- ),
397
- )
398
-
399
- pretrained.act_postprocess2 = nn.Sequential(
400
- readout_oper[1],
401
- Transpose(1, 2),
402
- nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
403
- nn.Conv2d(
404
- in_channels=vit_features,
405
- out_channels=features[1],
406
- kernel_size=1,
407
- stride=1,
408
- padding=0,
409
- ),
410
- nn.ConvTranspose2d(
411
- in_channels=features[1],
412
- out_channels=features[1],
413
- kernel_size=2,
414
- stride=2,
415
- padding=0,
416
- bias=True,
417
- dilation=1,
418
- groups=1,
419
- ),
420
- )
421
- else:
422
- pretrained.act_postprocess1 = nn.Sequential(
423
- nn.Identity(), nn.Identity(), nn.Identity()
424
- )
425
- pretrained.act_postprocess2 = nn.Sequential(
426
- nn.Identity(), nn.Identity(), nn.Identity()
427
- )
428
-
429
- pretrained.act_postprocess3 = nn.Sequential(
430
- readout_oper[2],
431
- Transpose(1, 2),
432
- nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
433
- nn.Conv2d(
434
- in_channels=vit_features,
435
- out_channels=features[2],
436
- kernel_size=1,
437
- stride=1,
438
- padding=0,
439
- ),
440
- )
441
-
442
- pretrained.act_postprocess4 = nn.Sequential(
443
- readout_oper[3],
444
- Transpose(1, 2),
445
- nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
446
- nn.Conv2d(
447
- in_channels=vit_features,
448
- out_channels=features[3],
449
- kernel_size=1,
450
- stride=1,
451
- padding=0,
452
- ),
453
- nn.Conv2d(
454
- in_channels=features[3],
455
- out_channels=features[3],
456
- kernel_size=3,
457
- stride=2,
458
- padding=1,
459
- ),
460
- )
461
-
462
- pretrained.model.start_index = start_index
463
- pretrained.model.patch_size = [16, 16]
464
-
465
- # We inject this function into the VisionTransformer instances so that
466
- # we can use it with interpolated position embeddings without modifying the library source.
467
- pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model)
468
-
469
- # We inject this function into the VisionTransformer instances so that
470
- # we can use it with interpolated position embeddings without modifying the library source.
471
- pretrained.model._resize_pos_embed = types.MethodType(
472
- _resize_pos_embed, pretrained.model
473
- )
474
-
475
- return pretrained
476
-
477
-
478
- def _make_pretrained_vitb_rn50_384(
479
- pretrained, use_readout="ignore", hooks=None, use_vit_only=False
480
- ):
481
- model = timm.create_model("vit_base_resnet50_384", pretrained=pretrained)
482
-
483
- hooks = [0, 1, 8, 11] if hooks == None else hooks
484
- return _make_vit_b_rn50_backbone(
485
- model,
486
- features=[256, 512, 768, 768],
487
- size=[384, 384],
488
- hooks=hooks,
489
- use_vit_only=use_vit_only,
490
- use_readout=use_readout,
491
- )
annotator/midas/utils.py DELETED
@@ -1,189 +0,0 @@
1
- """Utils for monoDepth."""
2
- import sys
3
- import re
4
- import numpy as np
5
- import cv2
6
- import torch
7
-
8
-
9
- def read_pfm(path):
10
- """Read pfm file.
11
-
12
- Args:
13
- path (str): path to file
14
-
15
- Returns:
16
- tuple: (data, scale)
17
- """
18
- with open(path, "rb") as file:
19
-
20
- color = None
21
- width = None
22
- height = None
23
- scale = None
24
- endian = None
25
-
26
- header = file.readline().rstrip()
27
- if header.decode("ascii") == "PF":
28
- color = True
29
- elif header.decode("ascii") == "Pf":
30
- color = False
31
- else:
32
- raise Exception("Not a PFM file: " + path)
33
-
34
- dim_match = re.match(r"^(\d+)\s(\d+)\s$", file.readline().decode("ascii"))
35
- if dim_match:
36
- width, height = list(map(int, dim_match.groups()))
37
- else:
38
- raise Exception("Malformed PFM header.")
39
-
40
- scale = float(file.readline().decode("ascii").rstrip())
41
- if scale < 0:
42
- # little-endian
43
- endian = "<"
44
- scale = -scale
45
- else:
46
- # big-endian
47
- endian = ">"
48
-
49
- data = np.fromfile(file, endian + "f")
50
- shape = (height, width, 3) if color else (height, width)
51
-
52
- data = np.reshape(data, shape)
53
- data = np.flipud(data)
54
-
55
- return data, scale
56
-
57
-
58
- def write_pfm(path, image, scale=1):
59
- """Write pfm file.
60
-
61
- Args:
62
- path (str): pathto file
63
- image (array): data
64
- scale (int, optional): Scale. Defaults to 1.
65
- """
66
-
67
- with open(path, "wb") as file:
68
- color = None
69
-
70
- if image.dtype.name != "float32":
71
- raise Exception("Image dtype must be float32.")
72
-
73
- image = np.flipud(image)
74
-
75
- if len(image.shape) == 3 and image.shape[2] == 3: # color image
76
- color = True
77
- elif (
78
- len(image.shape) == 2 or len(image.shape) == 3 and image.shape[2] == 1
79
- ): # greyscale
80
- color = False
81
- else:
82
- raise Exception("Image must have H x W x 3, H x W x 1 or H x W dimensions.")
83
-
84
- file.write("PF\n" if color else "Pf\n".encode())
85
- file.write("%d %d\n".encode() % (image.shape[1], image.shape[0]))
86
-
87
- endian = image.dtype.byteorder
88
-
89
- if endian == "<" or endian == "=" and sys.byteorder == "little":
90
- scale = -scale
91
-
92
- file.write("%f\n".encode() % scale)
93
-
94
- image.tofile(file)
95
-
96
-
97
- def read_image(path):
98
- """Read image and output RGB image (0-1).
99
-
100
- Args:
101
- path (str): path to file
102
-
103
- Returns:
104
- array: RGB image (0-1)
105
- """
106
- img = cv2.imread(path)
107
-
108
- if img.ndim == 2:
109
- img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
110
-
111
- img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) / 255.0
112
-
113
- return img
114
-
115
-
116
- def resize_image(img):
117
- """Resize image and make it fit for network.
118
-
119
- Args:
120
- img (array): image
121
-
122
- Returns:
123
- tensor: data ready for network
124
- """
125
- height_orig = img.shape[0]
126
- width_orig = img.shape[1]
127
-
128
- if width_orig > height_orig:
129
- scale = width_orig / 384
130
- else:
131
- scale = height_orig / 384
132
-
133
- height = (np.ceil(height_orig / scale / 32) * 32).astype(int)
134
- width = (np.ceil(width_orig / scale / 32) * 32).astype(int)
135
-
136
- img_resized = cv2.resize(img, (width, height), interpolation=cv2.INTER_AREA)
137
-
138
- img_resized = (
139
- torch.from_numpy(np.transpose(img_resized, (2, 0, 1))).contiguous().float()
140
- )
141
- img_resized = img_resized.unsqueeze(0)
142
-
143
- return img_resized
144
-
145
-
146
- def resize_depth(depth, width, height):
147
- """Resize depth map and bring to CPU (numpy).
148
-
149
- Args:
150
- depth (tensor): depth
151
- width (int): image width
152
- height (int): image height
153
-
154
- Returns:
155
- array: processed depth
156
- """
157
- depth = torch.squeeze(depth[0, :, :, :]).to("cpu")
158
-
159
- depth_resized = cv2.resize(
160
- depth.numpy(), (width, height), interpolation=cv2.INTER_CUBIC
161
- )
162
-
163
- return depth_resized
164
-
165
- def write_depth(path, depth, bits=1):
166
- """Write depth map to pfm and png file.
167
-
168
- Args:
169
- path (str): filepath without extension
170
- depth (array): depth
171
- """
172
- write_pfm(path + ".pfm", depth.astype(np.float32))
173
-
174
- depth_min = depth.min()
175
- depth_max = depth.max()
176
-
177
- max_val = (2**(8*bits))-1
178
-
179
- if depth_max - depth_min > np.finfo("float").eps:
180
- out = max_val * (depth - depth_min) / (depth_max - depth_min)
181
- else:
182
- out = np.zeros(depth.shape, dtype=depth.type)
183
-
184
- if bits == 1:
185
- cv2.imwrite(path + ".png", out.astype("uint8"))
186
- elif bits == 2:
187
- cv2.imwrite(path + ".png", out.astype("uint16"))
188
-
189
- return
annotator/util.py DELETED
@@ -1,129 +0,0 @@
1
- import random
2
-
3
- import numpy as np
4
- import cv2
5
- import os
6
- import PIL
7
-
8
- annotator_ckpts_path = os.path.join(os.path.dirname(__file__), 'ckpts')
9
-
10
- def HWC3(x):
11
- assert x.dtype == np.uint8
12
- if x.ndim == 2:
13
- x = x[:, :, None]
14
- assert x.ndim == 3
15
- H, W, C = x.shape
16
- assert C == 1 or C == 3 or C == 4
17
- if C == 3:
18
- return x
19
- if C == 1:
20
- return np.concatenate([x, x, x], axis=2)
21
- if C == 4:
22
- color = x[:, :, 0:3].astype(np.float32)
23
- alpha = x[:, :, 3:4].astype(np.float32) / 255.0
24
- y = color * alpha + 255.0 * (1.0 - alpha)
25
- y = y.clip(0, 255).astype(np.uint8)
26
- return y
27
-
28
-
29
- def resize_image(input_image, resolution, short = False, interpolation=None):
30
- if isinstance(input_image,PIL.Image.Image):
31
- mode = 'pil'
32
- W,H = input_image.size
33
-
34
- elif isinstance(input_image,np.ndarray):
35
- mode = 'cv2'
36
- H, W, _ = input_image.shape
37
-
38
- H = float(H)
39
- W = float(W)
40
- if short:
41
- k = float(resolution) / min(H, W) # k>1 upscales, k<1 downscales
42
- else:
43
- k = float(resolution) / max(H, W) # k>1 upscales, k<1 downscales
44
- H *= k
45
- W *= k
46
- H = int(np.round(H / 64.0)) * 64
47
- W = int(np.round(W / 64.0)) * 64
48
-
49
- if mode == 'cv2':
50
- if interpolation is None:
51
- interpolation = cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA
52
- img = cv2.resize(input_image, (W, H), interpolation=interpolation)
53
-
54
- elif mode == 'pil':
55
- if interpolation is None:
56
- interpolation = PIL.Image.LANCZOS if k > 1 else PIL.Image.BILINEAR
57
- img = input_image.resize((W, H), resample=interpolation)
58
-
59
- return img
60
-
61
- # def resize_image(input_image, resolution):
62
- # H, W, C = input_image.shape
63
- # H = float(H)
64
- # W = float(W)
65
- # k = float(resolution) / min(H, W)
66
- # H *= k
67
- # W *= k
68
- # H = int(np.round(H / 64.0)) * 64
69
- # W = int(np.round(W / 64.0)) * 64
70
- # img = cv2.resize(input_image, (W, H), interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA)
71
- # return img
72
-
73
-
74
- def nms(x, t, s):
75
- x = cv2.GaussianBlur(x.astype(np.float32), (0, 0), s)
76
-
77
- f1 = np.array([[0, 0, 0], [1, 1, 1], [0, 0, 0]], dtype=np.uint8)
78
- f2 = np.array([[0, 1, 0], [0, 1, 0], [0, 1, 0]], dtype=np.uint8)
79
- f3 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=np.uint8)
80
- f4 = np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0]], dtype=np.uint8)
81
-
82
- y = np.zeros_like(x)
83
-
84
- for f in [f1, f2, f3, f4]:
85
- np.putmask(y, cv2.dilate(x, kernel=f) == x, x)
86
-
87
- z = np.zeros_like(y, dtype=np.uint8)
88
- z[y > t] = 255
89
- return z
90
-
91
-
92
- def make_noise_disk(H, W, C, F):
93
- noise = np.random.uniform(low=0, high=1, size=((H // F) + 2, (W // F) + 2, C))
94
- noise = cv2.resize(noise, (W + 2 * F, H + 2 * F), interpolation=cv2.INTER_CUBIC)
95
- noise = noise[F: F + H, F: F + W]
96
- noise -= np.min(noise)
97
- noise /= np.max(noise)
98
- if C == 1:
99
- noise = noise[:, :, None]
100
- return noise
101
-
102
-
103
- def min_max_norm(x):
104
- x -= np.min(x)
105
- x /= np.maximum(np.max(x), 1e-5)
106
- return x
107
-
108
-
109
- def safe_step(x, step=2):
110
- y = x.astype(np.float32) * float(step + 1)
111
- y = y.astype(np.int32).astype(np.float32) / float(step)
112
- return y
113
-
114
-
115
- def img2mask(img, H, W, low=10, high=90):
116
- assert img.ndim == 3 or img.ndim == 2
117
- assert img.dtype == np.uint8
118
-
119
- if img.ndim == 3:
120
- y = img[:, :, random.randrange(0, img.shape[2])]
121
- else:
122
- y = img
123
-
124
- y = cv2.resize(y, (W, H), interpolation=cv2.INTER_CUBIC)
125
-
126
- if random.uniform(0, 1) < 0.5:
127
- y = 255 - y
128
-
129
- return y < np.percentile(y, random.randrange(low, high))
app.py CHANGED
@@ -2,158 +2,126 @@ import spaces
2
  import random
3
  import torch
4
  import cv2
 
5
  import gradio as gr
6
  import numpy as np
7
  from huggingface_hub import snapshot_download
8
  from transformers import CLIPVisionModelWithProjection,CLIPImageProcessor
9
- from diffusers.utils import load_image
10
- from kolors.pipelines.pipeline_controlnet_xl_kolors_img2img import StableDiffusionXLControlNetImg2ImgPipeline
11
  from kolors.models.modeling_chatglm import ChatGLMModel
12
  from kolors.models.tokenization_chatglm import ChatGLMTokenizer
13
- from kolors.models.controlnet import ControlNetModel
14
- from diffusers import AutoencoderKL
15
  from kolors.models.unet_2d_condition import UNet2DConditionModel
16
  from diffusers import EulerDiscreteScheduler
17
  from PIL import Image
18
- from annotator.midas import MidasDetector
19
- from annotator.util import resize_image, HWC3
20
 
21
 
22
  device = "cuda"
23
  ckpt_dir = snapshot_download(repo_id="Kwai-Kolors/Kolors")
24
- ckpt_dir_depth = snapshot_download(repo_id="Kwai-Kolors/Kolors-ControlNet-Depth")
25
- ckpt_dir_canny = snapshot_download(repo_id="Kwai-Kolors/Kolors-ControlNet-Canny")
26
 
27
  text_encoder = ChatGLMModel.from_pretrained(f'{ckpt_dir}/text_encoder', torch_dtype=torch.float16).half().to(device)
28
  tokenizer = ChatGLMTokenizer.from_pretrained(f'{ckpt_dir}/text_encoder')
29
  vae = AutoencoderKL.from_pretrained(f"{ckpt_dir}/vae", revision=None).half().to(device)
30
  scheduler = EulerDiscreteScheduler.from_pretrained(f"{ckpt_dir}/scheduler")
31
  unet = UNet2DConditionModel.from_pretrained(f"{ckpt_dir}/unet", revision=None).half().to(device)
32
- controlnet_depth = ControlNetModel.from_pretrained(f"{ckpt_dir_depth}", revision=None).half().to(device)
33
- controlnet_canny = ControlNetModel.from_pretrained(f"{ckpt_dir_canny}", revision=None).half().to(device)
34
-
35
- pipe_depth = StableDiffusionXLControlNetImg2ImgPipeline(
36
- vae=vae,
37
- controlnet = controlnet_depth,
38
- text_encoder=text_encoder,
39
- tokenizer=tokenizer,
40
- unet=unet,
41
- scheduler=scheduler,
42
- force_zeros_for_empty_prompt=False
 
 
43
  )
44
 
45
- pipe_canny = StableDiffusionXLControlNetImg2ImgPipeline(
46
- vae=vae,
47
- controlnet = controlnet_canny,
48
- text_encoder=text_encoder,
49
- tokenizer=tokenizer,
50
- unet=unet,
51
- scheduler=scheduler,
52
- force_zeros_for_empty_prompt=False
53
- )
54
 
55
- @spaces.GPU
56
- def process_canny_condition(image, canny_threods=[100,200]):
57
- np_image = image.copy()
58
- np_image = cv2.Canny(np_image, canny_threods[0], canny_threods[1])
59
- np_image = np_image[:, :, None]
60
- np_image = np.concatenate([np_image, np_image, np_image], axis=2)
61
- np_image = HWC3(np_image)
62
- return Image.fromarray(np_image)
63
 
64
- model_midas = MidasDetector()
 
 
 
 
65
 
66
- @spaces.GPU
67
- def process_depth_condition_midas(img, res = 1024):
68
- h,w,_ = img.shape
69
- img = resize_image(HWC3(img), res)
70
- result = HWC3(model_midas(img))
71
- result = cv2.resize(result, (w,h))
72
- return Image.fromarray(result)
 
 
 
 
 
 
 
73
 
74
  MAX_SEED = np.iinfo(np.int32).max
75
  MAX_IMAGE_SIZE = 1024
76
 
77
  @spaces.GPU
78
- def infer_depth(prompt,
79
  image = None,
80
  negative_prompt = "nsfw,脸部阴影,低分辨率,jpeg伪影、模糊、糟糕,黑脸,霓虹灯",
81
- seed = 397886929,
82
  randomize_seed = False,
83
  guidance_scale = 6.0,
84
- num_inference_steps = 50,
85
- controlnet_conditioning_scale = 0.7,
86
- control_guidance_end = 0.9,
87
- strength = 1.0
88
  ):
89
  if randomize_seed:
90
  seed = random.randint(0, MAX_SEED)
91
  generator = torch.Generator().manual_seed(seed)
92
- init_image = resize_image(image, MAX_IMAGE_SIZE)
93
- pipe = pipe_depth.to("cuda")
94
- condi_img = process_depth_condition_midas( np.array(init_image), MAX_IMAGE_SIZE)
95
- image = pipe(
96
- prompt= prompt ,
97
- image = init_image,
98
- controlnet_conditioning_scale = controlnet_conditioning_scale,
99
- control_guidance_end = control_guidance_end,
100
- strength= strength ,
101
- control_image = condi_img,
102
- negative_prompt= negative_prompt ,
103
- num_inference_steps= num_inference_steps,
104
- guidance_scale= guidance_scale,
105
- num_images_per_prompt=1,
106
- generator=generator,
107
- ).images[0]
108
- return [condi_img, image], seed
109
 
110
- @spaces.GPU
111
- def infer_canny(prompt,
112
- image = None,
113
- negative_prompt = "nsfw,脸部阴影,低分辨率,jpeg伪影、模糊、糟糕,黑脸,霓虹灯",
114
- seed = 397886929,
115
- randomize_seed = False,
116
- guidance_scale = 6.0,
117
- num_inference_steps = 50,
118
- controlnet_conditioning_scale = 0.7,
119
- control_guidance_end = 0.9,
120
- strength = 1.0
121
- ):
122
- if randomize_seed:
123
- seed = random.randint(0, MAX_SEED)
124
- generator = torch.Generator().manual_seed(seed)
125
- init_image = resize_image(image, MAX_IMAGE_SIZE)
126
- pipe = pipe_canny.to("cuda")
127
- condi_img = process_canny_condition(np.array(init_image))
128
  image = pipe(
129
- prompt= prompt ,
130
- image = init_image,
131
- controlnet_conditioning_scale = controlnet_conditioning_scale,
132
- control_guidance_end = control_guidance_end,
133
- strength= strength ,
134
- control_image = condi_img,
135
- negative_prompt= negative_prompt ,
136
  num_inference_steps= num_inference_steps,
137
- guidance_scale= guidance_scale,
138
- num_images_per_prompt=1,
139
- generator=generator,
 
 
140
  ).images[0]
141
- return [condi_img, image], seed
142
 
143
- canny_examples = [
144
- ["一个漂亮的女孩,高品质,超清晰,色彩鲜艳,超高分辨率,最佳品质,8k,高清,4K",
145
- "image/woman_1.png"],
146
- ["全景,一只可爱的白色小狗坐在杯子里,看向镜头,动漫风格,3d渲染,辛烷值渲染",
147
- "image/dog.png"]
148
- ]
149
 
150
- depth_examples = [
151
- ["新海诚风格,丰富的色彩,穿着绿色衬衫的女人站在田野里,唯美风景,清新明亮,斑驳的光影,最好的质量,超细节,8K画质",
152
- "image/woman_2.png"],
153
- ["一只颜色鲜艳的小鸟,高品质,超清晰,色彩鲜艳,超高分辨率,最佳品质,8k,高清,4K",
154
- "image/bird.png"]
155
  ]
156
 
 
157
  css="""
158
  #col-left {
159
  margin: 0 auto;
@@ -190,7 +158,6 @@ with gr.Blocks(css=css) as Kolors:
190
  label="Negative prompt",
191
  placeholder="Enter a negative prompt",
192
  visible=True,
193
- value="nsfw,脸部阴影,低分辨率,jpeg伪影、模糊、糟糕,黑脸,霓虹灯"
194
  )
195
  seed = gr.Slider(
196
  label="Seed",
@@ -206,73 +173,35 @@ with gr.Blocks(css=css) as Kolors:
206
  minimum=0.0,
207
  maximum=10.0,
208
  step=0.1,
209
- value=6.0,
210
  )
211
  num_inference_steps = gr.Slider(
212
  label="Number of inference steps",
213
  minimum=10,
214
  maximum=50,
215
  step=1,
216
- value=30,
217
- )
218
- with gr.Row():
219
- controlnet_conditioning_scale = gr.Slider(
220
- label="Controlnet Conditioning Scale",
221
- minimum=0.0,
222
- maximum=1.0,
223
- step=0.1,
224
- value=0.7,
225
- )
226
- control_guidance_end = gr.Slider(
227
- label="Control Guidance End",
228
- minimum=0.0,
229
- maximum=1.0,
230
- step=0.1,
231
- value=0.9,
232
- )
233
- with gr.Row():
234
- strength = gr.Slider(
235
- label="Strength",
236
- minimum=0.0,
237
- maximum=1.0,
238
- step=0.1,
239
- value=1.0,
240
  )
241
  with gr.Row():
242
- canny_button = gr.Button("Canny", elem_id="button")
243
- depth_button = gr.Button("Depth", elem_id="button")
244
 
245
  with gr.Column(elem_id="col-right"):
246
- result = gr.Gallery(label="Result", show_label=False, columns=2)
247
  seed_used = gr.Number(label="Seed Used")
248
 
249
  with gr.Row():
250
  gr.Examples(
251
- fn = infer_canny,
252
- examples = canny_examples,
253
  inputs = [prompt, image],
254
  outputs = [result, seed_used],
255
- label = "Canny"
256
- )
257
- with gr.Row():
258
- gr.Examples(
259
- fn = infer_depth,
260
- examples = depth_examples,
261
- inputs = [prompt, image],
262
- outputs = [result, seed_used],
263
- label = "Depth"
264
  )
265
 
266
- canny_button.click(
267
- fn = infer_canny,
268
- inputs = [prompt, image, negative_prompt, seed, randomize_seed, guidance_scale, num_inference_steps, controlnet_conditioning_scale, control_guidance_end, strength],
269
  outputs = [result, seed_used]
270
  )
271
 
272
- depth_button.click(
273
- fn = infer_depth,
274
- inputs = [prompt, image, negative_prompt, seed, randomize_seed, guidance_scale, num_inference_steps, controlnet_conditioning_scale, control_guidance_end, strength],
275
- outputs = [result, seed_used]
276
- )
277
 
278
  Kolors.queue().launch(debug=True)
 
2
  import random
3
  import torch
4
  import cv2
5
+ import insightface
6
  import gradio as gr
7
  import numpy as np
8
  from huggingface_hub import snapshot_download
9
  from transformers import CLIPVisionModelWithProjection,CLIPImageProcessor
10
+ from kolors.pipelines.pipeline_stable_diffusion_xl_chatglm_256_ipadapter_FaceID import StableDiffusionXLPipeline
 
11
  from kolors.models.modeling_chatglm import ChatGLMModel
12
  from kolors.models.tokenization_chatglm import ChatGLMTokenizer
13
+ from diffusers import AutoencoderKL
 
14
  from kolors.models.unet_2d_condition import UNet2DConditionModel
15
  from diffusers import EulerDiscreteScheduler
16
  from PIL import Image
17
+ from insightface.app import FaceAnalysis
18
+ from insightface.data import get_image as ins_get_image
19
 
20
 
21
  device = "cuda"
22
  ckpt_dir = snapshot_download(repo_id="Kwai-Kolors/Kolors")
23
+ ckpt_dir_faceid = snapshot_download(repo_id="Kwai-Kolors/Kolors-IP-Adapter-FaceID-Plus")
 
24
 
25
  text_encoder = ChatGLMModel.from_pretrained(f'{ckpt_dir}/text_encoder', torch_dtype=torch.float16).half().to(device)
26
  tokenizer = ChatGLMTokenizer.from_pretrained(f'{ckpt_dir}/text_encoder')
27
  vae = AutoencoderKL.from_pretrained(f"{ckpt_dir}/vae", revision=None).half().to(device)
28
  scheduler = EulerDiscreteScheduler.from_pretrained(f"{ckpt_dir}/scheduler")
29
  unet = UNet2DConditionModel.from_pretrained(f"{ckpt_dir}/unet", revision=None).half().to(device)
30
+ clip_image_encoder = CLIPVisionModelWithProjection.from_pretrained(f'{ckpt_dir_faceid}/clip-vit-large-patch14-336', ignore_mismatched_sizes=True)
31
+ clip_image_encoder.to(device)
32
+ clip_image_processor = CLIPImageProcessor(size = 336, crop_size = 336)
33
+
34
+ pipe = StableDiffusionXLPipeline(
35
+ vae = vae,
36
+ text_encoder = text_encoder,
37
+ tokenizer = tokenizer,
38
+ unet = unet,
39
+ scheduler = scheduler,
40
+ face_clip_encoder = clip_image_encoder,
41
+ face_clip_processor = clip_image_processor,
42
+ force_zeros_for_empty_prompt = False,
43
  )
44
 
45
+ class FaceInfoGenerator():
46
+ def __init__(self, root_dir = "./"):
47
+ self.app = FaceAnalysis(name = 'antelopev2', root = root_dir, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
48
+ self.app.prepare(ctx_id = 0, det_size = (640, 640))
 
 
 
 
 
49
 
50
+ def get_faceinfo_one_img(self, face_image):
51
+ face_info = self.app.get(cv2.cvtColor(np.array(face_image), cv2.COLOR_RGB2BGR))
 
 
 
 
 
 
52
 
53
+ if len(face_info) == 0:
54
+ face_info = None
55
+ else:
56
+ face_info = sorted(face_info, key=lambda x:(x['bbox'][2]-x['bbox'][0])*(x['bbox'][3]-x['bbox'][1]))[-1] # only use the maximum face
57
+ return face_info
58
 
59
+ def face_bbox_to_square(bbox):
60
+ ## l, t, r, b to square l, t, r, b
61
+ l,t,r,b = bbox
62
+ cent_x = (l + r) / 2
63
+ cent_y = (t + b) / 2
64
+ w, h = r - l, b - t
65
+ r = max(w, h) / 2
66
+
67
+ l0 = cent_x - r
68
+ r0 = cent_x + r
69
+ t0 = cent_y - r
70
+ b0 = cent_y + r
71
+
72
+ return [l0, t0, r0, b0]
73
 
74
  MAX_SEED = np.iinfo(np.int32).max
75
  MAX_IMAGE_SIZE = 1024
76
 
77
  @spaces.GPU
78
+ def infer(prompt,
79
  image = None,
80
  negative_prompt = "nsfw,脸部阴影,低分辨率,jpeg伪影、模糊、糟糕,黑脸,霓虹灯",
81
+ seed = 66,
82
  randomize_seed = False,
83
  guidance_scale = 6.0,
84
+ num_inference_steps = 50
 
 
 
85
  ):
86
  if randomize_seed:
87
  seed = random.randint(0, MAX_SEED)
88
  generator = torch.Generator().manual_seed(seed)
89
+ pipe = pipe.to(device)
90
+ pipe.load_ip_adapter_faceid_plus(f'{ckpt_dir_faceid}/ipa-faceid-plus.bin', device = device)
91
+ scale = 0.8
92
+ pipe.set_face_fidelity_scale(scale)
93
+
94
+ face_info_generator = FaceInfoGenerator(root_dir = "./")
95
+ face_info = face_info_generator.get_faceinfo_one_img(image)
96
+ face_bbox_square = face_bbox_to_square(face_info["bbox"])
97
+ crop_image = image.crop(face_bbox_square)
98
+ crop_image = crop_image.resize((336, 336))
99
+ crop_image = [crop_image]
100
+ face_embeds = torch.from_numpy(np.array([face_info["embedding"]]))
101
+ face_embeds = face_embeds.to(device, dtype = torch.float16)
 
 
 
 
102
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  image = pipe(
104
+ prompt = prompt,
105
+ negative_prompt = negative_prompt,
106
+ height = 1024,
107
+ width = 1024,
 
 
 
108
  num_inference_steps= num_inference_steps,
109
+ guidance_scale = guidance_scale,
110
+ num_images_per_prompt = 1,
111
+ generator = generator,
112
+ face_crop_image = crop_image,
113
+ face_insightface_embeds = face_embeds
114
  ).images[0]
 
115
 
116
+ return image, seed
117
+
 
 
 
 
118
 
119
+ examples = [
120
+ ["穿着晚礼服,在星光下的晚宴场景中,烛光闪闪,整个场景洋溢着浪漫而奢华的氛围", "image/image1.png"],
121
+ ["西部牛仔,牛仔帽,荒野大镖客,背景是西部小镇,仙人掌,,日落余晖, 暖色调, 使用XT4胶片拍摄, 噪点, 晕影, 柯达胶卷,复古", "image/image2.png"]
 
 
122
  ]
123
 
124
+
125
  css="""
126
  #col-left {
127
  margin: 0 auto;
 
158
  label="Negative prompt",
159
  placeholder="Enter a negative prompt",
160
  visible=True,
 
161
  )
162
  seed = gr.Slider(
163
  label="Seed",
 
173
  minimum=0.0,
174
  maximum=10.0,
175
  step=0.1,
176
+ value=5.0,
177
  )
178
  num_inference_steps = gr.Slider(
179
  label="Number of inference steps",
180
  minimum=10,
181
  maximum=50,
182
  step=1,
183
+ value=25,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  )
185
  with gr.Row():
186
+ button = gr.Button("Run", elem_id="button")
 
187
 
188
  with gr.Column(elem_id="col-right"):
189
+ result = gr.Image(label="Result", show_label=False)
190
  seed_used = gr.Number(label="Seed Used")
191
 
192
  with gr.Row():
193
  gr.Examples(
194
+ fn = infer,
195
+ examples = examples,
196
  inputs = [prompt, image],
197
  outputs = [result, seed_used],
 
 
 
 
 
 
 
 
 
198
  )
199
 
200
+ button.click(
201
+ fn = infer,
202
+ inputs = [prompt, image, negative_prompt, seed, randomize_seed, guidance_scale, num_inference_steps],
203
  outputs = [result, seed_used]
204
  )
205
 
 
 
 
 
 
206
 
207
  Kolors.queue().launch(debug=True)
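
The new app.py above chains InsightFace detection, a square face crop, and the face embedding into the Kolors IP-Adapter-FaceID pipeline. Below is a minimal sketch (not part of the commit) of that same flow outside Gradio; it assumes `pipe` and `ckpt_dir_faceid` are already set up as in app.py above, that the antelopev2 detector weights are available under ./models, and it uses one of the bundled example images.

```python
# Hedged sketch: exercising the FaceID preprocessing from the new app.py on its own.
# Assumes `pipe` and `ckpt_dir_faceid` exist as defined in app.py above.
import cv2
import numpy as np
import torch
from PIL import Image
from insightface.app import FaceAnalysis

# Same detector setup as FaceInfoGenerator in app.py.
app = FaceAnalysis(name='antelopev2', root='./',
                   providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
app.prepare(ctx_id=0, det_size=(640, 640))

image = Image.open('image/image1.png').convert('RGB')
faces = app.get(cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR))
assert faces, "no face detected"
# Keep the largest detected face, as get_faceinfo_one_img does.
face = sorted(faces, key=lambda f: (f['bbox'][2] - f['bbox'][0]) * (f['bbox'][3] - f['bbox'][1]))[-1]

# Square crop around the bbox center (same geometry as face_bbox_to_square),
# resized to the 336x336 input expected by the CLIP image processor.
l, t, r, b = face['bbox']
cx, cy, half = (l + r) / 2, (t + b) / 2, max(r - l, b - t) / 2
face_crop = image.crop((cx - half, cy - half, cx + half, cy + half)).resize((336, 336))

# Raw InsightFace identity embedding, as in infer().
face_embeds = torch.from_numpy(np.array([face['embedding']])).to('cuda', dtype=torch.float16)

pipe.load_ip_adapter_faceid_plus(f'{ckpt_dir_faceid}/ipa-faceid-plus.bin', device='cuda')
pipe.set_face_fidelity_scale(0.8)
result = pipe(
    prompt='a portrait photo, best quality',  # placeholder prompt
    height=1024, width=1024,
    num_inference_steps=25, guidance_scale=5.0,
    num_images_per_prompt=1,
    generator=torch.Generator().manual_seed(66),
    face_crop_image=[face_crop],
    face_insightface_embeds=face_embeds,
).images[0]
result.save('result.png')
```

The square crop keeps the 336x336 CLIP input centered on the face, while the raw InsightFace embedding carries the identity signal to the adapter.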
assets/title.md CHANGED
@@ -1,10 +1,10 @@
1
  <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
2
  <div>
3
- <h1>Kolors-Controlnet</h1>
4
- <span>Two ControlNet based on Kolors-Basemodel: Canny and Depth</span>
5
  <br>
6
  <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
7
- <a href="https://github.com/Kwai-Kolors/Kolors/tree/master/controlnet"><img src="https://img.shields.io/static/v1?label=Kolors Code&message=Github&color=blue&logo=github-pages"></a> &ensp;
8
  <a href="https://kwai-kolors.github.io/"><img src="https://img.shields.io/static/v1?label=Team%20Page&message=Page&color=green"></a> &ensp;
9
  <a href="https://github.com/Kwai-Kolors/Kolors/blob/master/imgs/Kolors_paper.pdf"><img src="https://img.shields.io/static/v1?label=Tech Report&message=Kolors&color=red"></a> &ensp;
10
  <a href="https://klingai.kuaishou.com/"><img src="https://img.shields.io/static/v1?label=Official Website&message=Page&color=green"></a>
 
1
  <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
2
  <div>
3
+ <h1>Kolors-FaceID</h1>
4
+ <span>Kolors-IP-Adapter-FaceID-Plus based on Kolors-Basemodel.</span>
5
  <br>
6
  <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
7
+ <a href="https://github.com/Kwai-Kolors/Kolors/tree/master/ipadapter_FaceID"><img src="https://img.shields.io/static/v1?label=Kolors Code&message=Github&color=blue&logo=github-pages"></a> &ensp;
8
  <a href="https://kwai-kolors.github.io/"><img src="https://img.shields.io/static/v1?label=Team%20Page&message=Page&color=green"></a> &ensp;
9
  <a href="https://github.com/Kwai-Kolors/Kolors/blob/master/imgs/Kolors_paper.pdf"><img src="https://img.shields.io/static/v1?label=Tech Report&message=Kolors&color=red"></a> &ensp;
10
  <a href="https://klingai.kuaishou.com/"><img src="https://img.shields.io/static/v1?label=Official Website&message=Page&color=green"></a>
basicsr/__init__.py DELETED
@@ -1,11 +0,0 @@
1
- # https://github.com/xinntao/BasicSR
2
- # flake8: noqa
3
- from .archs import *
4
- from .data import *
5
- from .losses import *
6
- from .metrics import *
7
- from .models import *
8
- from .ops import *
9
- from .test import *
10
- from .train import *
11
- from .utils import *
basicsr/archs/__init__.py DELETED
@@ -1,24 +0,0 @@
1
- import importlib
2
- from copy import deepcopy
3
- from os import path as osp
4
-
5
- from basicsr.utils import get_root_logger, scandir
6
- from basicsr.utils.registry import ARCH_REGISTRY
7
-
8
- __all__ = ['build_network']
9
-
10
- # automatically scan and import arch modules for registry
11
- # scan all the files under the 'archs' folder and collect files ending with '_arch.py'
12
- arch_folder = osp.dirname(osp.abspath(__file__))
13
- arch_filenames = [osp.splitext(osp.basename(v))[0] for v in scandir(arch_folder) if v.endswith('_arch.py')]
14
- # import all the arch modules
15
- _arch_modules = [importlib.import_module(f'basicsr.archs.{file_name}') for file_name in arch_filenames]
16
-
17
-
18
- def build_network(opt):
19
- opt = deepcopy(opt)
20
- network_type = opt.pop('type')
21
- net = ARCH_REGISTRY.get(network_type)(**opt)
22
- logger = get_root_logger()
23
- logger.info(f'Network [{net.__class__.__name__}] is created.')
24
- return net
basicsr/archs/arch_util.py DELETED
@@ -1,313 +0,0 @@
1
- import collections.abc
2
- import math
3
- import torch
4
- import torchvision
5
- import warnings
6
- from distutils.version import LooseVersion
7
- from itertools import repeat
8
- from torch import nn as nn
9
- from torch.nn import functional as F
10
- from torch.nn import init as init
11
- from torch.nn.modules.batchnorm import _BatchNorm
12
-
13
- from basicsr.ops.dcn import ModulatedDeformConvPack, modulated_deform_conv
14
- from basicsr.utils import get_root_logger
15
-
16
-
17
- @torch.no_grad()
18
- def default_init_weights(module_list, scale=1, bias_fill=0, **kwargs):
19
- """Initialize network weights.
20
-
21
- Args:
22
- module_list (list[nn.Module] | nn.Module): Modules to be initialized.
23
- scale (float): Scale initialized weights, especially for residual
24
- blocks. Default: 1.
25
- bias_fill (float): The value to fill bias. Default: 0
26
- kwargs (dict): Other arguments for initialization function.
27
- """
28
- if not isinstance(module_list, list):
29
- module_list = [module_list]
30
- for module in module_list:
31
- for m in module.modules():
32
- if isinstance(m, nn.Conv2d):
33
- init.kaiming_normal_(m.weight, **kwargs)
34
- m.weight.data *= scale
35
- if m.bias is not None:
36
- m.bias.data.fill_(bias_fill)
37
- elif isinstance(m, nn.Linear):
38
- init.kaiming_normal_(m.weight, **kwargs)
39
- m.weight.data *= scale
40
- if m.bias is not None:
41
- m.bias.data.fill_(bias_fill)
42
- elif isinstance(m, _BatchNorm):
43
- init.constant_(m.weight, 1)
44
- if m.bias is not None:
45
- m.bias.data.fill_(bias_fill)
46
-
47
-
48
- def make_layer(basic_block, num_basic_block, **kwarg):
49
- """Make layers by stacking the same blocks.
50
-
51
- Args:
52
- basic_block (nn.module): nn.module class for basic block.
53
- num_basic_block (int): number of blocks.
54
-
55
- Returns:
56
- nn.Sequential: Stacked blocks in nn.Sequential.
57
- """
58
- layers = []
59
- for _ in range(num_basic_block):
60
- layers.append(basic_block(**kwarg))
61
- return nn.Sequential(*layers)
62
-
63
-
64
- class ResidualBlockNoBN(nn.Module):
65
- """Residual block without BN.
66
-
67
- Args:
68
- num_feat (int): Channel number of intermediate features.
69
- Default: 64.
70
- res_scale (float): Residual scale. Default: 1.
71
- pytorch_init (bool): If set to True, use pytorch default init,
72
- otherwise, use default_init_weights. Default: False.
73
- """
74
-
75
- def __init__(self, num_feat=64, res_scale=1, pytorch_init=False):
76
- super(ResidualBlockNoBN, self).__init__()
77
- self.res_scale = res_scale
78
- self.conv1 = nn.Conv2d(num_feat, num_feat, 3, 1, 1, bias=True)
79
- self.conv2 = nn.Conv2d(num_feat, num_feat, 3, 1, 1, bias=True)
80
- self.relu = nn.ReLU(inplace=True)
81
-
82
- if not pytorch_init:
83
- default_init_weights([self.conv1, self.conv2], 0.1)
84
-
85
- def forward(self, x):
86
- identity = x
87
- out = self.conv2(self.relu(self.conv1(x)))
88
- return identity + out * self.res_scale
89
-
90
-
91
- class Upsample(nn.Sequential):
92
- """Upsample module.
93
-
94
- Args:
95
- scale (int): Scale factor. Supported scales: 2^n and 3.
96
- num_feat (int): Channel number of intermediate features.
97
- """
98
-
99
- def __init__(self, scale, num_feat):
100
- m = []
101
- if (scale & (scale - 1)) == 0: # scale = 2^n
102
- for _ in range(int(math.log(scale, 2))):
103
- m.append(nn.Conv2d(num_feat, 4 * num_feat, 3, 1, 1))
104
- m.append(nn.PixelShuffle(2))
105
- elif scale == 3:
106
- m.append(nn.Conv2d(num_feat, 9 * num_feat, 3, 1, 1))
107
- m.append(nn.PixelShuffle(3))
108
- else:
109
- raise ValueError(f'scale {scale} is not supported. Supported scales: 2^n and 3.')
110
- super(Upsample, self).__init__(*m)
111
-
112
-
113
- def flow_warp(x, flow, interp_mode='bilinear', padding_mode='zeros', align_corners=True):
114
- """Warp an image or feature map with optical flow.
115
-
116
- Args:
117
- x (Tensor): Tensor with size (n, c, h, w).
118
- flow (Tensor): Tensor with size (n, h, w, 2), normal value.
119
- interp_mode (str): 'nearest' or 'bilinear'. Default: 'bilinear'.
120
- padding_mode (str): 'zeros' or 'border' or 'reflection'.
121
- Default: 'zeros'.
122
- align_corners (bool): Before pytorch 1.3, the default value is
123
- align_corners=True. After pytorch 1.3, the default value is
124
- align_corners=False. Here, we use the True as default.
125
-
126
- Returns:
127
- Tensor: Warped image or feature map.
128
- """
129
- assert x.size()[-2:] == flow.size()[1:3]
130
- _, _, h, w = x.size()
131
- # create mesh grid
132
- grid_y, grid_x = torch.meshgrid(torch.arange(0, h).type_as(x), torch.arange(0, w).type_as(x))
133
- grid = torch.stack((grid_x, grid_y), 2).float() # W(x), H(y), 2
134
- grid.requires_grad = False
135
-
136
- vgrid = grid + flow
137
- # scale grid to [-1,1]
138
- vgrid_x = 2.0 * vgrid[:, :, :, 0] / max(w - 1, 1) - 1.0
139
- vgrid_y = 2.0 * vgrid[:, :, :, 1] / max(h - 1, 1) - 1.0
140
- vgrid_scaled = torch.stack((vgrid_x, vgrid_y), dim=3)
141
- output = F.grid_sample(x, vgrid_scaled, mode=interp_mode, padding_mode=padding_mode, align_corners=align_corners)
142
-
143
- # TODO, what if align_corners=False
144
- return output
145
-
146
-
147
- def resize_flow(flow, size_type, sizes, interp_mode='bilinear', align_corners=False):
148
- """Resize a flow according to ratio or shape.
149
-
150
- Args:
151
- flow (Tensor): Precomputed flow. shape [N, 2, H, W].
152
- size_type (str): 'ratio' or 'shape'.
153
- sizes (list[int | float]): the ratio for resizing or the final output
154
- shape.
155
- 1) The order of ratio should be [ratio_h, ratio_w]. For
156
- downsampling, the ratio should be smaller than 1.0 (i.e., ratio
157
- < 1.0). For upsampling, the ratio should be larger than 1.0 (i.e.,
158
- ratio > 1.0).
159
- 2) The order of output_size should be [out_h, out_w].
160
- interp_mode (str): The mode of interpolation for resizing.
161
- Default: 'bilinear'.
162
- align_corners (bool): Whether align corners. Default: False.
163
-
164
- Returns:
165
- Tensor: Resized flow.
166
- """
167
- _, _, flow_h, flow_w = flow.size()
168
- if size_type == 'ratio':
169
- output_h, output_w = int(flow_h * sizes[0]), int(flow_w * sizes[1])
170
- elif size_type == 'shape':
171
- output_h, output_w = sizes[0], sizes[1]
172
- else:
173
- raise ValueError(f'Size type should be ratio or shape, but got type {size_type}.')
174
-
175
- input_flow = flow.clone()
176
- ratio_h = output_h / flow_h
177
- ratio_w = output_w / flow_w
178
- input_flow[:, 0, :, :] *= ratio_w
179
- input_flow[:, 1, :, :] *= ratio_h
180
- resized_flow = F.interpolate(
181
- input=input_flow, size=(output_h, output_w), mode=interp_mode, align_corners=align_corners)
182
- return resized_flow
183
-
184
-
185
- # TODO: may write a cpp file
186
- def pixel_unshuffle(x, scale):
187
- """ Pixel unshuffle.
188
-
189
- Args:
190
- x (Tensor): Input feature with shape (b, c, hh, hw).
191
- scale (int): Downsample ratio.
192
-
193
- Returns:
194
- Tensor: the pixel unshuffled feature.
195
- """
196
- b, c, hh, hw = x.size()
197
- out_channel = c * (scale**2)
198
- assert hh % scale == 0 and hw % scale == 0
199
- h = hh // scale
200
- w = hw // scale
201
- x_view = x.view(b, c, h, scale, w, scale)
202
- return x_view.permute(0, 1, 3, 5, 2, 4).reshape(b, out_channel, h, w)
203
-
204
-
205
- class DCNv2Pack(ModulatedDeformConvPack):
206
- """Modulated deformable conv for deformable alignment.
207
-
208
- Different from the official DCNv2Pack, which generates offsets and masks
209
- from the preceding features, this DCNv2Pack takes another different
210
- features to generate offsets and masks.
211
-
212
- ``Paper: Delving Deep into Deformable Alignment in Video Super-Resolution``
213
- """
214
-
215
- def forward(self, x, feat):
216
- out = self.conv_offset(feat)
217
- o1, o2, mask = torch.chunk(out, 3, dim=1)
218
- offset = torch.cat((o1, o2), dim=1)
219
- mask = torch.sigmoid(mask)
220
-
221
- offset_absmean = torch.mean(torch.abs(offset))
222
- if offset_absmean > 50:
223
- logger = get_root_logger()
224
- logger.warning(f'Offset abs mean is {offset_absmean}, larger than 50.')
225
-
226
- if LooseVersion(torchvision.__version__) >= LooseVersion('0.9.0'):
227
- return torchvision.ops.deform_conv2d(x, offset, self.weight, self.bias, self.stride, self.padding,
228
- self.dilation, mask)
229
- else:
230
- return modulated_deform_conv(x, offset, mask, self.weight, self.bias, self.stride, self.padding,
231
- self.dilation, self.groups, self.deformable_groups)
232
-
233
-
234
- def _no_grad_trunc_normal_(tensor, mean, std, a, b):
235
- # From: https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/weight_init.py
236
- # Cut & paste from PyTorch official master until it's in a few official releases - RW
237
- # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
238
- def norm_cdf(x):
239
- # Computes standard normal cumulative distribution function
240
- return (1. + math.erf(x / math.sqrt(2.))) / 2.
241
-
242
- if (mean < a - 2 * std) or (mean > b + 2 * std):
243
- warnings.warn(
244
- 'mean is more than 2 std from [a, b] in nn.init.trunc_normal_. '
245
- 'The distribution of values may be incorrect.',
246
- stacklevel=2)
247
-
248
- with torch.no_grad():
249
- # Values are generated by using a truncated uniform distribution and
250
- # then using the inverse CDF for the normal distribution.
251
- # Get upper and lower cdf values
252
- low = norm_cdf((a - mean) / std)
253
- up = norm_cdf((b - mean) / std)
254
-
255
- # Uniformly fill tensor with values from [low, up], then translate to
256
- # [2l-1, 2u-1].
257
- tensor.uniform_(2 * low - 1, 2 * up - 1)
258
-
259
- # Use inverse cdf transform for normal distribution to get truncated
260
- # standard normal
261
- tensor.erfinv_()
262
-
263
- # Transform to proper mean, std
264
- tensor.mul_(std * math.sqrt(2.))
265
- tensor.add_(mean)
266
-
267
- # Clamp to ensure it's in the proper range
268
- tensor.clamp_(min=a, max=b)
269
- return tensor
270
-
271
-
272
- def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
273
- r"""Fills the input Tensor with values drawn from a truncated
274
- normal distribution.
275
-
276
- From: https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/weight_init.py
277
-
278
- The values are effectively drawn from the
279
- normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
280
- with values outside :math:`[a, b]` redrawn until they are within
281
- the bounds. The method used for generating the random values works
282
- best when :math:`a \leq \text{mean} \leq b`.
283
-
284
- Args:
285
- tensor: an n-dimensional `torch.Tensor`
286
- mean: the mean of the normal distribution
287
- std: the standard deviation of the normal distribution
288
- a: the minimum cutoff value
289
- b: the maximum cutoff value
290
-
291
- Examples:
292
- >>> w = torch.empty(3, 5)
293
- >>> nn.init.trunc_normal_(w)
294
- """
295
- return _no_grad_trunc_normal_(tensor, mean, std, a, b)
296
-
297
-
298
- # From PyTorch
299
- def _ntuple(n):
300
-
301
- def parse(x):
302
- if isinstance(x, collections.abc.Iterable):
303
- return x
304
- return tuple(repeat(x, n))
305
-
306
- return parse
307
-
308
-
309
- to_1tuple = _ntuple(1)
310
- to_2tuple = _ntuple(2)
311
- to_3tuple = _ntuple(3)
312
- to_4tuple = _ntuple(4)
313
- to_ntuple = _ntuple
basicsr/archs/basicvsr_arch.py DELETED
@@ -1,336 +0,0 @@
1
- import torch
2
- from torch import nn as nn
3
- from torch.nn import functional as F
4
-
5
- from basicsr.utils.registry import ARCH_REGISTRY
6
- from .arch_util import ResidualBlockNoBN, flow_warp, make_layer
7
- from .edvr_arch import PCDAlignment, TSAFusion
8
- from .spynet_arch import SpyNet
9
-
10
-
11
- @ARCH_REGISTRY.register()
12
- class BasicVSR(nn.Module):
13
- """A recurrent network for video SR. Now only x4 is supported.
14
-
15
- Args:
16
- num_feat (int): Number of channels. Default: 64.
17
- num_block (int): Number of residual blocks for each branch. Default: 15
18
- spynet_path (str): Path to the pretrained weights of SPyNet. Default: None.
19
- """
20
-
21
- def __init__(self, num_feat=64, num_block=15, spynet_path=None):
22
- super().__init__()
23
- self.num_feat = num_feat
24
-
25
- # alignment
26
- self.spynet = SpyNet(spynet_path)
27
-
28
- # propagation
29
- self.backward_trunk = ConvResidualBlocks(num_feat + 3, num_feat, num_block)
30
- self.forward_trunk = ConvResidualBlocks(num_feat + 3, num_feat, num_block)
31
-
32
- # reconstruction
33
- self.fusion = nn.Conv2d(num_feat * 2, num_feat, 1, 1, 0, bias=True)
34
- self.upconv1 = nn.Conv2d(num_feat, num_feat * 4, 3, 1, 1, bias=True)
35
- self.upconv2 = nn.Conv2d(num_feat, 64 * 4, 3, 1, 1, bias=True)
36
- self.conv_hr = nn.Conv2d(64, 64, 3, 1, 1)
37
- self.conv_last = nn.Conv2d(64, 3, 3, 1, 1)
38
-
39
- self.pixel_shuffle = nn.PixelShuffle(2)
40
-
41
- # activation functions
42
- self.lrelu = nn.LeakyReLU(negative_slope=0.1, inplace=True)
43
-
44
- def get_flow(self, x):
45
- b, n, c, h, w = x.size()
46
-
47
- x_1 = x[:, :-1, :, :, :].reshape(-1, c, h, w)
48
- x_2 = x[:, 1:, :, :, :].reshape(-1, c, h, w)
49
-
50
- flows_backward = self.spynet(x_1, x_2).view(b, n - 1, 2, h, w)
51
- flows_forward = self.spynet(x_2, x_1).view(b, n - 1, 2, h, w)
52
-
53
- return flows_forward, flows_backward
54
-
55
- def forward(self, x):
56
- """Forward function of BasicVSR.
57
-
58
- Args:
59
- x: Input frames with shape (b, n, c, h, w). n is the temporal dimension / number of frames.
60
- """
61
- flows_forward, flows_backward = self.get_flow(x)
62
- b, n, _, h, w = x.size()
63
-
64
- # backward branch
65
- out_l = []
66
- feat_prop = x.new_zeros(b, self.num_feat, h, w)
67
- for i in range(n - 1, -1, -1):
68
- x_i = x[:, i, :, :, :]
69
- if i < n - 1:
70
- flow = flows_backward[:, i, :, :, :]
71
- feat_prop = flow_warp(feat_prop, flow.permute(0, 2, 3, 1))
72
- feat_prop = torch.cat([x_i, feat_prop], dim=1)
73
- feat_prop = self.backward_trunk(feat_prop)
74
- out_l.insert(0, feat_prop)
75
-
76
- # forward branch
77
- feat_prop = torch.zeros_like(feat_prop)
78
- for i in range(0, n):
79
- x_i = x[:, i, :, :, :]
80
- if i > 0:
81
- flow = flows_forward[:, i - 1, :, :, :]
82
- feat_prop = flow_warp(feat_prop, flow.permute(0, 2, 3, 1))
83
-
84
- feat_prop = torch.cat([x_i, feat_prop], dim=1)
85
- feat_prop = self.forward_trunk(feat_prop)
86
-
87
- # upsample
88
- out = torch.cat([out_l[i], feat_prop], dim=1)
89
- out = self.lrelu(self.fusion(out))
90
- out = self.lrelu(self.pixel_shuffle(self.upconv1(out)))
91
- out = self.lrelu(self.pixel_shuffle(self.upconv2(out)))
92
- out = self.lrelu(self.conv_hr(out))
93
- out = self.conv_last(out)
94
- base = F.interpolate(x_i, scale_factor=4, mode='bilinear', align_corners=False)
95
- out += base
96
- out_l[i] = out
97
-
98
- return torch.stack(out_l, dim=1)
99
-
100
-
101
- class ConvResidualBlocks(nn.Module):
102
- """Conv and residual block used in BasicVSR.
103
-
104
- Args:
105
- num_in_ch (int): Number of input channels. Default: 3.
106
- num_out_ch (int): Number of output channels. Default: 64.
107
- num_block (int): Number of residual blocks. Default: 15.
108
- """
109
-
110
- def __init__(self, num_in_ch=3, num_out_ch=64, num_block=15):
111
- super().__init__()
112
- self.main = nn.Sequential(
113
- nn.Conv2d(num_in_ch, num_out_ch, 3, 1, 1, bias=True), nn.LeakyReLU(negative_slope=0.1, inplace=True),
114
- make_layer(ResidualBlockNoBN, num_block, num_feat=num_out_ch))
115
-
116
- def forward(self, fea):
117
- return self.main(fea)
118
-
119
-
120
- @ARCH_REGISTRY.register()
121
- class IconVSR(nn.Module):
122
- """IconVSR, proposed also in the BasicVSR paper.
123
-
124
- Args:
125
- num_feat (int): Number of channels. Default: 64.
126
- num_block (int): Number of residual blocks for each branch. Default: 15.
127
- keyframe_stride (int): Keyframe stride. Default: 5.
128
- temporal_padding (int): Temporal padding. Default: 2.
129
- spynet_path (str): Path to the pretrained weights of SPyNet. Default: None.
130
- edvr_path (str): Path to the pretrained EDVR model. Default: None.
131
- """
132
-
133
- def __init__(self,
134
- num_feat=64,
135
- num_block=15,
136
- keyframe_stride=5,
137
- temporal_padding=2,
138
- spynet_path=None,
139
- edvr_path=None):
140
- super().__init__()
141
-
142
- self.num_feat = num_feat
143
- self.temporal_padding = temporal_padding
144
- self.keyframe_stride = keyframe_stride
145
-
146
- # keyframe_branch
147
- self.edvr = EDVRFeatureExtractor(temporal_padding * 2 + 1, num_feat, edvr_path)
148
- # alignment
149
- self.spynet = SpyNet(spynet_path)
150
-
151
- # propagation
152
- self.backward_fusion = nn.Conv2d(2 * num_feat, num_feat, 3, 1, 1, bias=True)
153
- self.backward_trunk = ConvResidualBlocks(num_feat + 3, num_feat, num_block)
154
-
155
- self.forward_fusion = nn.Conv2d(2 * num_feat, num_feat, 3, 1, 1, bias=True)
156
- self.forward_trunk = ConvResidualBlocks(2 * num_feat + 3, num_feat, num_block)
157
-
158
- # reconstruction
159
- self.upconv1 = nn.Conv2d(num_feat, num_feat * 4, 3, 1, 1, bias=True)
160
- self.upconv2 = nn.Conv2d(num_feat, 64 * 4, 3, 1, 1, bias=True)
161
- self.conv_hr = nn.Conv2d(64, 64, 3, 1, 1)
162
- self.conv_last = nn.Conv2d(64, 3, 3, 1, 1)
163
-
164
- self.pixel_shuffle = nn.PixelShuffle(2)
165
-
166
- # activation functions
167
- self.lrelu = nn.LeakyReLU(negative_slope=0.1, inplace=True)
168
-
169
- def pad_spatial(self, x):
170
- """Apply padding spatially.
171
-
172
- Since the PCD module in EDVR requires that the resolution is a multiple
173
- of 4, we apply padding to the input LR images if their resolution is
174
- not divisible by 4.
175
-
176
- Args:
177
- x (Tensor): Input LR sequence with shape (n, t, c, h, w).
178
- Returns:
179
- Tensor: Padded LR sequence with shape (n, t, c, h_pad, w_pad).
180
- """
181
- n, t, c, h, w = x.size()
182
-
183
- pad_h = (4 - h % 4) % 4
184
- pad_w = (4 - w % 4) % 4
185
-
186
- # padding
187
- x = x.view(-1, c, h, w)
188
- x = F.pad(x, [0, pad_w, 0, pad_h], mode='reflect')
189
-
190
- return x.view(n, t, c, h + pad_h, w + pad_w)
191
-
192
- def get_flow(self, x):
193
- b, n, c, h, w = x.size()
194
-
195
- x_1 = x[:, :-1, :, :, :].reshape(-1, c, h, w)
196
- x_2 = x[:, 1:, :, :, :].reshape(-1, c, h, w)
197
-
198
- flows_backward = self.spynet(x_1, x_2).view(b, n - 1, 2, h, w)
199
- flows_forward = self.spynet(x_2, x_1).view(b, n - 1, 2, h, w)
200
-
201
- return flows_forward, flows_backward
202
-
203
- def get_keyframe_feature(self, x, keyframe_idx):
204
- if self.temporal_padding == 2:
205
- x = [x[:, [4, 3]], x, x[:, [-4, -5]]]
206
- elif self.temporal_padding == 3:
207
- x = [x[:, [6, 5, 4]], x, x[:, [-5, -6, -7]]]
208
- x = torch.cat(x, dim=1)
209
-
210
- num_frames = 2 * self.temporal_padding + 1
211
- feats_keyframe = {}
212
- for i in keyframe_idx:
213
- feats_keyframe[i] = self.edvr(x[:, i:i + num_frames].contiguous())
214
- return feats_keyframe
215
-
216
- def forward(self, x):
217
- b, n, _, h_input, w_input = x.size()
218
-
219
- x = self.pad_spatial(x)
220
- h, w = x.shape[3:]
221
-
222
- keyframe_idx = list(range(0, n, self.keyframe_stride))
223
- if keyframe_idx[-1] != n - 1:
224
- keyframe_idx.append(n - 1) # last frame is a keyframe
225
-
226
- # compute flow and keyframe features
227
- flows_forward, flows_backward = self.get_flow(x)
228
- feats_keyframe = self.get_keyframe_feature(x, keyframe_idx)
229
-
230
- # backward branch
231
- out_l = []
232
- feat_prop = x.new_zeros(b, self.num_feat, h, w)
233
- for i in range(n - 1, -1, -1):
234
- x_i = x[:, i, :, :, :]
235
- if i < n - 1:
236
- flow = flows_backward[:, i, :, :, :]
237
- feat_prop = flow_warp(feat_prop, flow.permute(0, 2, 3, 1))
238
- if i in keyframe_idx:
239
- feat_prop = torch.cat([feat_prop, feats_keyframe[i]], dim=1)
240
- feat_prop = self.backward_fusion(feat_prop)
241
- feat_prop = torch.cat([x_i, feat_prop], dim=1)
242
- feat_prop = self.backward_trunk(feat_prop)
243
- out_l.insert(0, feat_prop)
244
-
245
- # forward branch
246
- feat_prop = torch.zeros_like(feat_prop)
247
- for i in range(0, n):
248
- x_i = x[:, i, :, :, :]
249
- if i > 0:
250
- flow = flows_forward[:, i - 1, :, :, :]
251
- feat_prop = flow_warp(feat_prop, flow.permute(0, 2, 3, 1))
252
- if i in keyframe_idx:
253
- feat_prop = torch.cat([feat_prop, feats_keyframe[i]], dim=1)
254
- feat_prop = self.forward_fusion(feat_prop)
255
-
256
- feat_prop = torch.cat([x_i, out_l[i], feat_prop], dim=1)
257
- feat_prop = self.forward_trunk(feat_prop)
258
-
259
- # upsample
260
- out = self.lrelu(self.pixel_shuffle(self.upconv1(feat_prop)))
261
- out = self.lrelu(self.pixel_shuffle(self.upconv2(out)))
262
- out = self.lrelu(self.conv_hr(out))
263
- out = self.conv_last(out)
264
- base = F.interpolate(x_i, scale_factor=4, mode='bilinear', align_corners=False)
265
- out += base
266
- out_l[i] = out
267
-
268
- return torch.stack(out_l, dim=1)[..., :4 * h_input, :4 * w_input]
269
-
270
-
271
- class EDVRFeatureExtractor(nn.Module):
272
- """EDVR feature extractor used in IconVSR.
273
-
274
- Args:
275
- num_input_frame (int): Number of input frames.
276
- num_feat (int): Number of feature channels
277
- load_path (str): Path to the pretrained weights of EDVR. Default: None.
278
- """
279
-
280
- def __init__(self, num_input_frame, num_feat, load_path):
281
-
282
- super(EDVRFeatureExtractor, self).__init__()
283
-
284
- self.center_frame_idx = num_input_frame // 2
285
-
286
- # extract pyramid features
287
- self.conv_first = nn.Conv2d(3, num_feat, 3, 1, 1)
288
- self.feature_extraction = make_layer(ResidualBlockNoBN, 5, num_feat=num_feat)
289
- self.conv_l2_1 = nn.Conv2d(num_feat, num_feat, 3, 2, 1)
290
- self.conv_l2_2 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
291
- self.conv_l3_1 = nn.Conv2d(num_feat, num_feat, 3, 2, 1)
292
- self.conv_l3_2 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
293
-
294
- # pcd and tsa module
295
- self.pcd_align = PCDAlignment(num_feat=num_feat, deformable_groups=8)
296
- self.fusion = TSAFusion(num_feat=num_feat, num_frame=num_input_frame, center_frame_idx=self.center_frame_idx)
297
-
298
- # activation function
299
- self.lrelu = nn.LeakyReLU(negative_slope=0.1, inplace=True)
300
-
301
- if load_path:
302
- self.load_state_dict(torch.load(load_path, map_location=lambda storage, loc: storage)['params'])
303
-
304
- def forward(self, x):
305
- b, n, c, h, w = x.size()
306
-
307
- # extract features for each frame
308
- # L1
309
- feat_l1 = self.lrelu(self.conv_first(x.view(-1, c, h, w)))
310
- feat_l1 = self.feature_extraction(feat_l1)
311
- # L2
312
- feat_l2 = self.lrelu(self.conv_l2_1(feat_l1))
313
- feat_l2 = self.lrelu(self.conv_l2_2(feat_l2))
314
- # L3
315
- feat_l3 = self.lrelu(self.conv_l3_1(feat_l2))
316
- feat_l3 = self.lrelu(self.conv_l3_2(feat_l3))
317
-
318
- feat_l1 = feat_l1.view(b, n, -1, h, w)
319
- feat_l2 = feat_l2.view(b, n, -1, h // 2, w // 2)
320
- feat_l3 = feat_l3.view(b, n, -1, h // 4, w // 4)
321
-
322
- # PCD alignment
323
- ref_feat_l = [ # reference feature list
324
- feat_l1[:, self.center_frame_idx, :, :, :].clone(), feat_l2[:, self.center_frame_idx, :, :, :].clone(),
325
- feat_l3[:, self.center_frame_idx, :, :, :].clone()
326
- ]
327
- aligned_feat = []
328
- for i in range(n):
329
- nbr_feat_l = [ # neighboring feature list
330
- feat_l1[:, i, :, :, :].clone(), feat_l2[:, i, :, :, :].clone(), feat_l3[:, i, :, :, :].clone()
331
- ]
332
- aligned_feat.append(self.pcd_align(nbr_feat_l, ref_feat_l))
333
- aligned_feat = torch.stack(aligned_feat, dim=1) # (b, t, c, h, w)
334
-
335
- # TSA fusion
336
- return self.fusion(aligned_feat)
basicsr/archs/basicvsrpp_arch.py DELETED
@@ -1,417 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
- import torch.nn.functional as F
4
- import torchvision
5
- import warnings
6
-
7
- from basicsr.archs.arch_util import flow_warp
8
- from basicsr.archs.basicvsr_arch import ConvResidualBlocks
9
- from basicsr.archs.spynet_arch import SpyNet
10
- from basicsr.ops.dcn import ModulatedDeformConvPack
11
- from basicsr.utils.registry import ARCH_REGISTRY
12
-
13
-
14
- @ARCH_REGISTRY.register()
15
- class BasicVSRPlusPlus(nn.Module):
16
- """BasicVSR++ network structure.
17
-
18
- Support either x4 upsampling or same size output. Since DCN is used in this
19
- model, it can only be used with CUDA enabled. If CUDA is not enabled,
20
- feature alignment will be skipped. Besides, we adopt the official DCN
21
- implementation and the version of torch need to be higher than 1.9.
22
-
23
- ``Paper: BasicVSR++: Improving Video Super-Resolution with Enhanced Propagation and Alignment``
24
-
25
- Args:
26
- mid_channels (int, optional): Channel number of the intermediate
27
- features. Default: 64.
28
- num_blocks (int, optional): The number of residual blocks in each
29
- propagation branch. Default: 7.
30
- max_residue_magnitude (int): The maximum magnitude of the offset
31
- residue (Eq. 6 in paper). Default: 10.
32
- is_low_res_input (bool, optional): Whether the input is low-resolution
33
- or not. If False, the output resolution is equal to the input
34
- resolution. Default: True.
35
- spynet_path (str): Path to the pretrained weights of SPyNet. Default: None.
36
- cpu_cache_length (int, optional): When the length of sequence is larger
37
- than this value, the intermediate features are sent to CPU. This
38
- saves GPU memory, but slows down the inference speed. You can
39
- increase this number if you have a GPU with large memory.
40
- Default: 100.
41
- """
42
-
43
- def __init__(self,
44
- mid_channels=64,
45
- num_blocks=7,
46
- max_residue_magnitude=10,
47
- is_low_res_input=True,
48
- spynet_path=None,
49
- cpu_cache_length=100):
50
-
51
- super().__init__()
52
- self.mid_channels = mid_channels
53
- self.is_low_res_input = is_low_res_input
54
- self.cpu_cache_length = cpu_cache_length
55
-
56
- # optical flow
57
- self.spynet = SpyNet(spynet_path)
58
-
59
- # feature extraction module
60
- if is_low_res_input:
61
- self.feat_extract = ConvResidualBlocks(3, mid_channels, 5)
62
- else:
63
- self.feat_extract = nn.Sequential(
64
- nn.Conv2d(3, mid_channels, 3, 2, 1), nn.LeakyReLU(negative_slope=0.1, inplace=True),
65
- nn.Conv2d(mid_channels, mid_channels, 3, 2, 1), nn.LeakyReLU(negative_slope=0.1, inplace=True),
66
- ConvResidualBlocks(mid_channels, mid_channels, 5))
67
-
68
- # propagation branches
69
- self.deform_align = nn.ModuleDict()
70
- self.backbone = nn.ModuleDict()
71
- modules = ['backward_1', 'forward_1', 'backward_2', 'forward_2']
72
- for i, module in enumerate(modules):
73
- if torch.cuda.is_available():
74
- self.deform_align[module] = SecondOrderDeformableAlignment(
75
- 2 * mid_channels,
76
- mid_channels,
77
- 3,
78
- padding=1,
79
- deformable_groups=16,
80
- max_residue_magnitude=max_residue_magnitude)
81
- self.backbone[module] = ConvResidualBlocks((2 + i) * mid_channels, mid_channels, num_blocks)
82
-
83
- # upsampling module
84
- self.reconstruction = ConvResidualBlocks(5 * mid_channels, mid_channels, 5)
85
-
86
- self.upconv1 = nn.Conv2d(mid_channels, mid_channels * 4, 3, 1, 1, bias=True)
87
- self.upconv2 = nn.Conv2d(mid_channels, 64 * 4, 3, 1, 1, bias=True)
88
-
89
- self.pixel_shuffle = nn.PixelShuffle(2)
90
-
91
- self.conv_hr = nn.Conv2d(64, 64, 3, 1, 1)
92
- self.conv_last = nn.Conv2d(64, 3, 3, 1, 1)
93
- self.img_upsample = nn.Upsample(scale_factor=4, mode='bilinear', align_corners=False)
94
-
95
- # activation function
96
- self.lrelu = nn.LeakyReLU(negative_slope=0.1, inplace=True)
97
-
98
- # check if the sequence is augmented by flipping
99
- self.is_mirror_extended = False
100
-
101
- if len(self.deform_align) > 0:
102
- self.is_with_alignment = True
103
- else:
104
- self.is_with_alignment = False
105
- warnings.warn('Deformable alignment module is not added. '
106
- 'Probably your CUDA is not configured correctly. DCN can only '
107
- 'be used with CUDA enabled. Alignment is skipped now.')
108
-
109
- def check_if_mirror_extended(self, lqs):
110
- """Check whether the input is a mirror-extended sequence.
111
-
112
- If mirror-extended, the i-th (i=0, ..., t-1) frame is equal to the (t-1-i)-th frame.
113
-
114
- Args:
115
- lqs (tensor): Input low quality (LQ) sequence with shape (n, t, c, h, w).
116
- """
117
-
118
- if lqs.size(1) % 2 == 0:
119
- lqs_1, lqs_2 = torch.chunk(lqs, 2, dim=1)
120
- if torch.norm(lqs_1 - lqs_2.flip(1)) == 0:
121
- self.is_mirror_extended = True
122
-
123
- def compute_flow(self, lqs):
124
- """Compute optical flow using SPyNet for feature alignment.
125
-
126
- Note that if the input is an mirror-extended sequence, 'flows_forward'
127
- is not needed, since it is equal to 'flows_backward.flip(1)'.
128
-
129
- Args:
130
- lqs (tensor): Input low quality (LQ) sequence with
131
- shape (n, t, c, h, w).
132
-
133
- Return:
134
- tuple(Tensor): Optical flow. 'flows_forward' corresponds to the flows used for forward-time propagation \
135
- (current to previous). 'flows_backward' corresponds to the flows used for backward-time \
136
- propagation (current to next).
137
- """
138
-
139
- n, t, c, h, w = lqs.size()
140
- lqs_1 = lqs[:, :-1, :, :, :].reshape(-1, c, h, w)
141
- lqs_2 = lqs[:, 1:, :, :, :].reshape(-1, c, h, w)
142
-
143
- flows_backward = self.spynet(lqs_1, lqs_2).view(n, t - 1, 2, h, w)
144
-
145
- if self.is_mirror_extended: # flows_forward = flows_backward.flip(1)
146
- flows_forward = flows_backward.flip(1)
147
- else:
148
- flows_forward = self.spynet(lqs_2, lqs_1).view(n, t - 1, 2, h, w)
149
-
150
- if self.cpu_cache:
151
- flows_backward = flows_backward.cpu()
152
- flows_forward = flows_forward.cpu()
153
-
154
- return flows_forward, flows_backward
155
-
156
- def propagate(self, feats, flows, module_name):
157
- """Propagate the latent features throughout the sequence.
158
-
159
- Args:
160
- feats (dict[list[tensor]]): Features from previous branches. Each
161
- component is a list of tensors with shape (n, c, h, w).
162
- flows (tensor): Optical flows with shape (n, t - 1, 2, h, w).
163
- module_name (str): The name of the propagation branches. Can either
164
- be 'backward_1', 'forward_1', 'backward_2', 'forward_2'.
165
-
166
- Return:
167
- dict(list[tensor]): A dictionary containing all the propagated \
168
- features. Each key in the dictionary corresponds to a \
169
- propagation branch, which is represented by a list of tensors.
170
- """
171
-
172
- n, t, _, h, w = flows.size()
173
-
174
- frame_idx = range(0, t + 1)
175
- flow_idx = range(-1, t)
176
- mapping_idx = list(range(0, len(feats['spatial'])))
177
- mapping_idx += mapping_idx[::-1]
178
-
179
- if 'backward' in module_name:
180
- frame_idx = frame_idx[::-1]
181
- flow_idx = frame_idx
182
-
183
- feat_prop = flows.new_zeros(n, self.mid_channels, h, w)
184
- for i, idx in enumerate(frame_idx):
185
- feat_current = feats['spatial'][mapping_idx[idx]]
186
- if self.cpu_cache:
187
- feat_current = feat_current.cuda()
188
- feat_prop = feat_prop.cuda()
189
- # second-order deformable alignment
190
- if i > 0 and self.is_with_alignment:
191
- flow_n1 = flows[:, flow_idx[i], :, :, :]
192
- if self.cpu_cache:
193
- flow_n1 = flow_n1.cuda()
194
-
195
- cond_n1 = flow_warp(feat_prop, flow_n1.permute(0, 2, 3, 1))
196
-
197
- # initialize second-order features
198
- feat_n2 = torch.zeros_like(feat_prop)
199
- flow_n2 = torch.zeros_like(flow_n1)
200
- cond_n2 = torch.zeros_like(cond_n1)
201
-
202
- if i > 1: # second-order features
203
- feat_n2 = feats[module_name][-2]
204
- if self.cpu_cache:
205
- feat_n2 = feat_n2.cuda()
206
-
207
- flow_n2 = flows[:, flow_idx[i - 1], :, :, :]
208
- if self.cpu_cache:
209
- flow_n2 = flow_n2.cuda()
210
-
211
- flow_n2 = flow_n1 + flow_warp(flow_n2, flow_n1.permute(0, 2, 3, 1))
212
- cond_n2 = flow_warp(feat_n2, flow_n2.permute(0, 2, 3, 1))
213
-
214
- # flow-guided deformable convolution
215
- cond = torch.cat([cond_n1, feat_current, cond_n2], dim=1)
216
- feat_prop = torch.cat([feat_prop, feat_n2], dim=1)
217
- feat_prop = self.deform_align[module_name](feat_prop, cond, flow_n1, flow_n2)
218
-
219
- # concatenate and residual blocks
220
- feat = [feat_current] + [feats[k][idx] for k in feats if k not in ['spatial', module_name]] + [feat_prop]
221
- if self.cpu_cache:
222
- feat = [f.cuda() for f in feat]
223
-
224
- feat = torch.cat(feat, dim=1)
225
- feat_prop = feat_prop + self.backbone[module_name](feat)
226
- feats[module_name].append(feat_prop)
227
-
228
- if self.cpu_cache:
229
- feats[module_name][-1] = feats[module_name][-1].cpu()
230
- torch.cuda.empty_cache()
231
-
232
- if 'backward' in module_name:
233
- feats[module_name] = feats[module_name][::-1]
234
-
235
- return feats
236
-
237
- def upsample(self, lqs, feats):
238
- """Compute the output image given the features.
239
-
240
- Args:
241
- lqs (tensor): Input low quality (LQ) sequence with
242
- shape (n, t, c, h, w).
243
- feats (dict): The features from the propagation branches.
244
-
245
- Returns:
246
- Tensor: Output HR sequence with shape (n, t, c, 4h, 4w).
247
- """
248
-
249
- outputs = []
250
- num_outputs = len(feats['spatial'])
251
-
252
- mapping_idx = list(range(0, num_outputs))
253
- mapping_idx += mapping_idx[::-1]
254
-
255
- for i in range(0, lqs.size(1)):
256
- hr = [feats[k].pop(0) for k in feats if k != 'spatial']
257
- hr.insert(0, feats['spatial'][mapping_idx[i]])
258
- hr = torch.cat(hr, dim=1)
259
- if self.cpu_cache:
260
- hr = hr.cuda()
261
-
262
- hr = self.reconstruction(hr)
263
- hr = self.lrelu(self.pixel_shuffle(self.upconv1(hr)))
264
- hr = self.lrelu(self.pixel_shuffle(self.upconv2(hr)))
265
- hr = self.lrelu(self.conv_hr(hr))
266
- hr = self.conv_last(hr)
267
- if self.is_low_res_input:
268
- hr += self.img_upsample(lqs[:, i, :, :, :])
269
- else:
270
- hr += lqs[:, i, :, :, :]
271
-
272
- if self.cpu_cache:
273
- hr = hr.cpu()
274
- torch.cuda.empty_cache()
275
-
276
- outputs.append(hr)
277
-
278
- return torch.stack(outputs, dim=1)
279
-
280
- def forward(self, lqs):
281
- """Forward function for BasicVSR++.
282
-
283
- Args:
284
- lqs (tensor): Input low quality (LQ) sequence with
285
- shape (n, t, c, h, w).
286
-
287
- Returns:
288
- Tensor: Output HR sequence with shape (n, t, c, 4h, 4w).
289
- """
290
-
291
- n, t, c, h, w = lqs.size()
292
-
293
- # whether to cache the features in CPU
294
- self.cpu_cache = True if t > self.cpu_cache_length else False
295
-
296
- if self.is_low_res_input:
297
- lqs_downsample = lqs.clone()
298
- else:
299
- lqs_downsample = F.interpolate(
300
- lqs.view(-1, c, h, w), scale_factor=0.25, mode='bicubic').view(n, t, c, h // 4, w // 4)
301
-
302
- # check whether the input is an extended sequence
303
- self.check_if_mirror_extended(lqs)
304
-
305
- feats = {}
306
- # compute spatial features
307
- if self.cpu_cache:
308
- feats['spatial'] = []
309
- for i in range(0, t):
310
- feat = self.feat_extract(lqs[:, i, :, :, :]).cpu()
311
- feats['spatial'].append(feat)
312
- torch.cuda.empty_cache()
313
- else:
314
- feats_ = self.feat_extract(lqs.view(-1, c, h, w))
315
- h, w = feats_.shape[2:]
316
- feats_ = feats_.view(n, t, -1, h, w)
317
- feats['spatial'] = [feats_[:, i, :, :, :] for i in range(0, t)]
318
-
319
- # compute optical flow using the low-res inputs
320
- assert lqs_downsample.size(3) >= 64 and lqs_downsample.size(4) >= 64, (
321
- 'The height and width of low-res inputs must be at least 64, '
322
- f'but got {h} and {w}.')
323
- flows_forward, flows_backward = self.compute_flow(lqs_downsample)
324
-
325
- # feature propagation
326
- for iter_ in [1, 2]:
327
- for direction in ['backward', 'forward']:
328
- module = f'{direction}_{iter_}'
329
-
330
- feats[module] = []
331
-
332
- if direction == 'backward':
333
- flows = flows_backward
334
- elif flows_forward is not None:
335
- flows = flows_forward
336
- else:
337
- flows = flows_backward.flip(1)
338
-
339
- feats = self.propagate(feats, flows, module)
340
- if self.cpu_cache:
341
- del flows
342
- torch.cuda.empty_cache()
343
-
344
- return self.upsample(lqs, feats)
345
-
346
-
347
- class SecondOrderDeformableAlignment(ModulatedDeformConvPack):
348
- """Second-order deformable alignment module.
349
-
350
- Args:
351
- in_channels (int): Same as nn.Conv2d.
352
- out_channels (int): Same as nn.Conv2d.
353
- kernel_size (int or tuple[int]): Same as nn.Conv2d.
354
- stride (int or tuple[int]): Same as nn.Conv2d.
355
- padding (int or tuple[int]): Same as nn.Conv2d.
356
- dilation (int or tuple[int]): Same as nn.Conv2d.
357
- groups (int): Same as nn.Conv2d.
358
- bias (bool or str): If specified as `auto`, it will be decided by the
359
- norm_cfg. Bias will be set as True if norm_cfg is None, otherwise
360
- False.
361
- max_residue_magnitude (int): The maximum magnitude of the offset
362
- residue (Eq. 6 in paper). Default: 10.
363
- """
364
-
365
- def __init__(self, *args, **kwargs):
366
- self.max_residue_magnitude = kwargs.pop('max_residue_magnitude', 10)
367
-
368
- super(SecondOrderDeformableAlignment, self).__init__(*args, **kwargs)
369
-
370
- self.conv_offset = nn.Sequential(
371
- nn.Conv2d(3 * self.out_channels + 4, self.out_channels, 3, 1, 1),
372
- nn.LeakyReLU(negative_slope=0.1, inplace=True),
373
- nn.Conv2d(self.out_channels, self.out_channels, 3, 1, 1),
374
- nn.LeakyReLU(negative_slope=0.1, inplace=True),
375
- nn.Conv2d(self.out_channels, self.out_channels, 3, 1, 1),
376
- nn.LeakyReLU(negative_slope=0.1, inplace=True),
377
- nn.Conv2d(self.out_channels, 27 * self.deformable_groups, 3, 1, 1),
378
- )
379
-
380
- self.init_offset()
381
-
382
- def init_offset(self):
383
-
384
- def _constant_init(module, val, bias=0):
385
- if hasattr(module, 'weight') and module.weight is not None:
386
- nn.init.constant_(module.weight, val)
387
- if hasattr(module, 'bias') and module.bias is not None:
388
- nn.init.constant_(module.bias, bias)
389
-
390
- _constant_init(self.conv_offset[-1], val=0, bias=0)
391
-
392
- def forward(self, x, extra_feat, flow_1, flow_2):
393
- extra_feat = torch.cat([extra_feat, flow_1, flow_2], dim=1)
394
- out = self.conv_offset(extra_feat)
395
- o1, o2, mask = torch.chunk(out, 3, dim=1)
396
-
397
- # offset
398
- offset = self.max_residue_magnitude * torch.tanh(torch.cat((o1, o2), dim=1))
399
- offset_1, offset_2 = torch.chunk(offset, 2, dim=1)
400
- offset_1 = offset_1 + flow_1.flip(1).repeat(1, offset_1.size(1) // 2, 1, 1)
401
- offset_2 = offset_2 + flow_2.flip(1).repeat(1, offset_2.size(1) // 2, 1, 1)
402
- offset = torch.cat([offset_1, offset_2], dim=1)
403
-
404
- # mask
405
- mask = torch.sigmoid(mask)
406
-
407
- return torchvision.ops.deform_conv2d(x, offset, self.weight, self.bias, self.stride, self.padding,
408
- self.dilation, mask)
409
-
410
-
411
- # if __name__ == '__main__':
412
- # spynet_path = 'experiments/pretrained_models/flownet/spynet_sintel_final-3d2a1287.pth'
413
- # model = BasicVSRPlusPlus(spynet_path=spynet_path).cuda()
414
- # input = torch.rand(1, 2, 3, 64, 64).cuda()
415
- # output = model(input)
416
- # print('===================')
417
- # print(output.shape)
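
For quick reference, the mirror-extension test used by check_if_mirror_extended above simply compares the first half of the clip with the time-reversed second half. A minimal stand-alone sketch of that check (illustrative only; the tensor sizes are made up and nothing below depends on BasicVSR++ itself):

import torch

def is_mirror_extended(lqs: torch.Tensor) -> bool:
    # lqs: (n, t, c, h, w). A clip padded as [frames, reversed frames] has an
    # even temporal length whose first half equals the flipped second half.
    if lqs.size(1) % 2 == 0:
        first, second = torch.chunk(lqs, 2, dim=1)
        return torch.norm(first - second.flip(1)).item() == 0
    return False

clip = torch.rand(1, 3, 3, 8, 8)                                    # 3-frame toy clip
print(is_mirror_extended(torch.cat([clip, clip.flip(1)], dim=1)))   # True
print(is_mirror_extended(torch.rand(1, 6, 3, 8, 8)))                # False

When a clip is detected as mirror-extended, compute_flow skips the second SPyNet pass and reuses the backward flows, halving the flow-estimation cost.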
 
basicsr/archs/dfdnet_arch.py DELETED
@@ -1,169 +0,0 @@
1
- import numpy as np
2
- import torch
3
- import torch.nn as nn
4
- import torch.nn.functional as F
5
- from torch.nn.utils.spectral_norm import spectral_norm
6
-
7
- from basicsr.utils.registry import ARCH_REGISTRY
8
- from .dfdnet_util import AttentionBlock, Blur, MSDilationBlock, UpResBlock, adaptive_instance_normalization
9
- from .vgg_arch import VGGFeatureExtractor
10
-
11
-
12
- class SFTUpBlock(nn.Module):
13
- """Spatial feature transform (SFT) with upsampling block.
14
-
15
- Args:
16
- in_channel (int): Number of input channels.
17
- out_channel (int): Number of output channels.
18
- kernel_size (int): Kernel size in convolutions. Default: 3.
19
- padding (int): Padding in convolutions. Default: 1.
20
- """
21
-
22
- def __init__(self, in_channel, out_channel, kernel_size=3, padding=1):
23
- super(SFTUpBlock, self).__init__()
24
- self.conv1 = nn.Sequential(
25
- Blur(in_channel),
26
- spectral_norm(nn.Conv2d(in_channel, out_channel, kernel_size, padding=padding)),
27
- nn.LeakyReLU(0.04, True),
28
- # The official codes use two LeakyReLU here, so 0.04 for equivalent
29
- )
30
- self.convup = nn.Sequential(
31
- nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),
32
- spectral_norm(nn.Conv2d(out_channel, out_channel, kernel_size, padding=padding)),
33
- nn.LeakyReLU(0.2, True),
34
- )
35
-
36
- # for SFT scale and shift
37
- self.scale_block = nn.Sequential(
38
- spectral_norm(nn.Conv2d(in_channel, out_channel, 3, 1, 1)), nn.LeakyReLU(0.2, True),
39
- spectral_norm(nn.Conv2d(out_channel, out_channel, 3, 1, 1)))
40
- self.shift_block = nn.Sequential(
41
- spectral_norm(nn.Conv2d(in_channel, out_channel, 3, 1, 1)), nn.LeakyReLU(0.2, True),
42
- spectral_norm(nn.Conv2d(out_channel, out_channel, 3, 1, 1)), nn.Sigmoid())
43
- # The official code uses a sigmoid for the shift block; the reason is not documented
44
-
45
- def forward(self, x, updated_feat):
46
- out = self.conv1(x)
47
- # SFT
48
- scale = self.scale_block(updated_feat)
49
- shift = self.shift_block(updated_feat)
50
- out = out * scale + shift
51
- # upsample
52
- out = self.convup(out)
53
- return out
54
-
55
-
56
- @ARCH_REGISTRY.register()
57
- class DFDNet(nn.Module):
58
- """DFDNet: Deep Face Dictionary Network.
59
-
60
- It only processes faces with 512x512 size.
61
-
62
- Args:
63
- num_feat (int): Number of feature channels.
64
- dict_path (str): Path to the facial component dictionary.
65
- """
66
-
67
- def __init__(self, num_feat, dict_path):
68
- super().__init__()
69
- self.parts = ['left_eye', 'right_eye', 'nose', 'mouth']
70
- # part_sizes: [80, 80, 50, 110]
71
- channel_sizes = [128, 256, 512, 512]
72
- self.feature_sizes = np.array([256, 128, 64, 32])
73
- self.vgg_layers = ['relu2_2', 'relu3_4', 'relu4_4', 'conv5_4']
74
- self.flag_dict_device = False
75
-
76
- # dict
77
- self.dict = torch.load(dict_path)
78
-
79
- # vgg face extractor
80
- self.vgg_extractor = VGGFeatureExtractor(
81
- layer_name_list=self.vgg_layers,
82
- vgg_type='vgg19',
83
- use_input_norm=True,
84
- range_norm=True,
85
- requires_grad=False)
86
-
87
- # attention block for fusing dictionary features and input features
88
- self.attn_blocks = nn.ModuleDict()
89
- for idx, feat_size in enumerate(self.feature_sizes):
90
- for name in self.parts:
91
- self.attn_blocks[f'{name}_{feat_size}'] = AttentionBlock(channel_sizes[idx])
92
-
93
- # multi scale dilation block
94
- self.multi_scale_dilation = MSDilationBlock(num_feat * 8, dilation=[4, 3, 2, 1])
95
-
96
- # upsampling and reconstruction
97
- self.upsample0 = SFTUpBlock(num_feat * 8, num_feat * 8)
98
- self.upsample1 = SFTUpBlock(num_feat * 8, num_feat * 4)
99
- self.upsample2 = SFTUpBlock(num_feat * 4, num_feat * 2)
100
- self.upsample3 = SFTUpBlock(num_feat * 2, num_feat)
101
- self.upsample4 = nn.Sequential(
102
- spectral_norm(nn.Conv2d(num_feat, num_feat, 3, 1, 1)), nn.LeakyReLU(0.2, True), UpResBlock(num_feat),
103
- UpResBlock(num_feat), nn.Conv2d(num_feat, 3, kernel_size=3, stride=1, padding=1), nn.Tanh())
104
-
105
- def swap_feat(self, vgg_feat, updated_feat, dict_feat, location, part_name, f_size):
106
- """swap the features from the dictionary."""
107
- # get the original vgg features
108
- part_feat = vgg_feat[:, :, location[1]:location[3], location[0]:location[2]].clone()
109
- # resize original vgg features
110
- part_resize_feat = F.interpolate(part_feat, dict_feat.size()[2:4], mode='bilinear', align_corners=False)
111
- # use adaptive instance normalization to adjust color and illuminations
112
- dict_feat = adaptive_instance_normalization(dict_feat, part_resize_feat)
113
- # get similarity scores
114
- similarity_score = F.conv2d(part_resize_feat, dict_feat)
115
- similarity_score = F.softmax(similarity_score.view(-1), dim=0)
116
- # select the most similar features in the dict (after norm)
117
- select_idx = torch.argmax(similarity_score)
118
- swap_feat = F.interpolate(dict_feat[select_idx:select_idx + 1], part_feat.size()[2:4])
119
- # attention
120
- attn = self.attn_blocks[f'{part_name}_' + str(f_size)](swap_feat - part_feat)
121
- attn_feat = attn * swap_feat
122
- # update features
123
- updated_feat[:, :, location[1]:location[3], location[0]:location[2]] = attn_feat + part_feat
124
- return updated_feat
125
-
126
- def put_dict_to_device(self, x):
127
- if self.flag_dict_device is False:
128
- for k, v in self.dict.items():
129
- for kk, vv in v.items():
130
- self.dict[k][kk] = vv.to(x)
131
- self.flag_dict_device = True
132
-
133
- def forward(self, x, part_locations):
134
- """
135
- Only supports testing with batch size = 1.
136
-
137
- Args:
138
- x (Tensor): Input faces with shape (b, c, 512, 512).
139
- part_locations (list[Tensor]): Part locations.
140
- """
141
- self.put_dict_to_device(x)
142
- # extract vggface features
143
- vgg_features = self.vgg_extractor(x)
144
- # update vggface features using the dictionary for each part
145
- updated_vgg_features = []
146
- batch = 0 # only supports testing with batch size = 1
147
- for vgg_layer, f_size in zip(self.vgg_layers, self.feature_sizes):
148
- dict_features = self.dict[f'{f_size}']
149
- vgg_feat = vgg_features[vgg_layer]
150
- updated_feat = vgg_feat.clone()
151
-
152
- # swap features from dictionary
153
- for part_idx, part_name in enumerate(self.parts):
154
- location = (part_locations[part_idx][batch] // (512 / f_size)).int()
155
- updated_feat = self.swap_feat(vgg_feat, updated_feat, dict_features[part_name], location, part_name,
156
- f_size)
157
-
158
- updated_vgg_features.append(updated_feat)
159
-
160
- vgg_feat_dilation = self.multi_scale_dilation(vgg_features['conv5_4'])
161
- # use updated vgg features to modulate the upsampled features with
162
- # SFT (Spatial Feature Transform) scaling and shifting manner.
163
- upsampled_feat = self.upsample0(vgg_feat_dilation, updated_vgg_features[3])
164
- upsampled_feat = self.upsample1(upsampled_feat, updated_vgg_features[2])
165
- upsampled_feat = self.upsample2(upsampled_feat, updated_vgg_features[1])
166
- upsampled_feat = self.upsample3(upsampled_feat, updated_vgg_features[0])
167
- out = self.upsample4(upsampled_feat)
168
-
169
- return out
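
To make the dictionary lookup in swap_feat above concrete: each dictionary entry is used as a convolution kernel against the (resized and AdaIN-normalised) part feature, and the entry with the highest softmax-normalised correlation is selected. A toy sketch with invented shapes, not tied to the real facial-component dictionary:

import torch
import torch.nn.functional as F

part_feat = torch.rand(1, 8, 5, 5)     # query part feature, shape (1, c, h, w)
dict_feats = torch.rand(10, 8, 5, 5)   # 10 candidate dictionary entries

scores = F.conv2d(part_feat, dict_feats)         # (1, 10, 1, 1) correlation per entry
similarity = F.softmax(scores.view(-1), dim=0)   # normalised similarity scores
select_idx = torch.argmax(similarity)            # index of the closest entry
print(select_idx.item(), similarity[select_idx].item())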
 
basicsr/archs/dfdnet_util.py DELETED
@@ -1,162 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
- import torch.nn.functional as F
4
- from torch.autograd import Function
5
- from torch.nn.utils.spectral_norm import spectral_norm
6
-
7
-
8
- class BlurFunctionBackward(Function):
9
-
10
- @staticmethod
11
- def forward(ctx, grad_output, kernel, kernel_flip):
12
- ctx.save_for_backward(kernel, kernel_flip)
13
- grad_input = F.conv2d(grad_output, kernel_flip, padding=1, groups=grad_output.shape[1])
14
- return grad_input
15
-
16
- @staticmethod
17
- def backward(ctx, gradgrad_output):
18
- kernel, _ = ctx.saved_tensors
19
- grad_input = F.conv2d(gradgrad_output, kernel, padding=1, groups=gradgrad_output.shape[1])
20
- return grad_input, None, None
21
-
22
-
23
- class BlurFunction(Function):
24
-
25
- @staticmethod
26
- def forward(ctx, x, kernel, kernel_flip):
27
- ctx.save_for_backward(kernel, kernel_flip)
28
- output = F.conv2d(x, kernel, padding=1, groups=x.shape[1])
29
- return output
30
-
31
- @staticmethod
32
- def backward(ctx, grad_output):
33
- kernel, kernel_flip = ctx.saved_tensors
34
- grad_input = BlurFunctionBackward.apply(grad_output, kernel, kernel_flip)
35
- return grad_input, None, None
36
-
37
-
38
- blur = BlurFunction.apply
39
-
40
-
41
- class Blur(nn.Module):
42
-
43
- def __init__(self, channel):
44
- super().__init__()
45
- kernel = torch.tensor([[1, 2, 1], [2, 4, 2], [1, 2, 1]], dtype=torch.float32)
46
- kernel = kernel.view(1, 1, 3, 3)
47
- kernel = kernel / kernel.sum()
48
- kernel_flip = torch.flip(kernel, [2, 3])
49
-
50
- self.kernel = kernel.repeat(channel, 1, 1, 1)
51
- self.kernel_flip = kernel_flip.repeat(channel, 1, 1, 1)
52
-
53
- def forward(self, x):
54
- return blur(x, self.kernel.type_as(x), self.kernel_flip.type_as(x))
55
-
56
-
57
- def calc_mean_std(feat, eps=1e-5):
58
- """Calculate mean and std for adaptive_instance_normalization.
59
-
60
- Args:
61
- feat (Tensor): 4D tensor.
62
- eps (float): A small value added to the variance to avoid
63
- divide-by-zero. Default: 1e-5.
64
- """
65
- size = feat.size()
66
- assert len(size) == 4, 'The input feature should be 4D tensor.'
67
- n, c = size[:2]
68
- feat_var = feat.view(n, c, -1).var(dim=2) + eps
69
- feat_std = feat_var.sqrt().view(n, c, 1, 1)
70
- feat_mean = feat.view(n, c, -1).mean(dim=2).view(n, c, 1, 1)
71
- return feat_mean, feat_std
72
-
73
-
74
- def adaptive_instance_normalization(content_feat, style_feat):
75
- """Adaptive instance normalization.
76
-
77
- Adjust the reference features to have similar color and illumination
78
- to those in the degraded features.
79
-
80
- Args:
81
- content_feat (Tensor): The reference feature.
82
- style_feat (Tensor): The degraded features.
83
- """
84
- size = content_feat.size()
85
- style_mean, style_std = calc_mean_std(style_feat)
86
- content_mean, content_std = calc_mean_std(content_feat)
87
- normalized_feat = (content_feat - content_mean.expand(size)) / content_std.expand(size)
88
- return normalized_feat * style_std.expand(size) + style_mean.expand(size)
89
-
90
-
91
- def AttentionBlock(in_channel):
92
- return nn.Sequential(
93
- spectral_norm(nn.Conv2d(in_channel, in_channel, 3, 1, 1)), nn.LeakyReLU(0.2, True),
94
- spectral_norm(nn.Conv2d(in_channel, in_channel, 3, 1, 1)))
95
-
96
-
97
- def conv_block(in_channels, out_channels, kernel_size=3, stride=1, dilation=1, bias=True):
98
- """Conv block used in MSDilationBlock."""
99
-
100
- return nn.Sequential(
101
- spectral_norm(
102
- nn.Conv2d(
103
- in_channels,
104
- out_channels,
105
- kernel_size=kernel_size,
106
- stride=stride,
107
- dilation=dilation,
108
- padding=((kernel_size - 1) // 2) * dilation,
109
- bias=bias)),
110
- nn.LeakyReLU(0.2),
111
- spectral_norm(
112
- nn.Conv2d(
113
- out_channels,
114
- out_channels,
115
- kernel_size=kernel_size,
116
- stride=stride,
117
- dilation=dilation,
118
- padding=((kernel_size - 1) // 2) * dilation,
119
- bias=bias)),
120
- )
121
-
122
-
123
- class MSDilationBlock(nn.Module):
124
- """Multi-scale dilation block."""
125
-
126
- def __init__(self, in_channels, kernel_size=3, dilation=(1, 1, 1, 1), bias=True):
127
- super(MSDilationBlock, self).__init__()
128
-
129
- self.conv_blocks = nn.ModuleList()
130
- for i in range(4):
131
- self.conv_blocks.append(conv_block(in_channels, in_channels, kernel_size, dilation=dilation[i], bias=bias))
132
- self.conv_fusion = spectral_norm(
133
- nn.Conv2d(
134
- in_channels * 4,
135
- in_channels,
136
- kernel_size=kernel_size,
137
- stride=1,
138
- padding=(kernel_size - 1) // 2,
139
- bias=bias))
140
-
141
- def forward(self, x):
142
- out = []
143
- for i in range(4):
144
- out.append(self.conv_blocks[i](x))
145
- out = torch.cat(out, 1)
146
- out = self.conv_fusion(out) + x
147
- return out
148
-
149
-
150
- class UpResBlock(nn.Module):
151
-
152
- def __init__(self, in_channel):
153
- super(UpResBlock, self).__init__()
154
- self.body = nn.Sequential(
155
- nn.Conv2d(in_channel, in_channel, 3, 1, 1),
156
- nn.LeakyReLU(0.2, True),
157
- nn.Conv2d(in_channel, in_channel, 3, 1, 1),
158
- )
159
-
160
- def forward(self, x):
161
- out = x + self.body(x)
162
- return out
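
The adaptive_instance_normalization helper above is standard AdaIN: whiten the content features per channel, then re-colour them with the style statistics. A compact self-contained sketch with toy shapes:

import torch

def adain(content, style, eps=1e-5):
    # Normalise the content statistics per channel, then apply the style statistics.
    n, c = content.shape[:2]
    c_flat, s_flat = content.view(n, c, -1), style.view(n, c, -1)
    c_mean, c_std = c_flat.mean(-1), (c_flat.var(-1) + eps).sqrt()
    s_mean, s_std = s_flat.mean(-1), (s_flat.var(-1) + eps).sqrt()
    shape = (n, c, 1, 1)
    normalized = (content - c_mean.view(shape)) / c_std.view(shape)
    return normalized * s_std.view(shape) + s_mean.view(shape)

content, style = torch.rand(2, 4, 16, 16), torch.rand(2, 4, 16, 16)
print(adain(content, style).shape)  # torch.Size([2, 4, 16, 16])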
 
basicsr/archs/discriminator_arch.py DELETED
@@ -1,150 +0,0 @@
1
- from torch import nn as nn
2
- from torch.nn import functional as F
3
- from torch.nn.utils import spectral_norm
4
-
5
- from basicsr.utils.registry import ARCH_REGISTRY
6
-
7
-
8
- @ARCH_REGISTRY.register()
9
- class VGGStyleDiscriminator(nn.Module):
10
- """VGG style discriminator with input size 128 x 128 or 256 x 256.
11
-
12
- It is used to train SRGAN, ESRGAN, and VideoGAN.
13
-
14
- Args:
15
- num_in_ch (int): Channel number of inputs. Default: 3.
16
- num_feat (int): Channel number of base intermediate features. Default: 64.
17
- """
18
-
19
- def __init__(self, num_in_ch, num_feat, input_size=128):
20
- super(VGGStyleDiscriminator, self).__init__()
21
- self.input_size = input_size
22
- assert self.input_size == 128 or self.input_size == 256, (
23
- f'input size must be 128 or 256, but received {input_size}')
24
-
25
- self.conv0_0 = nn.Conv2d(num_in_ch, num_feat, 3, 1, 1, bias=True)
26
- self.conv0_1 = nn.Conv2d(num_feat, num_feat, 4, 2, 1, bias=False)
27
- self.bn0_1 = nn.BatchNorm2d(num_feat, affine=True)
28
-
29
- self.conv1_0 = nn.Conv2d(num_feat, num_feat * 2, 3, 1, 1, bias=False)
30
- self.bn1_0 = nn.BatchNorm2d(num_feat * 2, affine=True)
31
- self.conv1_1 = nn.Conv2d(num_feat * 2, num_feat * 2, 4, 2, 1, bias=False)
32
- self.bn1_1 = nn.BatchNorm2d(num_feat * 2, affine=True)
33
-
34
- self.conv2_0 = nn.Conv2d(num_feat * 2, num_feat * 4, 3, 1, 1, bias=False)
35
- self.bn2_0 = nn.BatchNorm2d(num_feat * 4, affine=True)
36
- self.conv2_1 = nn.Conv2d(num_feat * 4, num_feat * 4, 4, 2, 1, bias=False)
37
- self.bn2_1 = nn.BatchNorm2d(num_feat * 4, affine=True)
38
-
39
- self.conv3_0 = nn.Conv2d(num_feat * 4, num_feat * 8, 3, 1, 1, bias=False)
40
- self.bn3_0 = nn.BatchNorm2d(num_feat * 8, affine=True)
41
- self.conv3_1 = nn.Conv2d(num_feat * 8, num_feat * 8, 4, 2, 1, bias=False)
42
- self.bn3_1 = nn.BatchNorm2d(num_feat * 8, affine=True)
43
-
44
- self.conv4_0 = nn.Conv2d(num_feat * 8, num_feat * 8, 3, 1, 1, bias=False)
45
- self.bn4_0 = nn.BatchNorm2d(num_feat * 8, affine=True)
46
- self.conv4_1 = nn.Conv2d(num_feat * 8, num_feat * 8, 4, 2, 1, bias=False)
47
- self.bn4_1 = nn.BatchNorm2d(num_feat * 8, affine=True)
48
-
49
- if self.input_size == 256:
50
- self.conv5_0 = nn.Conv2d(num_feat * 8, num_feat * 8, 3, 1, 1, bias=False)
51
- self.bn5_0 = nn.BatchNorm2d(num_feat * 8, affine=True)
52
- self.conv5_1 = nn.Conv2d(num_feat * 8, num_feat * 8, 4, 2, 1, bias=False)
53
- self.bn5_1 = nn.BatchNorm2d(num_feat * 8, affine=True)
54
-
55
- self.linear1 = nn.Linear(num_feat * 8 * 4 * 4, 100)
56
- self.linear2 = nn.Linear(100, 1)
57
-
58
- # activation function
59
- self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)
60
-
61
- def forward(self, x):
62
- assert x.size(2) == self.input_size, (f'Input size must be identical to input_size, but received {x.size()}.')
63
-
64
- feat = self.lrelu(self.conv0_0(x))
65
- feat = self.lrelu(self.bn0_1(self.conv0_1(feat))) # output spatial size: /2
66
-
67
- feat = self.lrelu(self.bn1_0(self.conv1_0(feat)))
68
- feat = self.lrelu(self.bn1_1(self.conv1_1(feat))) # output spatial size: /4
69
-
70
- feat = self.lrelu(self.bn2_0(self.conv2_0(feat)))
71
- feat = self.lrelu(self.bn2_1(self.conv2_1(feat))) # output spatial size: /8
72
-
73
- feat = self.lrelu(self.bn3_0(self.conv3_0(feat)))
74
- feat = self.lrelu(self.bn3_1(self.conv3_1(feat))) # output spatial size: /16
75
-
76
- feat = self.lrelu(self.bn4_0(self.conv4_0(feat)))
77
- feat = self.lrelu(self.bn4_1(self.conv4_1(feat))) # output spatial size: /32
78
-
79
- if self.input_size == 256:
80
- feat = self.lrelu(self.bn5_0(self.conv5_0(feat)))
81
- feat = self.lrelu(self.bn5_1(self.conv5_1(feat))) # output spatial size: / 64
82
-
83
- # spatial size: (4, 4)
84
- feat = feat.view(feat.size(0), -1)
85
- feat = self.lrelu(self.linear1(feat))
86
- out = self.linear2(feat)
87
- return out
88
-
89
-
90
- @ARCH_REGISTRY.register(suffix='basicsr')
91
- class UNetDiscriminatorSN(nn.Module):
92
- """Defines a U-Net discriminator with spectral normalization (SN)
93
-
94
- It is used in Real-ESRGAN: Training Real-World Blind Super-Resolution with Pure Synthetic Data.
95
-
96
- Args:
97
- num_in_ch (int): Channel number of inputs. Default: 3.
98
- num_feat (int): Channel number of base intermediate features. Default: 64.
99
- skip_connection (bool): Whether to use skip connections in the U-Net. Default: True.
100
- """
101
-
102
- def __init__(self, num_in_ch, num_feat=64, skip_connection=True):
103
- super(UNetDiscriminatorSN, self).__init__()
104
- self.skip_connection = skip_connection
105
- norm = spectral_norm
106
- # the first convolution
107
- self.conv0 = nn.Conv2d(num_in_ch, num_feat, kernel_size=3, stride=1, padding=1)
108
- # downsample
109
- self.conv1 = norm(nn.Conv2d(num_feat, num_feat * 2, 4, 2, 1, bias=False))
110
- self.conv2 = norm(nn.Conv2d(num_feat * 2, num_feat * 4, 4, 2, 1, bias=False))
111
- self.conv3 = norm(nn.Conv2d(num_feat * 4, num_feat * 8, 4, 2, 1, bias=False))
112
- # upsample
113
- self.conv4 = norm(nn.Conv2d(num_feat * 8, num_feat * 4, 3, 1, 1, bias=False))
114
- self.conv5 = norm(nn.Conv2d(num_feat * 4, num_feat * 2, 3, 1, 1, bias=False))
115
- self.conv6 = norm(nn.Conv2d(num_feat * 2, num_feat, 3, 1, 1, bias=False))
116
- # extra convolutions
117
- self.conv7 = norm(nn.Conv2d(num_feat, num_feat, 3, 1, 1, bias=False))
118
- self.conv8 = norm(nn.Conv2d(num_feat, num_feat, 3, 1, 1, bias=False))
119
- self.conv9 = nn.Conv2d(num_feat, 1, 3, 1, 1)
120
-
121
- def forward(self, x):
122
- # downsample
123
- x0 = F.leaky_relu(self.conv0(x), negative_slope=0.2, inplace=True)
124
- x1 = F.leaky_relu(self.conv1(x0), negative_slope=0.2, inplace=True)
125
- x2 = F.leaky_relu(self.conv2(x1), negative_slope=0.2, inplace=True)
126
- x3 = F.leaky_relu(self.conv3(x2), negative_slope=0.2, inplace=True)
127
-
128
- # upsample
129
- x3 = F.interpolate(x3, scale_factor=2, mode='bilinear', align_corners=False)
130
- x4 = F.leaky_relu(self.conv4(x3), negative_slope=0.2, inplace=True)
131
-
132
- if self.skip_connection:
133
- x4 = x4 + x2
134
- x4 = F.interpolate(x4, scale_factor=2, mode='bilinear', align_corners=False)
135
- x5 = F.leaky_relu(self.conv5(x4), negative_slope=0.2, inplace=True)
136
-
137
- if self.skip_connection:
138
- x5 = x5 + x1
139
- x5 = F.interpolate(x5, scale_factor=2, mode='bilinear', align_corners=False)
140
- x6 = F.leaky_relu(self.conv6(x5), negative_slope=0.2, inplace=True)
141
-
142
- if self.skip_connection:
143
- x6 = x6 + x0
144
-
145
- # extra convolutions
146
- out = F.leaky_relu(self.conv7(x6), negative_slope=0.2, inplace=True)
147
- out = F.leaky_relu(self.conv8(out), negative_slope=0.2, inplace=True)
148
- out = self.conv9(out)
149
-
150
- return out
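
A short usage sketch for the U-Net discriminator above, assuming a separate basicsr installation that still ships this module (it is removed from this Space by the present commit). The output is a per-pixel realness map at the input resolution:

import torch
from basicsr.archs.discriminator_arch import UNetDiscriminatorSN  # assumes basicsr is installed

disc = UNetDiscriminatorSN(num_in_ch=3, num_feat=64, skip_connection=True)
fake = torch.rand(1, 3, 128, 128)
with torch.no_grad():
    realness = disc(fake)
print(realness.shape)  # torch.Size([1, 1, 128, 128])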
 
basicsr/archs/duf_arch.py DELETED
@@ -1,276 +0,0 @@
1
- import numpy as np
2
- import torch
3
- from torch import nn as nn
4
- from torch.nn import functional as F
5
-
6
- from basicsr.utils.registry import ARCH_REGISTRY
7
-
8
-
9
- class DenseBlocksTemporalReduce(nn.Module):
10
- """A concatenation of 3 dense blocks with reduction in temporal dimension.
11
-
12
- Note that the output temporal dimension is 6 fewer than the input temporal dimension, since there are 3 blocks.
13
-
14
- Args:
15
- num_feat (int): Number of channels in the blocks. Default: 64.
16
- num_grow_ch (int): Growing factor of the dense blocks. Default: 32
17
- adapt_official_weights (bool): Whether to adapt the weights translated from the official implementation.
18
- Set to false if you want to train from scratch. Default: False.
19
- """
20
-
21
- def __init__(self, num_feat=64, num_grow_ch=32, adapt_official_weights=False):
22
- super(DenseBlocksTemporalReduce, self).__init__()
23
- if adapt_official_weights:
24
- eps = 1e-3
25
- momentum = 1e-3
26
- else: # pytorch default values
27
- eps = 1e-05
28
- momentum = 0.1
29
-
30
- self.temporal_reduce1 = nn.Sequential(
31
- nn.BatchNorm3d(num_feat, eps=eps, momentum=momentum), nn.ReLU(inplace=True),
32
- nn.Conv3d(num_feat, num_feat, (1, 1, 1), stride=(1, 1, 1), padding=(0, 0, 0), bias=True),
33
- nn.BatchNorm3d(num_feat, eps=eps, momentum=momentum), nn.ReLU(inplace=True),
34
- nn.Conv3d(num_feat, num_grow_ch, (3, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=True))
35
-
36
- self.temporal_reduce2 = nn.Sequential(
37
- nn.BatchNorm3d(num_feat + num_grow_ch, eps=eps, momentum=momentum), nn.ReLU(inplace=True),
38
- nn.Conv3d(
39
- num_feat + num_grow_ch,
40
- num_feat + num_grow_ch, (1, 1, 1),
41
- stride=(1, 1, 1),
42
- padding=(0, 0, 0),
43
- bias=True), nn.BatchNorm3d(num_feat + num_grow_ch, eps=eps, momentum=momentum), nn.ReLU(inplace=True),
44
- nn.Conv3d(num_feat + num_grow_ch, num_grow_ch, (3, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=True))
45
-
46
- self.temporal_reduce3 = nn.Sequential(
47
- nn.BatchNorm3d(num_feat + 2 * num_grow_ch, eps=eps, momentum=momentum), nn.ReLU(inplace=True),
48
- nn.Conv3d(
49
- num_feat + 2 * num_grow_ch,
50
- num_feat + 2 * num_grow_ch, (1, 1, 1),
51
- stride=(1, 1, 1),
52
- padding=(0, 0, 0),
53
- bias=True), nn.BatchNorm3d(num_feat + 2 * num_grow_ch, eps=eps, momentum=momentum),
54
- nn.ReLU(inplace=True),
55
- nn.Conv3d(
56
- num_feat + 2 * num_grow_ch, num_grow_ch, (3, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=True))
57
-
58
- def forward(self, x):
59
- """
60
- Args:
61
- x (Tensor): Input tensor with shape (b, num_feat, t, h, w).
62
-
63
- Returns:
64
- Tensor: Output with shape (b, num_feat + num_grow_ch * 3, 1, h, w).
65
- """
66
- x1 = self.temporal_reduce1(x)
67
- x1 = torch.cat((x[:, :, 1:-1, :, :], x1), 1)
68
-
69
- x2 = self.temporal_reduce2(x1)
70
- x2 = torch.cat((x1[:, :, 1:-1, :, :], x2), 1)
71
-
72
- x3 = self.temporal_reduce3(x2)
73
- x3 = torch.cat((x2[:, :, 1:-1, :, :], x3), 1)
74
-
75
- return x3
76
-
77
-
78
- class DenseBlocks(nn.Module):
79
- """ A concatenation of N dense blocks.
80
-
81
- Args:
82
- num_feat (int): Number of channels in the blocks. Default: 64.
83
- num_grow_ch (int): Growing factor of the dense blocks. Default: 32.
84
- num_block (int): Number of dense blocks. The values are:
85
- DUF-S (16 layers): 3
86
- DUF-M (28 layers): 9
87
- DUF-L (52 layers): 21
88
- adapt_official_weights (bool): Whether to adapt the weights translated from the official implementation.
89
- Set to false if you want to train from scratch. Default: False.
90
- """
91
-
92
- def __init__(self, num_block, num_feat=64, num_grow_ch=16, adapt_official_weights=False):
93
- super(DenseBlocks, self).__init__()
94
- if adapt_official_weights:
95
- eps = 1e-3
96
- momentum = 1e-3
97
- else: # pytorch default values
98
- eps = 1e-05
99
- momentum = 0.1
100
-
101
- self.dense_blocks = nn.ModuleList()
102
- for i in range(0, num_block):
103
- self.dense_blocks.append(
104
- nn.Sequential(
105
- nn.BatchNorm3d(num_feat + i * num_grow_ch, eps=eps, momentum=momentum), nn.ReLU(inplace=True),
106
- nn.Conv3d(
107
- num_feat + i * num_grow_ch,
108
- num_feat + i * num_grow_ch, (1, 1, 1),
109
- stride=(1, 1, 1),
110
- padding=(0, 0, 0),
111
- bias=True), nn.BatchNorm3d(num_feat + i * num_grow_ch, eps=eps, momentum=momentum),
112
- nn.ReLU(inplace=True),
113
- nn.Conv3d(
114
- num_feat + i * num_grow_ch,
115
- num_grow_ch, (3, 3, 3),
116
- stride=(1, 1, 1),
117
- padding=(1, 1, 1),
118
- bias=True)))
119
-
120
- def forward(self, x):
121
- """
122
- Args:
123
- x (Tensor): Input tensor with shape (b, num_feat, t, h, w).
124
-
125
- Returns:
126
- Tensor: Output with shape (b, num_feat + num_block * num_grow_ch, t, h, w).
127
- """
128
- for i in range(0, len(self.dense_blocks)):
129
- y = self.dense_blocks[i](x)
130
- x = torch.cat((x, y), 1)
131
- return x
132
-
133
-
134
- class DynamicUpsamplingFilter(nn.Module):
135
- """Dynamic upsampling filter used in DUF.
136
-
137
- Reference: https://github.com/yhjo09/VSR-DUF
138
-
139
- It only supports input with 3 channels and applies the same filters to all 3 channels.
140
-
141
- Args:
142
- filter_size (tuple): Filter size of generated filters. The shape is (kh, kw). Default: (5, 5).
143
- """
144
-
145
- def __init__(self, filter_size=(5, 5)):
146
- super(DynamicUpsamplingFilter, self).__init__()
147
- if not isinstance(filter_size, tuple):
148
- raise TypeError(f'The type of filter_size must be tuple, but got {type(filter_size)}')
149
- if len(filter_size) != 2:
150
- raise ValueError(f'The length of filter size must be 2, but got {len(filter_size)}.')
151
- # generate a local expansion filter, similar to im2col
152
- self.filter_size = filter_size
153
- filter_prod = np.prod(filter_size)
154
- expansion_filter = torch.eye(int(filter_prod)).view(filter_prod, 1, *filter_size) # (kh*kw, 1, kh, kw)
155
- self.expansion_filter = expansion_filter.repeat(3, 1, 1, 1) # repeat for all the 3 channels
156
-
157
- def forward(self, x, filters):
158
- """Forward function for DynamicUpsamplingFilter.
159
-
160
- Args:
161
- x (Tensor): Input image with 3 channels. The shape is (n, 3, h, w).
162
- filters (Tensor): Generated dynamic filters. The shape is (n, filter_prod, upsampling_square, h, w).
163
- filter_prod: prod of filter kernel size, e.g., 1*5*5=25.
164
- upsampling_square: similar to pixel shuffle, upsampling_square = upsampling * upsampling.
165
- e.g., for x 4 upsampling, upsampling_square= 4*4 = 16
166
-
167
- Returns:
168
- Tensor: Filtered image with shape (n, 3*upsampling_square, h, w)
169
- """
170
- n, filter_prod, upsampling_square, h, w = filters.size()
171
- kh, kw = self.filter_size
172
- expanded_input = F.conv2d(
173
- x, self.expansion_filter.to(x), padding=(kh // 2, kw // 2), groups=3) # (n, 3*filter_prod, h, w)
174
- expanded_input = expanded_input.view(n, 3, filter_prod, h, w).permute(0, 3, 4, 1,
175
- 2) # (n, h, w, 3, filter_prod)
176
- filters = filters.permute(0, 3, 4, 1, 2) # (n, h, w, filter_prod, upsampling_square]
177
- out = torch.matmul(expanded_input, filters) # (n, h, w, 3, upsampling_square)
178
- return out.permute(0, 3, 4, 1, 2).view(n, 3 * upsampling_square, h, w)
179
-
180
-
181
- @ARCH_REGISTRY.register()
182
- class DUF(nn.Module):
183
- """Network architecture for DUF
184
-
185
- ``Paper: Deep Video Super-Resolution Network Using Dynamic Upsampling Filters Without Explicit Motion Compensation``
186
-
187
- Reference: https://github.com/yhjo09/VSR-DUF
188
-
189
- For all the models below, 'adapt_official_weights' is only necessary when
190
- loading the weights converted from the official TensorFlow weights.
191
- Please set it to False if you are training the model from scratch.
192
-
193
- There are three models with different model sizes: DUF16Layers, DUF28Layers,
194
- and DUF52Layers. This class is the base class for these models.
195
-
196
- Args:
197
- scale (int): The upsampling factor. Default: 4.
198
- num_layer (int): The number of layers. Default: 52.
199
- adapt_official_weights (bool): Whether to adapt the weights
200
- translated from the official implementation. Set to false if you
201
- want to train from scratch. Default: False.
202
- """
203
-
204
- def __init__(self, scale=4, num_layer=52, adapt_official_weights=False):
205
- super(DUF, self).__init__()
206
- self.scale = scale
207
- if adapt_official_weights:
208
- eps = 1e-3
209
- momentum = 1e-3
210
- else: # pytorch default values
211
- eps = 1e-05
212
- momentum = 0.1
213
-
214
- self.conv3d1 = nn.Conv3d(3, 64, (1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=True)
215
- self.dynamic_filter = DynamicUpsamplingFilter((5, 5))
216
-
217
- if num_layer == 16:
218
- num_block = 3
219
- num_grow_ch = 32
220
- elif num_layer == 28:
221
- num_block = 9
222
- num_grow_ch = 16
223
- elif num_layer == 52:
224
- num_block = 21
225
- num_grow_ch = 16
226
- else:
227
- raise ValueError(f'Only supported (16, 28, 52) layers, but got {num_layer}.')
228
-
229
- self.dense_block1 = DenseBlocks(
230
- num_block=num_block, num_feat=64, num_grow_ch=num_grow_ch,
231
- adapt_official_weights=adapt_official_weights) # T = 7
232
- self.dense_block2 = DenseBlocksTemporalReduce(
233
- 64 + num_grow_ch * num_block, num_grow_ch, adapt_official_weights=adapt_official_weights) # T = 1
234
- channels = 64 + num_grow_ch * num_block + num_grow_ch * 3
235
- self.bn3d2 = nn.BatchNorm3d(channels, eps=eps, momentum=momentum)
236
- self.conv3d2 = nn.Conv3d(channels, 256, (1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1), bias=True)
237
-
238
- self.conv3d_r1 = nn.Conv3d(256, 256, (1, 1, 1), stride=(1, 1, 1), padding=(0, 0, 0), bias=True)
239
- self.conv3d_r2 = nn.Conv3d(256, 3 * (scale**2), (1, 1, 1), stride=(1, 1, 1), padding=(0, 0, 0), bias=True)
240
-
241
- self.conv3d_f1 = nn.Conv3d(256, 512, (1, 1, 1), stride=(1, 1, 1), padding=(0, 0, 0), bias=True)
242
- self.conv3d_f2 = nn.Conv3d(
243
- 512, 1 * 5 * 5 * (scale**2), (1, 1, 1), stride=(1, 1, 1), padding=(0, 0, 0), bias=True)
244
-
245
- def forward(self, x):
246
- """
247
- Args:
248
- x (Tensor): Input with shape (b, 7, c, h, w)
249
-
250
- Returns:
251
- Tensor: Output with shape (b, c, h * scale, w * scale)
252
- """
253
- num_batches, num_imgs, _, h, w = x.size()
254
-
255
- x = x.permute(0, 2, 1, 3, 4) # (b, c, 7, h, w) for Conv3D
256
- x_center = x[:, :, num_imgs // 2, :, :]
257
-
258
- x = self.conv3d1(x)
259
- x = self.dense_block1(x)
260
- x = self.dense_block2(x)
261
- x = F.relu(self.bn3d2(x), inplace=True)
262
- x = F.relu(self.conv3d2(x), inplace=True)
263
-
264
- # residual image
265
- res = self.conv3d_r2(F.relu(self.conv3d_r1(x), inplace=True))
266
-
267
- # filter
268
- filter_ = self.conv3d_f2(F.relu(self.conv3d_f1(x), inplace=True))
269
- filter_ = F.softmax(filter_.view(num_batches, 25, self.scale**2, h, w), dim=1)
270
-
271
- # dynamic filter
272
- out = self.dynamic_filter(x_center, filter_)
273
- out += res.squeeze_(2)
274
- out = F.pixel_shuffle(out, self.scale)
275
-
276
- return out
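
To see the DynamicUpsamplingFilter mechanics in isolation, here is a toy version of the same dynamic filtering step: 5x5 neighbourhoods are extracted with an identity (im2col-style) kernel and then weighted by per-pixel filters. The filters are random below purely for illustration; in DUF they are predicted by the network:

import torch
import torch.nn.functional as F

n, h, w, up_sq = 1, 8, 8, 16                                     # 4x upsampling -> 16 sub-pixel filters
x = torch.rand(n, 3, h, w)                                       # toy RGB input
filters = torch.softmax(torch.rand(n, 25, up_sq, h, w), dim=1)   # per-pixel 5x5 filters

expansion = torch.eye(25).view(25, 1, 5, 5).repeat(3, 1, 1, 1)   # identity "im2col" kernel
patches = F.conv2d(x, expansion, padding=2, groups=3)            # (n, 75, h, w)
patches = patches.view(n, 3, 25, h, w).permute(0, 3, 4, 1, 2)    # (n, h, w, 3, 25)
out = torch.matmul(patches, filters.permute(0, 3, 4, 1, 2))      # (n, h, w, 3, 16)
out = out.permute(0, 3, 4, 1, 2).reshape(n, 3 * up_sq, h, w)     # (n, 48, h, w)
print(F.pixel_shuffle(out, 4).shape)                             # torch.Size([1, 3, 32, 32])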
 
basicsr/archs/ecbsr_arch.py DELETED
@@ -1,275 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
- import torch.nn.functional as F
4
-
5
- from basicsr.utils.registry import ARCH_REGISTRY
6
-
7
-
8
- class SeqConv3x3(nn.Module):
9
- """The re-parameterizable block used in the ECBSR architecture.
10
-
11
- ``Paper: Edge-oriented Convolution Block for Real-time Super Resolution on Mobile Devices``
12
-
13
- Reference: https://github.com/xindongzhang/ECBSR
14
-
15
- Args:
16
- seq_type (str): Sequence type, option: conv1x1-conv3x3 | conv1x1-sobelx | conv1x1-sobely | conv1x1-laplacian.
17
- in_channels (int): Channel number of input.
18
- out_channels (int): Channel number of output.
19
- depth_multiplier (int): Width multiplier in the expand-and-squeeze conv. Default: 1.
20
- """
21
-
22
- def __init__(self, seq_type, in_channels, out_channels, depth_multiplier=1):
23
- super(SeqConv3x3, self).__init__()
24
- self.seq_type = seq_type
25
- self.in_channels = in_channels
26
- self.out_channels = out_channels
27
-
28
- if self.seq_type == 'conv1x1-conv3x3':
29
- self.mid_planes = int(out_channels * depth_multiplier)
30
- conv0 = torch.nn.Conv2d(self.in_channels, self.mid_planes, kernel_size=1, padding=0)
31
- self.k0 = conv0.weight
32
- self.b0 = conv0.bias
33
-
34
- conv1 = torch.nn.Conv2d(self.mid_planes, self.out_channels, kernel_size=3)
35
- self.k1 = conv1.weight
36
- self.b1 = conv1.bias
37
-
38
- elif self.seq_type == 'conv1x1-sobelx':
39
- conv0 = torch.nn.Conv2d(self.in_channels, self.out_channels, kernel_size=1, padding=0)
40
- self.k0 = conv0.weight
41
- self.b0 = conv0.bias
42
-
43
- # init scale and bias
44
- scale = torch.randn(size=(self.out_channels, 1, 1, 1)) * 1e-3
45
- self.scale = nn.Parameter(scale)
46
- bias = torch.randn(self.out_channels) * 1e-3
47
- bias = torch.reshape(bias, (self.out_channels, ))
48
- self.bias = nn.Parameter(bias)
49
- # init mask
50
- self.mask = torch.zeros((self.out_channels, 1, 3, 3), dtype=torch.float32)
51
- for i in range(self.out_channels):
52
- self.mask[i, 0, 0, 0] = 1.0
53
- self.mask[i, 0, 1, 0] = 2.0
54
- self.mask[i, 0, 2, 0] = 1.0
55
- self.mask[i, 0, 0, 2] = -1.0
56
- self.mask[i, 0, 1, 2] = -2.0
57
- self.mask[i, 0, 2, 2] = -1.0
58
- self.mask = nn.Parameter(data=self.mask, requires_grad=False)
59
-
60
- elif self.seq_type == 'conv1x1-sobely':
61
- conv0 = torch.nn.Conv2d(self.in_channels, self.out_channels, kernel_size=1, padding=0)
62
- self.k0 = conv0.weight
63
- self.b0 = conv0.bias
64
-
65
- # init scale and bias
66
- scale = torch.randn(size=(self.out_channels, 1, 1, 1)) * 1e-3
67
- self.scale = nn.Parameter(torch.FloatTensor(scale))
68
- bias = torch.randn(self.out_channels) * 1e-3
69
- bias = torch.reshape(bias, (self.out_channels, ))
70
- self.bias = nn.Parameter(torch.FloatTensor(bias))
71
- # init mask
72
- self.mask = torch.zeros((self.out_channels, 1, 3, 3), dtype=torch.float32)
73
- for i in range(self.out_channels):
74
- self.mask[i, 0, 0, 0] = 1.0
75
- self.mask[i, 0, 0, 1] = 2.0
76
- self.mask[i, 0, 0, 2] = 1.0
77
- self.mask[i, 0, 2, 0] = -1.0
78
- self.mask[i, 0, 2, 1] = -2.0
79
- self.mask[i, 0, 2, 2] = -1.0
80
- self.mask = nn.Parameter(data=self.mask, requires_grad=False)
81
-
82
- elif self.seq_type == 'conv1x1-laplacian':
83
- conv0 = torch.nn.Conv2d(self.in_channels, self.out_channels, kernel_size=1, padding=0)
84
- self.k0 = conv0.weight
85
- self.b0 = conv0.bias
86
-
87
- # init scale and bias
88
- scale = torch.randn(size=(self.out_channels, 1, 1, 1)) * 1e-3
89
- self.scale = nn.Parameter(torch.FloatTensor(scale))
90
- bias = torch.randn(self.out_channels) * 1e-3
91
- bias = torch.reshape(bias, (self.out_channels, ))
92
- self.bias = nn.Parameter(torch.FloatTensor(bias))
93
- # init mask
94
- self.mask = torch.zeros((self.out_channels, 1, 3, 3), dtype=torch.float32)
95
- for i in range(self.out_channels):
96
- self.mask[i, 0, 0, 1] = 1.0
97
- self.mask[i, 0, 1, 0] = 1.0
98
- self.mask[i, 0, 1, 2] = 1.0
99
- self.mask[i, 0, 2, 1] = 1.0
100
- self.mask[i, 0, 1, 1] = -4.0
101
- self.mask = nn.Parameter(data=self.mask, requires_grad=False)
102
- else:
103
- raise ValueError('The type of seqconv is not supported!')
104
-
105
- def forward(self, x):
106
- if self.seq_type == 'conv1x1-conv3x3':
107
- # conv-1x1
108
- y0 = F.conv2d(input=x, weight=self.k0, bias=self.b0, stride=1)
109
- # explicitly padding with bias
110
- y0 = F.pad(y0, (1, 1, 1, 1), 'constant', 0)
111
- b0_pad = self.b0.view(1, -1, 1, 1)
112
- y0[:, :, 0:1, :] = b0_pad
113
- y0[:, :, -1:, :] = b0_pad
114
- y0[:, :, :, 0:1] = b0_pad
115
- y0[:, :, :, -1:] = b0_pad
116
- # conv-3x3
117
- y1 = F.conv2d(input=y0, weight=self.k1, bias=self.b1, stride=1)
118
- else:
119
- y0 = F.conv2d(input=x, weight=self.k0, bias=self.b0, stride=1)
120
- # explicitly padding with bias
121
- y0 = F.pad(y0, (1, 1, 1, 1), 'constant', 0)
122
- b0_pad = self.b0.view(1, -1, 1, 1)
123
- y0[:, :, 0:1, :] = b0_pad
124
- y0[:, :, -1:, :] = b0_pad
125
- y0[:, :, :, 0:1] = b0_pad
126
- y0[:, :, :, -1:] = b0_pad
127
- # conv-3x3
128
- y1 = F.conv2d(input=y0, weight=self.scale * self.mask, bias=self.bias, stride=1, groups=self.out_channels)
129
- return y1
130
-
131
- def rep_params(self):
132
- device = self.k0.get_device()
133
- if device < 0:
134
- device = None
135
-
136
- if self.seq_type == 'conv1x1-conv3x3':
137
- # re-param conv kernel
138
- rep_weight = F.conv2d(input=self.k1, weight=self.k0.permute(1, 0, 2, 3))
139
- # re-param conv bias
140
- rep_bias = torch.ones(1, self.mid_planes, 3, 3, device=device) * self.b0.view(1, -1, 1, 1)
141
- rep_bias = F.conv2d(input=rep_bias, weight=self.k1).view(-1, ) + self.b1
142
- else:
143
- tmp = self.scale * self.mask
144
- k1 = torch.zeros((self.out_channels, self.out_channels, 3, 3), device=device)
145
- for i in range(self.out_channels):
146
- k1[i, i, :, :] = tmp[i, 0, :, :]
147
- b1 = self.bias
148
- # re-param conv kernel
149
- rep_weight = F.conv2d(input=k1, weight=self.k0.permute(1, 0, 2, 3))
150
- # re-param conv bias
151
- rep_bias = torch.ones(1, self.out_channels, 3, 3, device=device) * self.b0.view(1, -1, 1, 1)
152
- rep_bias = F.conv2d(input=rep_bias, weight=k1).view(-1, ) + b1
153
- return rep_weight, rep_bias
154
-
155
-
156
- class ECB(nn.Module):
157
- """The ECB block used in the ECBSR architecture.
158
-
159
- Paper: Edge-oriented Convolution Block for Real-time Super Resolution on Mobile Devices
160
- Ref git repo: https://github.com/xindongzhang/ECBSR
161
-
162
- Args:
163
- in_channels (int): Channel number of input.
164
- out_channels (int): Channel number of output.
165
- depth_multiplier (int): Width multiplier in the expand-and-squeeze conv. Default: 1.
166
- act_type (str): Activation type. Option: prelu | relu | rrelu | softplus | linear. Default: prelu.
167
- with_idt (bool): Whether to use identity connection. Default: False.
168
- """
169
-
170
- def __init__(self, in_channels, out_channels, depth_multiplier, act_type='prelu', with_idt=False):
171
- super(ECB, self).__init__()
172
-
173
- self.depth_multiplier = depth_multiplier
174
- self.in_channels = in_channels
175
- self.out_channels = out_channels
176
- self.act_type = act_type
177
-
178
- if with_idt and (self.in_channels == self.out_channels):
179
- self.with_idt = True
180
- else:
181
- self.with_idt = False
182
-
183
- self.conv3x3 = torch.nn.Conv2d(self.in_channels, self.out_channels, kernel_size=3, padding=1)
184
- self.conv1x1_3x3 = SeqConv3x3('conv1x1-conv3x3', self.in_channels, self.out_channels, self.depth_multiplier)
185
- self.conv1x1_sbx = SeqConv3x3('conv1x1-sobelx', self.in_channels, self.out_channels)
186
- self.conv1x1_sby = SeqConv3x3('conv1x1-sobely', self.in_channels, self.out_channels)
187
- self.conv1x1_lpl = SeqConv3x3('conv1x1-laplacian', self.in_channels, self.out_channels)
188
-
189
- if self.act_type == 'prelu':
190
- self.act = nn.PReLU(num_parameters=self.out_channels)
191
- elif self.act_type == 'relu':
192
- self.act = nn.ReLU(inplace=True)
193
- elif self.act_type == 'rrelu':
194
- self.act = nn.RReLU(lower=-0.05, upper=0.05)
195
- elif self.act_type == 'softplus':
196
- self.act = nn.Softplus()
197
- elif self.act_type == 'linear':
198
- pass
199
- else:
200
- raise ValueError('The type of activation is not supported!')
201
-
202
- def forward(self, x):
203
- if self.training:
204
- y = self.conv3x3(x) + self.conv1x1_3x3(x) + self.conv1x1_sbx(x) + self.conv1x1_sby(x) + self.conv1x1_lpl(x)
205
- if self.with_idt:
206
- y += x
207
- else:
208
- rep_weight, rep_bias = self.rep_params()
209
- y = F.conv2d(input=x, weight=rep_weight, bias=rep_bias, stride=1, padding=1)
210
- if self.act_type != 'linear':
211
- y = self.act(y)
212
- return y
213
-
214
- def rep_params(self):
215
- weight0, bias0 = self.conv3x3.weight, self.conv3x3.bias
216
- weight1, bias1 = self.conv1x1_3x3.rep_params()
217
- weight2, bias2 = self.conv1x1_sbx.rep_params()
218
- weight3, bias3 = self.conv1x1_sby.rep_params()
219
- weight4, bias4 = self.conv1x1_lpl.rep_params()
220
- rep_weight, rep_bias = (weight0 + weight1 + weight2 + weight3 + weight4), (
221
- bias0 + bias1 + bias2 + bias3 + bias4)
222
-
223
- if self.with_idt:
224
- device = rep_weight.get_device()
225
- if device < 0:
226
- device = None
227
- weight_idt = torch.zeros(self.out_channels, self.out_channels, 3, 3, device=device)
228
- for i in range(self.out_channels):
229
- weight_idt[i, i, 1, 1] = 1.0
230
- bias_idt = 0.0
231
- rep_weight, rep_bias = rep_weight + weight_idt, rep_bias + bias_idt
232
- return rep_weight, rep_bias
233
-
234
-
235
- @ARCH_REGISTRY.register()
236
- class ECBSR(nn.Module):
237
- """ECBSR architecture.
238
-
239
- Paper: Edge-oriented Convolution Block for Real-time Super Resolution on Mobile Devices
240
- Ref git repo: https://github.com/xindongzhang/ECBSR
241
-
242
- Args:
243
- num_in_ch (int): Channel number of inputs.
244
- num_out_ch (int): Channel number of outputs.
245
- num_block (int): Block number in the trunk network.
246
- num_channel (int): Channel number.
247
- with_idt (bool): Whether to use identity connections in the convolution layers.
248
- act_type (str): Activation type.
249
- scale (int): Upsampling factor.
250
- """
251
-
252
- def __init__(self, num_in_ch, num_out_ch, num_block, num_channel, with_idt, act_type, scale):
253
- super(ECBSR, self).__init__()
254
- self.num_in_ch = num_in_ch
255
- self.scale = scale
256
-
257
- backbone = []
258
- backbone += [ECB(num_in_ch, num_channel, depth_multiplier=2.0, act_type=act_type, with_idt=with_idt)]
259
- for _ in range(num_block):
260
- backbone += [ECB(num_channel, num_channel, depth_multiplier=2.0, act_type=act_type, with_idt=with_idt)]
261
- backbone += [
262
- ECB(num_channel, num_out_ch * scale * scale, depth_multiplier=2.0, act_type='linear', with_idt=with_idt)
263
- ]
264
-
265
- self.backbone = nn.Sequential(*backbone)
266
- self.upsampler = nn.PixelShuffle(scale)
267
-
268
- def forward(self, x):
269
- if self.num_in_ch > 1:
270
- shortcut = torch.repeat_interleave(x, self.scale * self.scale, dim=1)
271
- else:
272
- shortcut = x # will repeat the input in the channel dimension (repeat scale * scale times)
273
- y = self.backbone(x) + shortcut
274
- y = self.upsampler(y)
275
- return y
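
The point of the ECB design above is that the five training-time branches collapse into a single 3x3 convolution at inference. A sanity-check sketch, assuming an environment where this ecbsr_arch module is still importable from basicsr; train-mode and eval-mode outputs should agree up to floating-point error:

import torch
from basicsr.archs.ecbsr_arch import ECB  # assumes basicsr provides this module

block = ECB(in_channels=8, out_channels=8, depth_multiplier=2.0, act_type='linear')
x = torch.rand(1, 8, 16, 16)

with torch.no_grad():
    block.train()
    y_branches = block(x)   # sum of the five parallel branches
    block.eval()
    y_reparam = block(x)    # single re-parameterised 3x3 convolution

print(torch.allclose(y_branches, y_reparam, atol=1e-4))  # expected: True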
 
basicsr/archs/edsr_arch.py DELETED
@@ -1,61 +0,0 @@
1
- import torch
2
- from torch import nn as nn
3
-
4
- from basicsr.archs.arch_util import ResidualBlockNoBN, Upsample, make_layer
5
- from basicsr.utils.registry import ARCH_REGISTRY
6
-
7
-
8
- @ARCH_REGISTRY.register()
9
- class EDSR(nn.Module):
10
- """EDSR network structure.
11
-
12
- Paper: Enhanced Deep Residual Networks for Single Image Super-Resolution.
13
- Ref git repo: https://github.com/thstkdgus35/EDSR-PyTorch
14
-
15
- Args:
16
- num_in_ch (int): Channel number of inputs.
17
- num_out_ch (int): Channel number of outputs.
18
- num_feat (int): Channel number of intermediate features.
19
- Default: 64.
20
- num_block (int): Block number in the trunk network. Default: 16.
21
- upscale (int): Upsampling factor. Supported factors: 2^n and 3.
22
- Default: 4.
23
- res_scale (float): Used to scale the residual in residual block.
24
- Default: 1.
25
- img_range (float): Image range. Default: 255.
26
- rgb_mean (tuple[float]): Image mean in RGB order.
27
- Default: (0.4488, 0.4371, 0.4040), calculated from DIV2K dataset.
28
- """
29
-
30
- def __init__(self,
31
- num_in_ch,
32
- num_out_ch,
33
- num_feat=64,
34
- num_block=16,
35
- upscale=4,
36
- res_scale=1,
37
- img_range=255.,
38
- rgb_mean=(0.4488, 0.4371, 0.4040)):
39
- super(EDSR, self).__init__()
40
-
41
- self.img_range = img_range
42
- self.mean = torch.Tensor(rgb_mean).view(1, 3, 1, 1)
43
-
44
- self.conv_first = nn.Conv2d(num_in_ch, num_feat, 3, 1, 1)
45
- self.body = make_layer(ResidualBlockNoBN, num_block, num_feat=num_feat, res_scale=res_scale, pytorch_init=True)
46
- self.conv_after_body = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
47
- self.upsample = Upsample(upscale, num_feat)
48
- self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
49
-
50
- def forward(self, x):
51
- self.mean = self.mean.type_as(x)
52
-
53
- x = (x - self.mean) * self.img_range
54
- x = self.conv_first(x)
55
- res = self.conv_after_body(self.body(x))
56
- res += x
57
-
58
- x = self.conv_last(self.upsample(res))
59
- x = x / self.img_range + self.mean
60
-
61
- return x
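
Finally, a minimal forward-pass sketch for the EDSR network above (again assuming a basicsr installation that still contains this file): a 32x32 low-resolution RGB tensor comes out four times larger:

import torch
from basicsr.archs.edsr_arch import EDSR  # assumes basicsr is installed

model = EDSR(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=16, upscale=4)
lr = torch.rand(1, 3, 32, 32)
with torch.no_grad():
    sr = model(lr)
print(sr.shape)  # torch.Size([1, 3, 128, 128])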
 
basicsr/archs/edvr_arch.py DELETED
@@ -1,382 +0,0 @@
1
- import torch
2
- from torch import nn as nn
3
- from torch.nn import functional as F
4
-
5
- from basicsr.utils.registry import ARCH_REGISTRY
6
- from .arch_util import DCNv2Pack, ResidualBlockNoBN, make_layer
7
-
8
-
9
- class PCDAlignment(nn.Module):
10
- """Alignment module using Pyramid, Cascading and Deformable convolution
11
- (PCD). It is used in EDVR.
12
-
13
- ``Paper: EDVR: Video Restoration with Enhanced Deformable Convolutional Networks``
14
-
15
- Args:
16
- num_feat (int): Channel number of middle features. Default: 64.
17
- deformable_groups (int): Deformable groups. Default: 8.
18
- """
19
-
20
- def __init__(self, num_feat=64, deformable_groups=8):
21
- super(PCDAlignment, self).__init__()
22
-
23
- # Pyramid has three levels:
24
- # L3: level 3, 1/4 spatial size
25
- # L2: level 2, 1/2 spatial size
26
- # L1: level 1, original spatial size
27
- self.offset_conv1 = nn.ModuleDict()
28
- self.offset_conv2 = nn.ModuleDict()
29
- self.offset_conv3 = nn.ModuleDict()
30
- self.dcn_pack = nn.ModuleDict()
31
- self.feat_conv = nn.ModuleDict()
32
-
33
- # Pyramids
34
- for i in range(3, 0, -1):
35
- level = f'l{i}'
36
- self.offset_conv1[level] = nn.Conv2d(num_feat * 2, num_feat, 3, 1, 1)
37
- if i == 3:
38
- self.offset_conv2[level] = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
39
- else:
40
- self.offset_conv2[level] = nn.Conv2d(num_feat * 2, num_feat, 3, 1, 1)
41
- self.offset_conv3[level] = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
42
- self.dcn_pack[level] = DCNv2Pack(num_feat, num_feat, 3, padding=1, deformable_groups=deformable_groups)
43
-
44
- if i < 3:
45
- self.feat_conv[level] = nn.Conv2d(num_feat * 2, num_feat, 3, 1, 1)
46
-
47
- # Cascading dcn
48
- self.cas_offset_conv1 = nn.Conv2d(num_feat * 2, num_feat, 3, 1, 1)
49
- self.cas_offset_conv2 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
50
- self.cas_dcnpack = DCNv2Pack(num_feat, num_feat, 3, padding=1, deformable_groups=deformable_groups)
51
-
52
- self.upsample = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False)
53
- self.lrelu = nn.LeakyReLU(negative_slope=0.1, inplace=True)
54
-
55
- def forward(self, nbr_feat_l, ref_feat_l):
56
- """Align neighboring frame features to the reference frame features.
57
-
58
- Args:
59
- nbr_feat_l (list[Tensor]): Neighboring feature list. It
60
- contains three pyramid levels (L1, L2, L3),
61
- each with shape (b, c, h, w).
62
- ref_feat_l (list[Tensor]): Reference feature list. It
63
- contains three pyramid levels (L1, L2, L3),
64
- each with shape (b, c, h, w).
65
-
66
- Returns:
67
- Tensor: Aligned features.
68
- """
69
- # Pyramids
70
- upsampled_offset, upsampled_feat = None, None
71
- for i in range(3, 0, -1):
72
- level = f'l{i}'
73
- offset = torch.cat([nbr_feat_l[i - 1], ref_feat_l[i - 1]], dim=1)
74
- offset = self.lrelu(self.offset_conv1[level](offset))
75
- if i == 3:
76
- offset = self.lrelu(self.offset_conv2[level](offset))
77
- else:
78
- offset = self.lrelu(self.offset_conv2[level](torch.cat([offset, upsampled_offset], dim=1)))
79
- offset = self.lrelu(self.offset_conv3[level](offset))
80
-
81
- feat = self.dcn_pack[level](nbr_feat_l[i - 1], offset)
82
- if i < 3:
83
- feat = self.feat_conv[level](torch.cat([feat, upsampled_feat], dim=1))
84
- if i > 1:
85
- feat = self.lrelu(feat)
86
-
87
- if i > 1: # upsample offset and features
88
- # x2: when we upsample the offset, we should also enlarge
89
- # the magnitude.
90
- upsampled_offset = self.upsample(offset) * 2
91
- upsampled_feat = self.upsample(feat)
92
-
93
- # Cascading
94
- offset = torch.cat([feat, ref_feat_l[0]], dim=1)
95
- offset = self.lrelu(self.cas_offset_conv2(self.lrelu(self.cas_offset_conv1(offset))))
96
- feat = self.lrelu(self.cas_dcnpack(feat, offset))
97
- return feat
98
-
99
-
100
- class TSAFusion(nn.Module):
101
- """Temporal Spatial Attention (TSA) fusion module.
102
-
103
- Temporal: Calculate the correlation between center frame and
104
- neighboring frames;
105
- Spatial: It has 3 pyramid levels, the attention is similar to SFT.
106
- (SFT: Recovering realistic texture in image super-resolution by deep
107
- spatial feature transform.)
108
-
109
- Args:
110
- num_feat (int): Channel number of middle features. Default: 64.
111
- num_frame (int): Number of frames. Default: 5.
112
- center_frame_idx (int): The index of center frame. Default: 2.
113
- """
114
-
115
- def __init__(self, num_feat=64, num_frame=5, center_frame_idx=2):
116
- super(TSAFusion, self).__init__()
117
- self.center_frame_idx = center_frame_idx
118
- # temporal attention (before fusion conv)
119
- self.temporal_attn1 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
120
- self.temporal_attn2 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
121
- self.feat_fusion = nn.Conv2d(num_frame * num_feat, num_feat, 1, 1)
122
-
123
- # spatial attention (after fusion conv)
124
- self.max_pool = nn.MaxPool2d(3, stride=2, padding=1)
125
- self.avg_pool = nn.AvgPool2d(3, stride=2, padding=1)
126
- self.spatial_attn1 = nn.Conv2d(num_frame * num_feat, num_feat, 1)
127
- self.spatial_attn2 = nn.Conv2d(num_feat * 2, num_feat, 1)
128
- self.spatial_attn3 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
129
- self.spatial_attn4 = nn.Conv2d(num_feat, num_feat, 1)
130
- self.spatial_attn5 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
131
- self.spatial_attn_l1 = nn.Conv2d(num_feat, num_feat, 1)
132
- self.spatial_attn_l2 = nn.Conv2d(num_feat * 2, num_feat, 3, 1, 1)
133
- self.spatial_attn_l3 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
134
- self.spatial_attn_add1 = nn.Conv2d(num_feat, num_feat, 1)
135
- self.spatial_attn_add2 = nn.Conv2d(num_feat, num_feat, 1)
136
-
137
- self.lrelu = nn.LeakyReLU(negative_slope=0.1, inplace=True)
138
- self.upsample = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False)
139
-
140
- def forward(self, aligned_feat):
141
- """
142
- Args:
143
- aligned_feat (Tensor): Aligned features with shape (b, t, c, h, w).
144
-
145
- Returns:
146
- Tensor: Features after TSA with the shape (b, c, h, w).
147
- """
148
- b, t, c, h, w = aligned_feat.size()
149
- # temporal attention
150
- embedding_ref = self.temporal_attn1(aligned_feat[:, self.center_frame_idx, :, :, :].clone())
151
- embedding = self.temporal_attn2(aligned_feat.view(-1, c, h, w))
152
- embedding = embedding.view(b, t, -1, h, w) # (b, t, c, h, w)
153
-
154
- corr_l = [] # correlation list
155
- for i in range(t):
156
- emb_neighbor = embedding[:, i, :, :, :]
157
- corr = torch.sum(emb_neighbor * embedding_ref, 1) # (b, h, w)
158
- corr_l.append(corr.unsqueeze(1)) # (b, 1, h, w)
159
- corr_prob = torch.sigmoid(torch.cat(corr_l, dim=1)) # (b, t, h, w)
160
- corr_prob = corr_prob.unsqueeze(2).expand(b, t, c, h, w)
161
- corr_prob = corr_prob.contiguous().view(b, -1, h, w) # (b, t*c, h, w)
162
- aligned_feat = aligned_feat.view(b, -1, h, w) * corr_prob
163
-
164
- # fusion
165
- feat = self.lrelu(self.feat_fusion(aligned_feat))
166
-
167
- # spatial attention
168
- attn = self.lrelu(self.spatial_attn1(aligned_feat))
169
- attn_max = self.max_pool(attn)
170
- attn_avg = self.avg_pool(attn)
171
- attn = self.lrelu(self.spatial_attn2(torch.cat([attn_max, attn_avg], dim=1)))
172
- # pyramid levels
173
- attn_level = self.lrelu(self.spatial_attn_l1(attn))
174
- attn_max = self.max_pool(attn_level)
175
- attn_avg = self.avg_pool(attn_level)
176
- attn_level = self.lrelu(self.spatial_attn_l2(torch.cat([attn_max, attn_avg], dim=1)))
177
- attn_level = self.lrelu(self.spatial_attn_l3(attn_level))
178
- attn_level = self.upsample(attn_level)
179
-
180
- attn = self.lrelu(self.spatial_attn3(attn)) + attn_level
181
- attn = self.lrelu(self.spatial_attn4(attn))
182
- attn = self.upsample(attn)
183
- attn = self.spatial_attn5(attn)
184
- attn_add = self.spatial_attn_add2(self.lrelu(self.spatial_attn_add1(attn)))
185
- attn = torch.sigmoid(attn)
186
-
187
- # after initialization, * 2 makes (attn * 2) to be close to 1.
188
- feat = feat * attn * 2 + attn_add
189
- return feat
190
-
191
-
192
- class PredeblurModule(nn.Module):
193
- """Pre-deblur module.
194
-
195
- Args:
196
- num_in_ch (int): Channel number of input image. Default: 3.
197
- num_feat (int): Channel number of intermediate features. Default: 64.
198
- hr_in (bool): Whether the input has high resolution. Default: False.
199
- """
200
-
201
- def __init__(self, num_in_ch=3, num_feat=64, hr_in=False):
202
- super(PredeblurModule, self).__init__()
203
- self.hr_in = hr_in
204
-
205
- self.conv_first = nn.Conv2d(num_in_ch, num_feat, 3, 1, 1)
206
- if self.hr_in:
207
- # downsample x4 by stride conv
208
- self.stride_conv_hr1 = nn.Conv2d(num_feat, num_feat, 3, 2, 1)
209
- self.stride_conv_hr2 = nn.Conv2d(num_feat, num_feat, 3, 2, 1)
210
-
211
- # generate feature pyramid
212
- self.stride_conv_l2 = nn.Conv2d(num_feat, num_feat, 3, 2, 1)
213
- self.stride_conv_l3 = nn.Conv2d(num_feat, num_feat, 3, 2, 1)
214
-
215
- self.resblock_l3 = ResidualBlockNoBN(num_feat=num_feat)
216
- self.resblock_l2_1 = ResidualBlockNoBN(num_feat=num_feat)
217
- self.resblock_l2_2 = ResidualBlockNoBN(num_feat=num_feat)
218
- self.resblock_l1 = nn.ModuleList([ResidualBlockNoBN(num_feat=num_feat) for i in range(5)])
219
-
220
- self.upsample = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False)
221
- self.lrelu = nn.LeakyReLU(negative_slope=0.1, inplace=True)
222
-
223
- def forward(self, x):
224
- feat_l1 = self.lrelu(self.conv_first(x))
225
- if self.hr_in:
226
- feat_l1 = self.lrelu(self.stride_conv_hr1(feat_l1))
227
- feat_l1 = self.lrelu(self.stride_conv_hr2(feat_l1))
228
-
229
- # generate feature pyramid
230
- feat_l2 = self.lrelu(self.stride_conv_l2(feat_l1))
231
- feat_l3 = self.lrelu(self.stride_conv_l3(feat_l2))
232
-
233
- feat_l3 = self.upsample(self.resblock_l3(feat_l3))
234
- feat_l2 = self.resblock_l2_1(feat_l2) + feat_l3
235
- feat_l2 = self.upsample(self.resblock_l2_2(feat_l2))
236
-
237
- for i in range(2):
238
- feat_l1 = self.resblock_l1[i](feat_l1)
239
- feat_l1 = feat_l1 + feat_l2
240
- for i in range(2, 5):
241
- feat_l1 = self.resblock_l1[i](feat_l1)
242
- return feat_l1
243
-
244
-
245
- @ARCH_REGISTRY.register()
246
- class EDVR(nn.Module):
247
- """EDVR network structure for video super-resolution.
248
-
249
- Now only support X4 upsampling factor.
250
-
251
- ``Paper: EDVR: Video Restoration with Enhanced Deformable Convolutional Networks``
252
-
253
- Args:
254
- num_in_ch (int): Channel number of input image. Default: 3.
255
- num_out_ch (int): Channel number of output image. Default: 3.
256
- num_feat (int): Channel number of intermediate features. Default: 64.
257
- num_frame (int): Number of input frames. Default: 5.
258
- deformable_groups (int): Deformable groups. Defaults: 8.
259
- num_extract_block (int): Number of blocks for feature extraction.
260
- Default: 5.
261
- num_reconstruct_block (int): Number of blocks for reconstruction.
262
- Default: 10.
263
- center_frame_idx (int): The index of center frame. Frame counting from
264
- 0. Default: Middle of input frames.
265
- hr_in (bool): Whether the input has high resolution. Default: False.
266
- with_predeblur (bool): Whether has predeblur module.
267
- Default: False.
268
- with_tsa (bool): Whether has TSA module. Default: True.
269
- """
270
-
271
- def __init__(self,
272
- num_in_ch=3,
273
- num_out_ch=3,
274
- num_feat=64,
275
- num_frame=5,
276
- deformable_groups=8,
277
- num_extract_block=5,
278
- num_reconstruct_block=10,
279
- center_frame_idx=None,
280
- hr_in=False,
281
- with_predeblur=False,
282
- with_tsa=True):
283
- super(EDVR, self).__init__()
284
- if center_frame_idx is None:
285
- self.center_frame_idx = num_frame // 2
286
- else:
287
- self.center_frame_idx = center_frame_idx
288
- self.hr_in = hr_in
289
- self.with_predeblur = with_predeblur
290
- self.with_tsa = with_tsa
291
-
292
- # extract features for each frame
293
- if self.with_predeblur:
294
- self.predeblur = PredeblurModule(num_feat=num_feat, hr_in=self.hr_in)
295
- self.conv_1x1 = nn.Conv2d(num_feat, num_feat, 1, 1)
296
- else:
297
- self.conv_first = nn.Conv2d(num_in_ch, num_feat, 3, 1, 1)
298
-
299
- # extract pyramid features
300
- self.feature_extraction = make_layer(ResidualBlockNoBN, num_extract_block, num_feat=num_feat)
301
- self.conv_l2_1 = nn.Conv2d(num_feat, num_feat, 3, 2, 1)
302
- self.conv_l2_2 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
303
- self.conv_l3_1 = nn.Conv2d(num_feat, num_feat, 3, 2, 1)
304
- self.conv_l3_2 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
305
-
306
- # pcd and tsa module
307
- self.pcd_align = PCDAlignment(num_feat=num_feat, deformable_groups=deformable_groups)
308
- if self.with_tsa:
309
- self.fusion = TSAFusion(num_feat=num_feat, num_frame=num_frame, center_frame_idx=self.center_frame_idx)
310
- else:
311
- self.fusion = nn.Conv2d(num_frame * num_feat, num_feat, 1, 1)
312
-
313
- # reconstruction
314
- self.reconstruction = make_layer(ResidualBlockNoBN, num_reconstruct_block, num_feat=num_feat)
315
- # upsample
316
- self.upconv1 = nn.Conv2d(num_feat, num_feat * 4, 3, 1, 1)
317
- self.upconv2 = nn.Conv2d(num_feat, 64 * 4, 3, 1, 1)
318
- self.pixel_shuffle = nn.PixelShuffle(2)
319
- self.conv_hr = nn.Conv2d(64, 64, 3, 1, 1)
320
- self.conv_last = nn.Conv2d(64, 3, 3, 1, 1)
321
-
322
- # activation function
323
- self.lrelu = nn.LeakyReLU(negative_slope=0.1, inplace=True)
324
-
325
- def forward(self, x):
326
- b, t, c, h, w = x.size()
327
- if self.hr_in:
328
- assert h % 16 == 0 and w % 16 == 0, ('The height and width must be multiple of 16.')
329
- else:
330
- assert h % 4 == 0 and w % 4 == 0, ('The height and width must be multiple of 4.')
331
-
332
- x_center = x[:, self.center_frame_idx, :, :, :].contiguous()
333
-
334
- # extract features for each frame
335
- # L1
336
- if self.with_predeblur:
337
- feat_l1 = self.conv_1x1(self.predeblur(x.view(-1, c, h, w)))
338
- if self.hr_in:
339
- h, w = h // 4, w // 4
340
- else:
341
- feat_l1 = self.lrelu(self.conv_first(x.view(-1, c, h, w)))
342
-
343
- feat_l1 = self.feature_extraction(feat_l1)
344
- # L2
345
- feat_l2 = self.lrelu(self.conv_l2_1(feat_l1))
346
- feat_l2 = self.lrelu(self.conv_l2_2(feat_l2))
347
- # L3
348
- feat_l3 = self.lrelu(self.conv_l3_1(feat_l2))
349
- feat_l3 = self.lrelu(self.conv_l3_2(feat_l3))
350
-
351
- feat_l1 = feat_l1.view(b, t, -1, h, w)
352
- feat_l2 = feat_l2.view(b, t, -1, h // 2, w // 2)
353
- feat_l3 = feat_l3.view(b, t, -1, h // 4, w // 4)
354
-
355
- # PCD alignment
356
- ref_feat_l = [ # reference feature list
357
- feat_l1[:, self.center_frame_idx, :, :, :].clone(), feat_l2[:, self.center_frame_idx, :, :, :].clone(),
358
- feat_l3[:, self.center_frame_idx, :, :, :].clone()
359
- ]
360
- aligned_feat = []
361
- for i in range(t):
362
- nbr_feat_l = [ # neighboring feature list
363
- feat_l1[:, i, :, :, :].clone(), feat_l2[:, i, :, :, :].clone(), feat_l3[:, i, :, :, :].clone()
364
- ]
365
- aligned_feat.append(self.pcd_align(nbr_feat_l, ref_feat_l))
366
- aligned_feat = torch.stack(aligned_feat, dim=1) # (b, t, c, h, w)
367
-
368
- if not self.with_tsa:
369
- aligned_feat = aligned_feat.view(b, -1, h, w)
370
- feat = self.fusion(aligned_feat)
371
-
372
- out = self.reconstruction(feat)
373
- out = self.lrelu(self.pixel_shuffle(self.upconv1(out)))
374
- out = self.lrelu(self.pixel_shuffle(self.upconv2(out)))
375
- out = self.lrelu(self.conv_hr(out))
376
- out = self.conv_last(out)
377
- if self.hr_in:
378
- base = x_center
379
- else:
380
- base = F.interpolate(x_center, scale_factor=4, mode='bilinear', align_corners=False)
381
- out += base
382
- return out
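For reference, the removed EDVR network takes a stack of frames and super-resolves the center one. A rough usage sketch (illustrative only; assumes a basicsr installation whose deformable-convolution ops, used by DCNv2Pack, were built):

import torch
from basicsr.archs.edvr_arch import EDVR

model = EDVR(num_in_ch=3, num_out_ch=3, num_feat=64, num_frame=5, with_tsa=True)
model.eval()

# (batch, num_frame, channels, height, width); height and width must be multiples of 4
frames = torch.rand(1, 5, 3, 64, 64)
with torch.no_grad():
    out = model(frames)         # x4 super-resolved center frame, (1, 3, 256, 256)
print(out.shape)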
 
basicsr/archs/hifacegan_arch.py DELETED
@@ -1,260 +0,0 @@
1
- import numpy as np
2
- import torch
3
- import torch.nn as nn
4
- import torch.nn.functional as F
5
-
6
- from basicsr.utils.registry import ARCH_REGISTRY
7
- from .hifacegan_util import BaseNetwork, LIPEncoder, SPADEResnetBlock, get_nonspade_norm_layer
8
-
9
-
10
- class SPADEGenerator(BaseNetwork):
11
- """Generator with SPADEResBlock"""
12
-
13
- def __init__(self,
14
- num_in_ch=3,
15
- num_feat=64,
16
- use_vae=False,
17
- z_dim=256,
18
- crop_size=512,
19
- norm_g='spectralspadesyncbatch3x3',
20
- is_train=True,
21
- init_train_phase=3): # progressive training disabled
22
- super().__init__()
23
- self.nf = num_feat
24
- self.input_nc = num_in_ch
25
- self.is_train = is_train
26
- self.train_phase = init_train_phase
27
-
28
- self.scale_ratio = 5 # hardcoded now
29
- self.sw = crop_size // (2**self.scale_ratio)
30
- self.sh = self.sw # 20210519: By default use square image, aspect_ratio = 1.0
31
-
32
- if use_vae:
33
- # In case of VAE, we will sample from random z vector
34
- self.fc = nn.Linear(z_dim, 16 * self.nf * self.sw * self.sh)
35
- else:
36
- # Otherwise, we make the network deterministic by starting with
37
- # downsampled segmentation map instead of random z
38
- self.fc = nn.Conv2d(num_in_ch, 16 * self.nf, 3, padding=1)
39
-
40
- self.head_0 = SPADEResnetBlock(16 * self.nf, 16 * self.nf, norm_g)
41
-
42
- self.g_middle_0 = SPADEResnetBlock(16 * self.nf, 16 * self.nf, norm_g)
43
- self.g_middle_1 = SPADEResnetBlock(16 * self.nf, 16 * self.nf, norm_g)
44
-
45
- self.ups = nn.ModuleList([
46
- SPADEResnetBlock(16 * self.nf, 8 * self.nf, norm_g),
47
- SPADEResnetBlock(8 * self.nf, 4 * self.nf, norm_g),
48
- SPADEResnetBlock(4 * self.nf, 2 * self.nf, norm_g),
49
- SPADEResnetBlock(2 * self.nf, 1 * self.nf, norm_g)
50
- ])
51
-
52
- self.to_rgbs = nn.ModuleList([
53
- nn.Conv2d(8 * self.nf, 3, 3, padding=1),
54
- nn.Conv2d(4 * self.nf, 3, 3, padding=1),
55
- nn.Conv2d(2 * self.nf, 3, 3, padding=1),
56
- nn.Conv2d(1 * self.nf, 3, 3, padding=1)
57
- ])
58
-
59
- self.up = nn.Upsample(scale_factor=2)
60
-
61
- def encode(self, input_tensor):
62
- """
63
- Encode input_tensor into feature maps, can be overridden in derived classes
64
- Default: nearest downsampling of 2**5 = 32 times
65
- """
66
- h, w = input_tensor.size()[-2:]
67
- sh, sw = h // 2**self.scale_ratio, w // 2**self.scale_ratio
68
- x = F.interpolate(input_tensor, size=(sh, sw))
69
- return self.fc(x)
70
-
71
- def forward(self, x):
72
- # In original SPADE, seg means a segmentation map, but here we use x instead.
73
- seg = x
74
-
75
- x = self.encode(x)
76
- x = self.head_0(x, seg)
77
-
78
- x = self.up(x)
79
- x = self.g_middle_0(x, seg)
80
- x = self.g_middle_1(x, seg)
81
-
82
- if self.is_train:
83
- phase = self.train_phase + 1
84
- else:
85
- phase = len(self.to_rgbs)
86
-
87
- for i in range(phase):
88
- x = self.up(x)
89
- x = self.ups[i](x, seg)
90
-
91
- x = self.to_rgbs[phase - 1](F.leaky_relu(x, 2e-1))
92
- x = torch.tanh(x)
93
-
94
- return x
95
-
96
- def mixed_guidance_forward(self, input_x, seg=None, n=0, mode='progressive'):
97
- """
98
- A helper class for subspace visualization. Input and seg are different images.
99
- For the first n levels (including encoder) we use input, for the rest we use seg.
100
-
101
- If mode = 'progressive', the output's like: AAABBB
102
- If mode = 'one_plug', the output's like: AAABAA
103
- If mode = 'one_ablate', the output's like: BBBABB
104
- """
105
-
106
- if seg is None:
107
- return self.forward(input_x)
108
-
109
- if self.is_train:
110
- phase = self.train_phase + 1
111
- else:
112
- phase = len(self.to_rgbs)
113
-
114
- if mode == 'progressive':
115
- n = max(min(n, 4 + phase), 0)
116
- guide_list = [input_x] * n + [seg] * (4 + phase - n)
117
- elif mode == 'one_plug':
118
- n = max(min(n, 4 + phase - 1), 0)
119
- guide_list = [seg] * (4 + phase)
120
- guide_list[n] = input_x
121
- elif mode == 'one_ablate':
122
- if n > 3 + phase:
123
- return self.forward(input_x)
124
- guide_list = [input_x] * (4 + phase)
125
- guide_list[n] = seg
126
-
127
- x = self.encode(guide_list[0])
128
- x = self.head_0(x, guide_list[1])
129
-
130
- x = self.up(x)
131
- x = self.g_middle_0(x, guide_list[2])
132
- x = self.g_middle_1(x, guide_list[3])
133
-
134
- for i in range(phase):
135
- x = self.up(x)
136
- x = self.ups[i](x, guide_list[4 + i])
137
-
138
- x = self.to_rgbs[phase - 1](F.leaky_relu(x, 2e-1))
139
- x = torch.tanh(x)
140
-
141
- return x
142
-
143
-
144
- @ARCH_REGISTRY.register()
145
- class HiFaceGAN(SPADEGenerator):
146
- """
147
- HiFaceGAN: SPADEGenerator with a learnable feature encoder
148
- Current encoder design: LIPEncoder
149
- """
150
-
151
- def __init__(self,
152
- num_in_ch=3,
153
- num_feat=64,
154
- use_vae=False,
155
- z_dim=256,
156
- crop_size=512,
157
- norm_g='spectralspadesyncbatch3x3',
158
- is_train=True,
159
- init_train_phase=3):
160
- super().__init__(num_in_ch, num_feat, use_vae, z_dim, crop_size, norm_g, is_train, init_train_phase)
161
- self.lip_encoder = LIPEncoder(num_in_ch, num_feat, self.sw, self.sh, self.scale_ratio)
162
-
163
- def encode(self, input_tensor):
164
- return self.lip_encoder(input_tensor)
165
-
166
-
167
- @ARCH_REGISTRY.register()
168
- class HiFaceGANDiscriminator(BaseNetwork):
169
- """
170
- Inspired by pix2pixHD multiscale discriminator.
171
-
172
- Args:
173
- num_in_ch (int): Channel number of inputs. Default: 3.
174
- num_out_ch (int): Channel number of outputs. Default: 3.
175
- conditional_d (bool): Whether use conditional discriminator.
176
- Default: True.
177
- num_d (int): Number of Multiscale discriminators. Default: 3.
178
- n_layers_d (int): Number of downsample layers in each D. Default: 4.
179
- num_feat (int): Channel number of base intermediate features.
180
- Default: 64.
181
- norm_d (str): String to determine normalization layers in D.
182
- Choices: [spectral][instance/batch/syncbatch]
183
- Default: 'spectralinstance'.
184
- keep_features (bool): Keep intermediate features for matching loss, etc.
185
- Default: True.
186
- """
187
-
188
- def __init__(self,
189
- num_in_ch=3,
190
- num_out_ch=3,
191
- conditional_d=True,
192
- num_d=2,
193
- n_layers_d=4,
194
- num_feat=64,
195
- norm_d='spectralinstance',
196
- keep_features=True):
197
- super().__init__()
198
- self.num_d = num_d
199
-
200
- input_nc = num_in_ch
201
- if conditional_d:
202
- input_nc += num_out_ch
203
-
204
- for i in range(num_d):
205
- subnet_d = NLayerDiscriminator(input_nc, n_layers_d, num_feat, norm_d, keep_features)
206
- self.add_module(f'discriminator_{i}', subnet_d)
207
-
208
- def downsample(self, x):
209
- return F.avg_pool2d(x, kernel_size=3, stride=2, padding=[1, 1], count_include_pad=False)
210
-
211
- # Returns list of lists of discriminator outputs.
212
- # The final result is of size opt.num_d x opt.n_layers_D
213
- def forward(self, x):
214
- result = []
215
- for _, _net_d in self.named_children():
216
- out = _net_d(x)
217
- result.append(out)
218
- x = self.downsample(x)
219
-
220
- return result
221
-
222
-
223
- class NLayerDiscriminator(BaseNetwork):
224
- """Defines the PatchGAN discriminator with the specified arguments."""
225
-
226
- def __init__(self, input_nc, n_layers_d, num_feat, norm_d, keep_features):
227
- super().__init__()
228
- kw = 4
229
- padw = int(np.ceil((kw - 1.0) / 2))
230
- nf = num_feat
231
- self.keep_features = keep_features
232
-
233
- norm_layer = get_nonspade_norm_layer(norm_d)
234
- sequence = [[nn.Conv2d(input_nc, nf, kernel_size=kw, stride=2, padding=padw), nn.LeakyReLU(0.2, False)]]
235
-
236
- for n in range(1, n_layers_d):
237
- nf_prev = nf
238
- nf = min(nf * 2, 512)
239
- stride = 1 if n == n_layers_d - 1 else 2
240
- sequence += [[
241
- norm_layer(nn.Conv2d(nf_prev, nf, kernel_size=kw, stride=stride, padding=padw)),
242
- nn.LeakyReLU(0.2, False)
243
- ]]
244
-
245
- sequence += [[nn.Conv2d(nf, 1, kernel_size=kw, stride=1, padding=padw)]]
246
-
247
- # We divide the layers into groups to extract intermediate layer outputs
248
- for n in range(len(sequence)):
249
- self.add_module('model' + str(n), nn.Sequential(*sequence[n]))
250
-
251
- def forward(self, x):
252
- results = [x]
253
- for submodel in self.children():
254
- intermediate_output = submodel(results[-1])
255
- results.append(intermediate_output)
256
-
257
- if self.keep_features:
258
- return results[1:]
259
- else:
260
- return results[-1]
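For reference, once is_train=False the removed HiFaceGAN behaves as a plain image-to-image generator. A usage sketch (illustrative only, assuming basicsr remains importable):

import torch
from basicsr.archs.hifacegan_arch import HiFaceGAN

net = HiFaceGAN(num_in_ch=3, num_feat=64, crop_size=512, is_train=False)
net.eval()

x = torch.rand(1, 3, 512, 512)  # degraded face at the 512x512 size implied by crop_size
with torch.no_grad():
    y = net(x)                  # restored face, same resolution, tanh range (-1, 1)
print(y.shape)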
 
basicsr/archs/hifacegan_util.py DELETED
@@ -1,255 +0,0 @@
1
- import re
2
- import torch
3
- import torch.nn as nn
4
- import torch.nn.functional as F
5
- from torch.nn import init
6
- # Warning: spectral norm could be buggy
7
- # under eval mode and multi-GPU inference
8
- # A workaround is sticking to single-GPU inference and train mode
9
- from torch.nn.utils import spectral_norm
10
-
11
-
12
- class SPADE(nn.Module):
13
-
14
- def __init__(self, config_text, norm_nc, label_nc):
15
- super().__init__()
16
-
17
- assert config_text.startswith('spade')
18
- parsed = re.search('spade(\\D+)(\\d)x\\d', config_text)
19
- param_free_norm_type = str(parsed.group(1))
20
- ks = int(parsed.group(2))
21
-
22
- if param_free_norm_type == 'instance':
23
- self.param_free_norm = nn.InstanceNorm2d(norm_nc)
24
- elif param_free_norm_type == 'syncbatch':
25
- print('SyncBatchNorm is currently not supported under single-GPU mode, switch to "instance" instead')
26
- self.param_free_norm = nn.InstanceNorm2d(norm_nc)
27
- elif param_free_norm_type == 'batch':
28
- self.param_free_norm = nn.BatchNorm2d(norm_nc, affine=False)
29
- else:
30
- raise ValueError(f'{param_free_norm_type} is not a recognized param-free norm type in SPADE')
31
-
32
- # The dimension of the intermediate embedding space. Yes, hardcoded.
33
- nhidden = 128 if norm_nc > 128 else norm_nc
34
-
35
- pw = ks // 2
36
- self.mlp_shared = nn.Sequential(nn.Conv2d(label_nc, nhidden, kernel_size=ks, padding=pw), nn.ReLU())
37
- self.mlp_gamma = nn.Conv2d(nhidden, norm_nc, kernel_size=ks, padding=pw, bias=False)
38
- self.mlp_beta = nn.Conv2d(nhidden, norm_nc, kernel_size=ks, padding=pw, bias=False)
39
-
40
- def forward(self, x, segmap):
41
-
42
- # Part 1. generate parameter-free normalized activations
43
- normalized = self.param_free_norm(x)
44
-
45
- # Part 2. produce scaling and bias conditioned on semantic map
46
- segmap = F.interpolate(segmap, size=x.size()[2:], mode='nearest')
47
- actv = self.mlp_shared(segmap)
48
- gamma = self.mlp_gamma(actv)
49
- beta = self.mlp_beta(actv)
50
-
51
- # apply scale and bias
52
- out = normalized * gamma + beta
53
-
54
- return out
55
-
56
-
57
- class SPADEResnetBlock(nn.Module):
58
- """
59
- ResNet block that uses SPADE. It differs from the ResNet block of pix2pixHD in that
60
- it takes in the segmentation map as input, learns the skip connection if necessary,
61
- and applies normalization first and then convolution.
62
- This architecture seemed like a standard architecture for unconditional or
63
- class-conditional GAN architecture using residual block.
64
- The code was inspired from https://github.com/LMescheder/GAN_stability.
65
- """
66
-
67
- def __init__(self, fin, fout, norm_g='spectralspadesyncbatch3x3', semantic_nc=3):
68
- super().__init__()
69
- # Attributes
70
- self.learned_shortcut = (fin != fout)
71
- fmiddle = min(fin, fout)
72
-
73
- # create conv layers
74
- self.conv_0 = nn.Conv2d(fin, fmiddle, kernel_size=3, padding=1)
75
- self.conv_1 = nn.Conv2d(fmiddle, fout, kernel_size=3, padding=1)
76
- if self.learned_shortcut:
77
- self.conv_s = nn.Conv2d(fin, fout, kernel_size=1, bias=False)
78
-
79
- # apply spectral norm if specified
80
- if 'spectral' in norm_g:
81
- self.conv_0 = spectral_norm(self.conv_0)
82
- self.conv_1 = spectral_norm(self.conv_1)
83
- if self.learned_shortcut:
84
- self.conv_s = spectral_norm(self.conv_s)
85
-
86
- # define normalization layers
87
- spade_config_str = norm_g.replace('spectral', '')
88
- self.norm_0 = SPADE(spade_config_str, fin, semantic_nc)
89
- self.norm_1 = SPADE(spade_config_str, fmiddle, semantic_nc)
90
- if self.learned_shortcut:
91
- self.norm_s = SPADE(spade_config_str, fin, semantic_nc)
92
-
93
- # note the resnet block with SPADE also takes in |seg|,
94
- # the semantic segmentation map as input
95
- def forward(self, x, seg):
96
- x_s = self.shortcut(x, seg)
97
- dx = self.conv_0(self.act(self.norm_0(x, seg)))
98
- dx = self.conv_1(self.act(self.norm_1(dx, seg)))
99
- out = x_s + dx
100
- return out
101
-
102
- def shortcut(self, x, seg):
103
- if self.learned_shortcut:
104
- x_s = self.conv_s(self.norm_s(x, seg))
105
- else:
106
- x_s = x
107
- return x_s
108
-
109
- def act(self, x):
110
- return F.leaky_relu(x, 2e-1)
111
-
112
-
113
- class BaseNetwork(nn.Module):
114
- """ A basis for hifacegan archs with custom initialization """
115
-
116
- def init_weights(self, init_type='normal', gain=0.02):
117
-
118
- def init_func(m):
119
- classname = m.__class__.__name__
120
- if classname.find('BatchNorm2d') != -1:
121
- if hasattr(m, 'weight') and m.weight is not None:
122
- init.normal_(m.weight.data, 1.0, gain)
123
- if hasattr(m, 'bias') and m.bias is not None:
124
- init.constant_(m.bias.data, 0.0)
125
- elif hasattr(m, 'weight') and (classname.find('Conv') != -1 or classname.find('Linear') != -1):
126
- if init_type == 'normal':
127
- init.normal_(m.weight.data, 0.0, gain)
128
- elif init_type == 'xavier':
129
- init.xavier_normal_(m.weight.data, gain=gain)
130
- elif init_type == 'xavier_uniform':
131
- init.xavier_uniform_(m.weight.data, gain=1.0)
132
- elif init_type == 'kaiming':
133
- init.kaiming_normal_(m.weight.data, a=0, mode='fan_in')
134
- elif init_type == 'orthogonal':
135
- init.orthogonal_(m.weight.data, gain=gain)
136
- elif init_type == 'none': # uses pytorch's default init method
137
- m.reset_parameters()
138
- else:
139
- raise NotImplementedError(f'initialization method [{init_type}] is not implemented')
140
- if hasattr(m, 'bias') and m.bias is not None:
141
- init.constant_(m.bias.data, 0.0)
142
-
143
- self.apply(init_func)
144
-
145
- # propagate to children
146
- for m in self.children():
147
- if hasattr(m, 'init_weights'):
148
- m.init_weights(init_type, gain)
149
-
150
- def forward(self, x):
151
- pass
152
-
153
-
154
- def lip2d(x, logit, kernel=3, stride=2, padding=1):
155
- weight = logit.exp()
156
- return F.avg_pool2d(x * weight, kernel, stride, padding) / F.avg_pool2d(weight, kernel, stride, padding)
157
-
158
-
159
- class SoftGate(nn.Module):
160
- COEFF = 12.0
161
-
162
- def forward(self, x):
163
- return torch.sigmoid(x).mul(self.COEFF)
164
-
165
-
166
- class SimplifiedLIP(nn.Module):
167
-
168
- def __init__(self, channels):
169
- super(SimplifiedLIP, self).__init__()
170
- self.logit = nn.Sequential(
171
- nn.Conv2d(channels, channels, 3, padding=1, bias=False), nn.InstanceNorm2d(channels, affine=True),
172
- SoftGate())
173
-
174
- def init_layer(self):
175
- self.logit[0].weight.data.fill_(0.0)
176
-
177
- def forward(self, x):
178
- frac = lip2d(x, self.logit(x))
179
- return frac
180
-
181
-
182
- class LIPEncoder(BaseNetwork):
183
- """Local Importance-based Pooling (Ziteng Gao et al., ICCV 2019)"""
184
-
185
- def __init__(self, input_nc, ngf, sw, sh, n_2xdown, norm_layer=nn.InstanceNorm2d):
186
- super().__init__()
187
- self.sw = sw
188
- self.sh = sh
189
- self.max_ratio = 16
190
- # 20200310: Several Convolution (stride 1) + LIP blocks, 4 fold
191
- kw = 3
192
- pw = (kw - 1) // 2
193
-
194
- model = [
195
- nn.Conv2d(input_nc, ngf, kw, stride=1, padding=pw, bias=False),
196
- norm_layer(ngf),
197
- nn.ReLU(),
198
- ]
199
- cur_ratio = 1
200
- for i in range(n_2xdown):
201
- next_ratio = min(cur_ratio * 2, self.max_ratio)
202
- model += [
203
- SimplifiedLIP(ngf * cur_ratio),
204
- nn.Conv2d(ngf * cur_ratio, ngf * next_ratio, kw, stride=1, padding=pw),
205
- norm_layer(ngf * next_ratio),
206
- ]
207
- cur_ratio = next_ratio
208
- if i < n_2xdown - 1:
209
- model += [nn.ReLU(inplace=True)]
210
-
211
- self.model = nn.Sequential(*model)
212
-
213
- def forward(self, x):
214
- return self.model(x)
215
-
216
-
217
- def get_nonspade_norm_layer(norm_type='instance'):
218
- # helper function to get # output channels of the previous layer
219
- def get_out_channel(layer):
220
- if hasattr(layer, 'out_channels'):
221
- return getattr(layer, 'out_channels')
222
- return layer.weight.size(0)
223
-
224
- # this function will be returned
225
- def add_norm_layer(layer):
226
- nonlocal norm_type
227
- if norm_type.startswith('spectral'):
228
- layer = spectral_norm(layer)
229
- subnorm_type = norm_type[len('spectral'):]
230
-
231
- if subnorm_type == 'none' or len(subnorm_type) == 0:
232
- return layer
233
-
234
- # remove bias in the previous layer, which is meaningless
235
- # since it has no effect after normalization
236
- if getattr(layer, 'bias', None) is not None:
237
- delattr(layer, 'bias')
238
- layer.register_parameter('bias', None)
239
-
240
- if subnorm_type == 'batch':
241
- norm_layer = nn.BatchNorm2d(get_out_channel(layer), affine=True)
242
- elif subnorm_type == 'sync_batch':
243
- print('SyncBatchNorm is currently not supported under single-GPU mode, switch to "instance" instead')
244
- # norm_layer = SynchronizedBatchNorm2d(
245
- # get_out_channel(layer), affine=True)
246
- norm_layer = nn.InstanceNorm2d(get_out_channel(layer), affine=False)
247
- elif subnorm_type == 'instance':
248
- norm_layer = nn.InstanceNorm2d(get_out_channel(layer), affine=False)
249
- else:
250
- raise ValueError(f'normalization layer {subnorm_type} is not recognized')
251
-
252
- return nn.Sequential(layer, norm_layer)
253
-
254
- print('This is a legacy from nvlabs/SPADE, and will be removed in future versions.')
255
- return add_norm_layer
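For reference, get_nonspade_norm_layer (removed above) returns a factory that wraps a layer with spectral norm plus a parameter-free norm. A small sketch of the call pattern (illustrative only):

import torch
import torch.nn as nn
from basicsr.archs.hifacegan_util import get_nonspade_norm_layer

norm_layer = get_nonspade_norm_layer('spectralinstance')
block = norm_layer(nn.Conv2d(3, 64, kernel_size=3, padding=1))  # Sequential(spectral-norm conv, InstanceNorm2d)

x = torch.rand(1, 3, 32, 32)
print(block(x).shape)           # (1, 64, 32, 32)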
 
basicsr/archs/inception.py DELETED
@@ -1,307 +0,0 @@
1
- # Modified from https://github.com/mseitzer/pytorch-fid/blob/master/pytorch_fid/inception.py # noqa: E501
2
- # For FID metric
3
-
4
- import os
5
- import torch
6
- import torch.nn as nn
7
- import torch.nn.functional as F
8
- from torch.utils.model_zoo import load_url
9
- from torchvision import models
10
-
11
- # Inception weights ported to Pytorch from
12
- # http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz
13
- FID_WEIGHTS_URL = 'https://github.com/mseitzer/pytorch-fid/releases/download/fid_weights/pt_inception-2015-12-05-6726825d.pth' # noqa: E501
14
- LOCAL_FID_WEIGHTS = 'experiments/pretrained_models/pt_inception-2015-12-05-6726825d.pth' # noqa: E501
15
-
16
-
17
- class InceptionV3(nn.Module):
18
- """Pretrained InceptionV3 network returning feature maps"""
19
-
20
- # Index of default block of inception to return,
21
- # corresponds to output of final average pooling
22
- DEFAULT_BLOCK_INDEX = 3
23
-
24
- # Maps feature dimensionality to their output blocks indices
25
- BLOCK_INDEX_BY_DIM = {
26
- 64: 0, # First max pooling features
27
- 192: 1, # Second max pooling features
28
- 768: 2, # Pre-aux classifier features
29
- 2048: 3 # Final average pooling features
30
- }
31
-
32
- def __init__(self,
33
- output_blocks=(DEFAULT_BLOCK_INDEX),
34
- resize_input=True,
35
- normalize_input=True,
36
- requires_grad=False,
37
- use_fid_inception=True):
38
- """Build pretrained InceptionV3.
39
-
40
- Args:
41
- output_blocks (list[int]): Indices of blocks to return features of.
42
- Possible values are:
43
- - 0: corresponds to output of first max pooling
44
- - 1: corresponds to output of second max pooling
45
- - 2: corresponds to output which is fed to aux classifier
46
- - 3: corresponds to output of final average pooling
47
- resize_input (bool): If true, bilinearly resizes input to width and
48
- height 299 before feeding input to model. As the network
49
- without fully connected layers is fully convolutional, it
50
- should be able to handle inputs of arbitrary size, so resizing
51
- might not be strictly needed. Default: True.
52
- normalize_input (bool): If true, scales the input from range (0, 1)
53
- to the range the pretrained Inception network expects,
54
- namely (-1, 1). Default: True.
55
- requires_grad (bool): If true, parameters of the model require
56
- gradients. Possibly useful for finetuning the network.
57
- Default: False.
58
- use_fid_inception (bool): If true, uses the pretrained Inception
59
- model used in Tensorflow's FID implementation.
60
- If false, uses the pretrained Inception model available in
61
- torchvision. The FID Inception model has different weights
62
- and a slightly different structure from torchvision's
63
- Inception model. If you want to compute FID scores, you are
64
- strongly advised to set this parameter to true to get
65
- comparable results. Default: True.
66
- """
67
- super(InceptionV3, self).__init__()
68
-
69
- self.resize_input = resize_input
70
- self.normalize_input = normalize_input
71
- self.output_blocks = sorted(output_blocks)
72
- self.last_needed_block = max(output_blocks)
73
-
74
- assert self.last_needed_block <= 3, ('Last possible output block index is 3')
75
-
76
- self.blocks = nn.ModuleList()
77
-
78
- if use_fid_inception:
79
- inception = fid_inception_v3()
80
- else:
81
- try:
82
- inception = models.inception_v3(pretrained=True, init_weights=False)
83
- except TypeError:
84
- # pytorch < 1.5 does not have init_weights for inception_v3
85
- inception = models.inception_v3(pretrained=True)
86
-
87
- # Block 0: input to maxpool1
88
- block0 = [
89
- inception.Conv2d_1a_3x3, inception.Conv2d_2a_3x3, inception.Conv2d_2b_3x3,
90
- nn.MaxPool2d(kernel_size=3, stride=2)
91
- ]
92
- self.blocks.append(nn.Sequential(*block0))
93
-
94
- # Block 1: maxpool1 to maxpool2
95
- if self.last_needed_block >= 1:
96
- block1 = [inception.Conv2d_3b_1x1, inception.Conv2d_4a_3x3, nn.MaxPool2d(kernel_size=3, stride=2)]
97
- self.blocks.append(nn.Sequential(*block1))
98
-
99
- # Block 2: maxpool2 to aux classifier
100
- if self.last_needed_block >= 2:
101
- block2 = [
102
- inception.Mixed_5b,
103
- inception.Mixed_5c,
104
- inception.Mixed_5d,
105
- inception.Mixed_6a,
106
- inception.Mixed_6b,
107
- inception.Mixed_6c,
108
- inception.Mixed_6d,
109
- inception.Mixed_6e,
110
- ]
111
- self.blocks.append(nn.Sequential(*block2))
112
-
113
- # Block 3: aux classifier to final avgpool
114
- if self.last_needed_block >= 3:
115
- block3 = [
116
- inception.Mixed_7a, inception.Mixed_7b, inception.Mixed_7c,
117
- nn.AdaptiveAvgPool2d(output_size=(1, 1))
118
- ]
119
- self.blocks.append(nn.Sequential(*block3))
120
-
121
- for param in self.parameters():
122
- param.requires_grad = requires_grad
123
-
124
- def forward(self, x):
125
- """Get Inception feature maps.
126
-
127
- Args:
128
- x (Tensor): Input tensor of shape (b, 3, h, w).
129
- Values are expected to be in range (-1, 1). You can also input
130
- (0, 1) with setting normalize_input = True.
131
-
132
- Returns:
133
- list[Tensor]: Corresponding to the selected output block, sorted
134
- ascending by index.
135
- """
136
- output = []
137
-
138
- if self.resize_input:
139
- x = F.interpolate(x, size=(299, 299), mode='bilinear', align_corners=False)
140
-
141
- if self.normalize_input:
142
- x = 2 * x - 1 # Scale from range (0, 1) to range (-1, 1)
143
-
144
- for idx, block in enumerate(self.blocks):
145
- x = block(x)
146
- if idx in self.output_blocks:
147
- output.append(x)
148
-
149
- if idx == self.last_needed_block:
150
- break
151
-
152
- return output
153
-
154
-
155
- def fid_inception_v3():
156
- """Build pretrained Inception model for FID computation.
157
-
158
- The Inception model for FID computation uses a different set of weights
159
- and has a slightly different structure than torchvision's Inception.
160
-
161
- This method first constructs torchvision's Inception and then patches the
162
- necessary parts that are different in the FID Inception model.
163
- """
164
- try:
165
- inception = models.inception_v3(num_classes=1008, aux_logits=False, pretrained=False, init_weights=False)
166
- except TypeError:
167
- # pytorch < 1.5 does not have init_weights for inception_v3
168
- inception = models.inception_v3(num_classes=1008, aux_logits=False, pretrained=False)
169
-
170
- inception.Mixed_5b = FIDInceptionA(192, pool_features=32)
171
- inception.Mixed_5c = FIDInceptionA(256, pool_features=64)
172
- inception.Mixed_5d = FIDInceptionA(288, pool_features=64)
173
- inception.Mixed_6b = FIDInceptionC(768, channels_7x7=128)
174
- inception.Mixed_6c = FIDInceptionC(768, channels_7x7=160)
175
- inception.Mixed_6d = FIDInceptionC(768, channels_7x7=160)
176
- inception.Mixed_6e = FIDInceptionC(768, channels_7x7=192)
177
- inception.Mixed_7b = FIDInceptionE_1(1280)
178
- inception.Mixed_7c = FIDInceptionE_2(2048)
179
-
180
- if os.path.exists(LOCAL_FID_WEIGHTS):
181
- state_dict = torch.load(LOCAL_FID_WEIGHTS, map_location=lambda storage, loc: storage)
182
- else:
183
- state_dict = load_url(FID_WEIGHTS_URL, progress=True)
184
-
185
- inception.load_state_dict(state_dict)
186
- return inception
187
-
188
-
189
- class FIDInceptionA(models.inception.InceptionA):
190
- """InceptionA block patched for FID computation"""
191
-
192
- def __init__(self, in_channels, pool_features):
193
- super(FIDInceptionA, self).__init__(in_channels, pool_features)
194
-
195
- def forward(self, x):
196
- branch1x1 = self.branch1x1(x)
197
-
198
- branch5x5 = self.branch5x5_1(x)
199
- branch5x5 = self.branch5x5_2(branch5x5)
200
-
201
- branch3x3dbl = self.branch3x3dbl_1(x)
202
- branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
203
- branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl)
204
-
205
- # Patch: Tensorflow's average pool does not use the padded zero's in
206
- # its average calculation
207
- branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1, count_include_pad=False)
208
- branch_pool = self.branch_pool(branch_pool)
209
-
210
- outputs = [branch1x1, branch5x5, branch3x3dbl, branch_pool]
211
- return torch.cat(outputs, 1)
212
-
213
-
214
- class FIDInceptionC(models.inception.InceptionC):
215
- """InceptionC block patched for FID computation"""
216
-
217
- def __init__(self, in_channels, channels_7x7):
218
- super(FIDInceptionC, self).__init__(in_channels, channels_7x7)
219
-
220
- def forward(self, x):
221
- branch1x1 = self.branch1x1(x)
222
-
223
- branch7x7 = self.branch7x7_1(x)
224
- branch7x7 = self.branch7x7_2(branch7x7)
225
- branch7x7 = self.branch7x7_3(branch7x7)
226
-
227
- branch7x7dbl = self.branch7x7dbl_1(x)
228
- branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl)
229
- branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl)
230
- branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl)
231
- branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl)
232
-
233
- # Patch: Tensorflow's average pool does not use the padded zero's in
234
- # its average calculation
235
- branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1, count_include_pad=False)
236
- branch_pool = self.branch_pool(branch_pool)
237
-
238
- outputs = [branch1x1, branch7x7, branch7x7dbl, branch_pool]
239
- return torch.cat(outputs, 1)
240
-
241
-
242
- class FIDInceptionE_1(models.inception.InceptionE):
243
- """First InceptionE block patched for FID computation"""
244
-
245
- def __init__(self, in_channels):
246
- super(FIDInceptionE_1, self).__init__(in_channels)
247
-
248
- def forward(self, x):
249
- branch1x1 = self.branch1x1(x)
250
-
251
- branch3x3 = self.branch3x3_1(x)
252
- branch3x3 = [
253
- self.branch3x3_2a(branch3x3),
254
- self.branch3x3_2b(branch3x3),
255
- ]
256
- branch3x3 = torch.cat(branch3x3, 1)
257
-
258
- branch3x3dbl = self.branch3x3dbl_1(x)
259
- branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
260
- branch3x3dbl = [
261
- self.branch3x3dbl_3a(branch3x3dbl),
262
- self.branch3x3dbl_3b(branch3x3dbl),
263
- ]
264
- branch3x3dbl = torch.cat(branch3x3dbl, 1)
265
-
266
- # Patch: Tensorflow's average pool does not use the padded zero's in
267
- # its average calculation
268
- branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1, count_include_pad=False)
269
- branch_pool = self.branch_pool(branch_pool)
270
-
271
- outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool]
272
- return torch.cat(outputs, 1)
273
-
274
-
275
- class FIDInceptionE_2(models.inception.InceptionE):
276
- """Second InceptionE block patched for FID computation"""
277
-
278
- def __init__(self, in_channels):
279
- super(FIDInceptionE_2, self).__init__(in_channels)
280
-
281
- def forward(self, x):
282
- branch1x1 = self.branch1x1(x)
283
-
284
- branch3x3 = self.branch3x3_1(x)
285
- branch3x3 = [
286
- self.branch3x3_2a(branch3x3),
287
- self.branch3x3_2b(branch3x3),
288
- ]
289
- branch3x3 = torch.cat(branch3x3, 1)
290
-
291
- branch3x3dbl = self.branch3x3dbl_1(x)
292
- branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
293
- branch3x3dbl = [
294
- self.branch3x3dbl_3a(branch3x3dbl),
295
- self.branch3x3dbl_3b(branch3x3dbl),
296
- ]
297
- branch3x3dbl = torch.cat(branch3x3dbl, 1)
298
-
299
- # Patch: The FID Inception model uses max pooling instead of average
300
- # pooling. This is likely an error in this specific Inception
301
- # implementation, as other Inception models use average pooling here
302
- # (which matches the description in the paper).
303
- branch_pool = F.max_pool2d(x, kernel_size=3, stride=1, padding=1)
304
- branch_pool = self.branch_pool(branch_pool)
305
-
306
- outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool]
307
- return torch.cat(outputs, 1)
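For reference, the removed InceptionV3 wrapper exists purely to extract FID features. A usage sketch (illustrative only; the first call downloads the FID Inception weights, so network access or the local weight path is assumed):

import torch
from basicsr.archs.inception import InceptionV3

# Block index 3 = final average-pooling features (2048-d), the ones FID is computed on.
net = InceptionV3(output_blocks=[3], use_fid_inception=True)
net.eval()

imgs = torch.rand(4, 3, 299, 299)   # values in (0, 1); normalize_input maps them to (-1, 1)
with torch.no_grad():
    feats = net(imgs)[0]            # (4, 2048, 1, 1)
print(feats.squeeze().shape)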
 
basicsr/archs/rcan_arch.py DELETED
@@ -1,135 +0,0 @@
1
- import torch
2
- from torch import nn as nn
3
-
4
- from basicsr.utils.registry import ARCH_REGISTRY
5
- from .arch_util import Upsample, make_layer
6
-
7
-
8
- class ChannelAttention(nn.Module):
9
- """Channel attention used in RCAN.
10
-
11
- Args:
12
- num_feat (int): Channel number of intermediate features.
13
- squeeze_factor (int): Channel squeeze factor. Default: 16.
14
- """
15
-
16
- def __init__(self, num_feat, squeeze_factor=16):
17
- super(ChannelAttention, self).__init__()
18
- self.attention = nn.Sequential(
19
- nn.AdaptiveAvgPool2d(1), nn.Conv2d(num_feat, num_feat // squeeze_factor, 1, padding=0),
20
- nn.ReLU(inplace=True), nn.Conv2d(num_feat // squeeze_factor, num_feat, 1, padding=0), nn.Sigmoid())
21
-
22
- def forward(self, x):
23
- y = self.attention(x)
24
- return x * y
25
-
26
-
27
- class RCAB(nn.Module):
28
- """Residual Channel Attention Block (RCAB) used in RCAN.
29
-
30
- Args:
31
- num_feat (int): Channel number of intermediate features.
32
- squeeze_factor (int): Channel squeeze factor. Default: 16.
33
- res_scale (float): Scale the residual. Default: 1.
34
- """
35
-
36
- def __init__(self, num_feat, squeeze_factor=16, res_scale=1):
37
- super(RCAB, self).__init__()
38
- self.res_scale = res_scale
39
-
40
- self.rcab = nn.Sequential(
41
- nn.Conv2d(num_feat, num_feat, 3, 1, 1), nn.ReLU(True), nn.Conv2d(num_feat, num_feat, 3, 1, 1),
42
- ChannelAttention(num_feat, squeeze_factor))
43
-
44
- def forward(self, x):
45
- res = self.rcab(x) * self.res_scale
46
- return res + x
47
-
48
-
49
- class ResidualGroup(nn.Module):
50
- """Residual Group of RCAB.
51
-
52
- Args:
53
- num_feat (int): Channel number of intermediate features.
54
- num_block (int): Block number in the body network.
55
- squeeze_factor (int): Channel squeeze factor. Default: 16.
56
- res_scale (float): Scale the residual. Default: 1.
57
- """
58
-
59
- def __init__(self, num_feat, num_block, squeeze_factor=16, res_scale=1):
60
- super(ResidualGroup, self).__init__()
61
-
62
- self.residual_group = make_layer(
63
- RCAB, num_block, num_feat=num_feat, squeeze_factor=squeeze_factor, res_scale=res_scale)
64
- self.conv = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
65
-
66
- def forward(self, x):
67
- res = self.conv(self.residual_group(x))
68
- return res + x
69
-
70
-
71
- @ARCH_REGISTRY.register()
72
- class RCAN(nn.Module):
73
- """Residual Channel Attention Networks.
74
-
75
- ``Paper: Image Super-Resolution Using Very Deep Residual Channel Attention Networks``
76
-
77
- Reference: https://github.com/yulunzhang/RCAN
78
-
79
- Args:
80
- num_in_ch (int): Channel number of inputs.
81
- num_out_ch (int): Channel number of outputs.
82
- num_feat (int): Channel number of intermediate features.
83
- Default: 64.
84
- num_group (int): Number of ResidualGroup. Default: 10.
85
- num_block (int): Number of RCAB in ResidualGroup. Default: 16.
86
- squeeze_factor (int): Channel squeeze factor. Default: 16.
87
- upscale (int): Upsampling factor. Support 2^n and 3.
88
- Default: 4.
89
- res_scale (float): Used to scale the residual in residual block.
90
- Default: 1.
91
- img_range (float): Image range. Default: 255.
92
- rgb_mean (tuple[float]): Image mean in RGB orders.
93
- Default: (0.4488, 0.4371, 0.4040), calculated from DIV2K dataset.
94
- """
95
-
96
- def __init__(self,
97
- num_in_ch,
98
- num_out_ch,
99
- num_feat=64,
100
- num_group=10,
101
- num_block=16,
102
- squeeze_factor=16,
103
- upscale=4,
104
- res_scale=1,
105
- img_range=255.,
106
- rgb_mean=(0.4488, 0.4371, 0.4040)):
107
- super(RCAN, self).__init__()
108
-
109
- self.img_range = img_range
110
- self.mean = torch.Tensor(rgb_mean).view(1, 3, 1, 1)
111
-
112
- self.conv_first = nn.Conv2d(num_in_ch, num_feat, 3, 1, 1)
113
- self.body = make_layer(
114
- ResidualGroup,
115
- num_group,
116
- num_feat=num_feat,
117
- num_block=num_block,
118
- squeeze_factor=squeeze_factor,
119
- res_scale=res_scale)
120
- self.conv_after_body = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
121
- self.upsample = Upsample(upscale, num_feat)
122
- self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
123
-
124
- def forward(self, x):
125
- self.mean = self.mean.type_as(x)
126
-
127
- x = (x - self.mean) * self.img_range
128
- x = self.conv_first(x)
129
- res = self.conv_after_body(self.body(x))
130
- res += x
131
-
132
- x = self.conv_last(self.upsample(res))
133
- x = x / self.img_range + self.mean
134
-
135
- return x
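For reference, the removed RCAN exposes the same call pattern as EDSR. A short sketch (illustrative only, assuming basicsr stays installed):

import torch
from basicsr.archs.rcan_arch import RCAN

model = RCAN(num_in_ch=3, num_out_ch=3, num_group=10, num_block=16, upscale=4)
model.eval()

lr = torch.rand(1, 3, 48, 48)
with torch.no_grad():
    sr = model(lr)                  # (1, 3, 192, 192) for the default x4 factor
print(sr.shape)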
 
basicsr/archs/ridnet_arch.py DELETED
@@ -1,180 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
-
4
- from basicsr.utils.registry import ARCH_REGISTRY
5
- from .arch_util import ResidualBlockNoBN, make_layer
6
-
7
-
8
- class MeanShift(nn.Conv2d):
9
- """ Data normalization with mean and std.
10
-
11
- Args:
12
- rgb_range (int): Maximum value of RGB.
13
- rgb_mean (list[float]): Mean for RGB channels.
14
- rgb_std (list[float]): Std for RGB channels.
15
- sign (int): For subtraction, sign is -1, for addition, sign is 1.
16
- Default: -1.
17
- requires_grad (bool): Whether to update the self.weight and self.bias.
18
- Default: True.
19
- """
20
-
21
- def __init__(self, rgb_range, rgb_mean, rgb_std, sign=-1, requires_grad=True):
22
- super(MeanShift, self).__init__(3, 3, kernel_size=1)
23
- std = torch.Tensor(rgb_std)
24
- self.weight.data = torch.eye(3).view(3, 3, 1, 1)
25
- self.weight.data.div_(std.view(3, 1, 1, 1))
26
- self.bias.data = sign * rgb_range * torch.Tensor(rgb_mean)
27
- self.bias.data.div_(std)
28
- self.requires_grad = requires_grad
29
-
30
-
31
- class EResidualBlockNoBN(nn.Module):
32
- """Enhanced Residual block without BN.
33
-
34
- There are three convolution layers in residual branch.
35
- """
36
-
37
- def __init__(self, in_channels, out_channels):
38
- super(EResidualBlockNoBN, self).__init__()
39
-
40
- self.body = nn.Sequential(
41
- nn.Conv2d(in_channels, out_channels, 3, 1, 1),
42
- nn.ReLU(inplace=True),
43
- nn.Conv2d(out_channels, out_channels, 3, 1, 1),
44
- nn.ReLU(inplace=True),
45
- nn.Conv2d(out_channels, out_channels, 1, 1, 0),
46
- )
47
- self.relu = nn.ReLU(inplace=True)
48
-
49
- def forward(self, x):
50
- out = self.body(x)
51
- out = self.relu(out + x)
52
- return out
53
-
54
-
55
- class MergeRun(nn.Module):
56
- """ Merge-and-run unit.
57
-
58
- This unit contains two branches with different dilated convolutions,
59
- followed by a convolution to process the concatenated features.
60
-
61
- Paper: Real Image Denoising with Feature Attention
62
- Ref git repo: https://github.com/saeed-anwar/RIDNet
63
- """
64
-
65
- def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1):
66
- super(MergeRun, self).__init__()
67
-
68
- self.dilation1 = nn.Sequential(
69
- nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding), nn.ReLU(inplace=True),
70
- nn.Conv2d(out_channels, out_channels, kernel_size, stride, 2, 2), nn.ReLU(inplace=True))
71
- self.dilation2 = nn.Sequential(
72
- nn.Conv2d(in_channels, out_channels, kernel_size, stride, 3, 3), nn.ReLU(inplace=True),
73
- nn.Conv2d(out_channels, out_channels, kernel_size, stride, 4, 4), nn.ReLU(inplace=True))
74
-
75
- self.aggregation = nn.Sequential(
76
- nn.Conv2d(out_channels * 2, out_channels, kernel_size, stride, padding), nn.ReLU(inplace=True))
77
-
78
- def forward(self, x):
79
- dilation1 = self.dilation1(x)
80
- dilation2 = self.dilation2(x)
81
- out = torch.cat([dilation1, dilation2], dim=1)
82
- out = self.aggregation(out)
83
- out = out + x
84
- return out
85
-
86
-
87
- class ChannelAttention(nn.Module):
88
- """Channel attention.
89
-
90
- Args:
91
- num_feat (int): Channel number of intermediate features.
92
- squeeze_factor (int): Channel squeeze factor. Default: 16.
93
- """
94
-
95
- def __init__(self, mid_channels, squeeze_factor=16):
96
- super(ChannelAttention, self).__init__()
97
- self.attention = nn.Sequential(
98
- nn.AdaptiveAvgPool2d(1), nn.Conv2d(mid_channels, mid_channels // squeeze_factor, 1, padding=0),
99
- nn.ReLU(inplace=True), nn.Conv2d(mid_channels // squeeze_factor, mid_channels, 1, padding=0), nn.Sigmoid())
100
-
101
- def forward(self, x):
102
- y = self.attention(x)
103
- return x * y
104
-
105
-
106
- class EAM(nn.Module):
107
- """Enhancement attention modules (EAM) in RIDNet.
108
-
109
- This module contains a merge-and-run unit, a residual block,
110
- an enhanced residual block and a feature attention unit.
111
-
112
- Attributes:
113
- merge: The merge-and-run unit.
114
- block1: The residual block.
115
- block2: The enhanced residual block.
116
- ca: The feature/channel attention unit.
117
- """
118
-
119
- def __init__(self, in_channels, mid_channels, out_channels):
120
- super(EAM, self).__init__()
121
-
122
- self.merge = MergeRun(in_channels, mid_channels)
123
- self.block1 = ResidualBlockNoBN(mid_channels)
124
- self.block2 = EResidualBlockNoBN(mid_channels, out_channels)
125
- self.ca = ChannelAttention(out_channels)
126
- # The residual block in the paper contains a relu after addition.
127
- self.relu = nn.ReLU(inplace=True)
128
-
129
- def forward(self, x):
130
- out = self.merge(x)
131
- out = self.relu(self.block1(out))
132
- out = self.block2(out)
133
- out = self.ca(out)
134
- return out
135
-
136
-
137
- @ARCH_REGISTRY.register()
138
- class RIDNet(nn.Module):
139
- """RIDNet: Real Image Denoising with Feature Attention.
140
-
141
- Ref git repo: https://github.com/saeed-anwar/RIDNet
142
-
143
- Args:
144
- in_channels (int): Channel number of inputs.
145
- mid_channels (int): Channel number of EAM modules.
146
- Default: 64.
147
- out_channels (int): Channel number of outputs.
148
- num_block (int): Number of EAM. Default: 4.
149
- img_range (float): Image range. Default: 255.
150
- rgb_mean (tuple[float]): Image mean in RGB orders.
151
- Default: (0.4488, 0.4371, 0.4040), calculated from DIV2K dataset.
152
- """
153
-
154
- def __init__(self,
155
- in_channels,
156
- mid_channels,
157
- out_channels,
158
- num_block=4,
159
- img_range=255.,
160
- rgb_mean=(0.4488, 0.4371, 0.4040),
161
- rgb_std=(1.0, 1.0, 1.0)):
162
- super(RIDNet, self).__init__()
163
-
164
- self.sub_mean = MeanShift(img_range, rgb_mean, rgb_std)
165
- self.add_mean = MeanShift(img_range, rgb_mean, rgb_std, 1)
166
-
167
- self.head = nn.Conv2d(in_channels, mid_channels, 3, 1, 1)
168
- self.body = make_layer(
169
- EAM, num_block, in_channels=mid_channels, mid_channels=mid_channels, out_channels=mid_channels)
170
- self.tail = nn.Conv2d(mid_channels, out_channels, 3, 1, 1)
171
-
172
- self.relu = nn.ReLU(inplace=True)
173
-
174
- def forward(self, x):
175
- res = self.sub_mean(x)
176
- res = self.tail(self.body(self.relu(self.head(res))))
177
- res = self.add_mean(res)
178
-
179
- out = x + res
180
- return out
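A minimal usage sketch for the RIDNet denoiser deleted above (editor's illustration, not part of this commit; it assumes the basicsr package remains importable so that basicsr.archs.ridnet_arch still resolves):

import torch
from basicsr.archs.ridnet_arch import RIDNet  # assumption: basicsr is installed elsewhere

# 3-channel input/output, 64 feature channels, 4 EAM blocks (matching the defaults above).
model = RIDNet(in_channels=3, mid_channels=64, out_channels=3, num_block=4).eval()
noisy = torch.rand(1, 3, 64, 64) * 255.0   # dummy image; img_range defaults to 255
with torch.no_grad():
    denoised = model(noisy)                # residual denoising: output = input + predicted residual
print(denoised.shape)                      # torch.Size([1, 3, 64, 64])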
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
basicsr/archs/rrdbnet_arch.py DELETED
@@ -1,119 +0,0 @@
1
- import torch
2
- from torch import nn as nn
3
- from torch.nn import functional as F
4
-
5
- from basicsr.utils.registry import ARCH_REGISTRY
6
- from .arch_util import default_init_weights, make_layer, pixel_unshuffle
7
-
8
-
9
- class ResidualDenseBlock(nn.Module):
10
- """Residual Dense Block.
11
-
12
- Used in RRDB block in ESRGAN.
13
-
14
- Args:
15
- num_feat (int): Channel number of intermediate features.
16
- num_grow_ch (int): Channels for each growth.
17
- """
18
-
19
- def __init__(self, num_feat=64, num_grow_ch=32):
20
- super(ResidualDenseBlock, self).__init__()
21
- self.conv1 = nn.Conv2d(num_feat, num_grow_ch, 3, 1, 1)
22
- self.conv2 = nn.Conv2d(num_feat + num_grow_ch, num_grow_ch, 3, 1, 1)
23
- self.conv3 = nn.Conv2d(num_feat + 2 * num_grow_ch, num_grow_ch, 3, 1, 1)
24
- self.conv4 = nn.Conv2d(num_feat + 3 * num_grow_ch, num_grow_ch, 3, 1, 1)
25
- self.conv5 = nn.Conv2d(num_feat + 4 * num_grow_ch, num_feat, 3, 1, 1)
26
-
27
- self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)
28
-
29
- # initialization
30
- default_init_weights([self.conv1, self.conv2, self.conv3, self.conv4, self.conv5], 0.1)
31
-
32
- def forward(self, x):
33
- x1 = self.lrelu(self.conv1(x))
34
- x2 = self.lrelu(self.conv2(torch.cat((x, x1), 1)))
35
- x3 = self.lrelu(self.conv3(torch.cat((x, x1, x2), 1)))
36
- x4 = self.lrelu(self.conv4(torch.cat((x, x1, x2, x3), 1)))
37
- x5 = self.conv5(torch.cat((x, x1, x2, x3, x4), 1))
38
- # Empirically, we use 0.2 to scale the residual for better performance
39
- return x5 * 0.2 + x
40
-
41
-
42
- class RRDB(nn.Module):
43
- """Residual in Residual Dense Block.
44
-
45
- Used in RRDB-Net in ESRGAN.
46
-
47
- Args:
48
- num_feat (int): Channel number of intermediate features.
49
- num_grow_ch (int): Channels for each growth.
50
- """
51
-
52
- def __init__(self, num_feat, num_grow_ch=32):
53
- super(RRDB, self).__init__()
54
- self.rdb1 = ResidualDenseBlock(num_feat, num_grow_ch)
55
- self.rdb2 = ResidualDenseBlock(num_feat, num_grow_ch)
56
- self.rdb3 = ResidualDenseBlock(num_feat, num_grow_ch)
57
-
58
- def forward(self, x):
59
- out = self.rdb1(x)
60
- out = self.rdb2(out)
61
- out = self.rdb3(out)
62
- # Empirically, we use 0.2 to scale the residual for better performance
63
- return out * 0.2 + x
64
-
65
-
66
- @ARCH_REGISTRY.register()
67
- class RRDBNet(nn.Module):
68
- """Networks consisting of Residual in Residual Dense Block, which is used
69
- in ESRGAN.
70
-
71
- ESRGAN: Enhanced Super-Resolution Generative Adversarial Networks.
72
-
73
- We extend ESRGAN for scale x2 and scale x1.
74
- Note: This is one option for scale 1, scale 2 in RRDBNet.
75
- We first employ pixel-unshuffle (an inverse operation of pixel-shuffle) to reduce the spatial size
76
- and enlarge the channel size before feeding inputs into the main ESRGAN architecture.
77
-
78
- Args:
79
- num_in_ch (int): Channel number of inputs.
80
- num_out_ch (int): Channel number of outputs.
81
- num_feat (int): Channel number of intermediate features.
82
- Default: 64
83
- num_block (int): Block number in the trunk network. Default: 23.
84
- num_grow_ch (int): Channels for each growth. Default: 32.
85
- """
86
-
87
- def __init__(self, num_in_ch, num_out_ch, scale=4, num_feat=64, num_block=23, num_grow_ch=32):
88
- super(RRDBNet, self).__init__()
89
- self.scale = scale
90
- if scale == 2:
91
- num_in_ch = num_in_ch * 4
92
- elif scale == 1:
93
- num_in_ch = num_in_ch * 16
94
- self.conv_first = nn.Conv2d(num_in_ch, num_feat, 3, 1, 1)
95
- self.body = make_layer(RRDB, num_block, num_feat=num_feat, num_grow_ch=num_grow_ch)
96
- self.conv_body = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
97
- # upsample
98
- self.conv_up1 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
99
- self.conv_up2 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
100
- self.conv_hr = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
101
- self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
102
-
103
- self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)
104
-
105
- def forward(self, x):
106
- if self.scale == 2:
107
- feat = pixel_unshuffle(x, scale=2)
108
- elif self.scale == 1:
109
- feat = pixel_unshuffle(x, scale=4)
110
- else:
111
- feat = x
112
- feat = self.conv_first(feat)
113
- body_feat = self.conv_body(self.body(feat))
114
- feat = feat + body_feat
115
- # upsample
116
- feat = self.lrelu(self.conv_up1(F.interpolate(feat, scale_factor=2, mode='nearest')))
117
- feat = self.lrelu(self.conv_up2(F.interpolate(feat, scale_factor=2, mode='nearest')))
118
- out = self.conv_last(self.lrelu(self.conv_hr(feat)))
119
- return out
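A minimal usage sketch for the RRDBNet (ESRGAN) generator deleted above (editor's illustration, not part of this commit; assumes basicsr remains importable):

import torch
from basicsr.archs.rrdbnet_arch import RRDBNet  # assumption: basicsr is installed elsewhere

net = RRDBNet(num_in_ch=3, num_out_ch=3, scale=4, num_feat=64, num_block=23, num_grow_ch=32).eval()
lr = torch.rand(1, 3, 32, 32)      # dummy low-resolution input
with torch.no_grad():
    sr = net(lr)                   # two nearest-neighbour x2 upsamples -> x4 output
print(sr.shape)                    # torch.Size([1, 3, 128, 128])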
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
basicsr/archs/spynet_arch.py DELETED
@@ -1,96 +0,0 @@
1
- import math
2
- import torch
3
- from torch import nn as nn
4
- from torch.nn import functional as F
5
-
6
- from basicsr.utils.registry import ARCH_REGISTRY
7
- from .arch_util import flow_warp
8
-
9
-
10
- class BasicModule(nn.Module):
11
- """Basic Module for SpyNet.
12
- """
13
-
14
- def __init__(self):
15
- super(BasicModule, self).__init__()
16
-
17
- self.basic_module = nn.Sequential(
18
- nn.Conv2d(in_channels=8, out_channels=32, kernel_size=7, stride=1, padding=3), nn.ReLU(inplace=False),
19
- nn.Conv2d(in_channels=32, out_channels=64, kernel_size=7, stride=1, padding=3), nn.ReLU(inplace=False),
20
- nn.Conv2d(in_channels=64, out_channels=32, kernel_size=7, stride=1, padding=3), nn.ReLU(inplace=False),
21
- nn.Conv2d(in_channels=32, out_channels=16, kernel_size=7, stride=1, padding=3), nn.ReLU(inplace=False),
22
- nn.Conv2d(in_channels=16, out_channels=2, kernel_size=7, stride=1, padding=3))
23
-
24
- def forward(self, tensor_input):
25
- return self.basic_module(tensor_input)
26
-
27
-
28
- @ARCH_REGISTRY.register()
29
- class SpyNet(nn.Module):
30
- """SpyNet architecture.
31
-
32
- Args:
33
- load_path (str): path for pretrained SpyNet. Default: None.
34
- """
35
-
36
- def __init__(self, load_path=None):
37
- super(SpyNet, self).__init__()
38
- self.basic_module = nn.ModuleList([BasicModule() for _ in range(6)])
39
- if load_path:
40
- self.load_state_dict(torch.load(load_path, map_location=lambda storage, loc: storage)['params'])
41
-
42
- self.register_buffer('mean', torch.Tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1))
43
- self.register_buffer('std', torch.Tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1))
44
-
45
- def preprocess(self, tensor_input):
46
- tensor_output = (tensor_input - self.mean) / self.std
47
- return tensor_output
48
-
49
- def process(self, ref, supp):
50
- flow = []
51
-
52
- ref = [self.preprocess(ref)]
53
- supp = [self.preprocess(supp)]
54
-
55
- for level in range(5):
56
- ref.insert(0, F.avg_pool2d(input=ref[0], kernel_size=2, stride=2, count_include_pad=False))
57
- supp.insert(0, F.avg_pool2d(input=supp[0], kernel_size=2, stride=2, count_include_pad=False))
58
-
59
- flow = ref[0].new_zeros(
60
- [ref[0].size(0), 2,
61
- int(math.floor(ref[0].size(2) / 2.0)),
62
- int(math.floor(ref[0].size(3) / 2.0))])
63
-
64
- for level in range(len(ref)):
65
- upsampled_flow = F.interpolate(input=flow, scale_factor=2, mode='bilinear', align_corners=True) * 2.0
66
-
67
- if upsampled_flow.size(2) != ref[level].size(2):
68
- upsampled_flow = F.pad(input=upsampled_flow, pad=[0, 0, 0, 1], mode='replicate')
69
- if upsampled_flow.size(3) != ref[level].size(3):
70
- upsampled_flow = F.pad(input=upsampled_flow, pad=[0, 1, 0, 0], mode='replicate')
71
-
72
- flow = self.basic_module[level](torch.cat([
73
- ref[level],
74
- flow_warp(
75
- supp[level], upsampled_flow.permute(0, 2, 3, 1), interp_mode='bilinear', padding_mode='border'),
76
- upsampled_flow
77
- ], 1)) + upsampled_flow
78
-
79
- return flow
80
-
81
- def forward(self, ref, supp):
82
- assert ref.size() == supp.size()
83
-
84
- h, w = ref.size(2), ref.size(3)
85
- w_floor = math.floor(math.ceil(w / 32.0) * 32.0)
86
- h_floor = math.floor(math.ceil(h / 32.0) * 32.0)
87
-
88
- ref = F.interpolate(input=ref, size=(h_floor, w_floor), mode='bilinear', align_corners=False)
89
- supp = F.interpolate(input=supp, size=(h_floor, w_floor), mode='bilinear', align_corners=False)
90
-
91
- flow = F.interpolate(input=self.process(ref, supp), size=(h, w), mode='bilinear', align_corners=False)
92
-
93
- flow[:, 0, :, :] *= float(w) / float(w_floor)
94
- flow[:, 1, :, :] *= float(h) / float(h_floor)
95
-
96
- return flow
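A minimal usage sketch for the SpyNet optical-flow estimator deleted above (editor's illustration, not part of this commit; the network is randomly initialised here, a real checkpoint path would normally be passed):

import torch
from basicsr.archs.spynet_arch import SpyNet  # assumption: basicsr is installed elsewhere

flow_net = SpyNet(load_path=None).eval()   # pass a pretrained checkpoint path for meaningful flow
ref = torch.rand(1, 3, 64, 64)             # reference frame
supp = torch.rand(1, 3, 64, 64)            # supporting frame (must match ref size)
with torch.no_grad():
    flow = flow_net(ref, supp)             # per-pixel (dx, dy) displacement field
print(flow.shape)                          # torch.Size([1, 2, 64, 64])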
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
basicsr/archs/srresnet_arch.py DELETED
@@ -1,65 +0,0 @@
1
- from torch import nn as nn
2
- from torch.nn import functional as F
3
-
4
- from basicsr.utils.registry import ARCH_REGISTRY
5
- from .arch_util import ResidualBlockNoBN, default_init_weights, make_layer
6
-
7
-
8
- @ARCH_REGISTRY.register()
9
- class MSRResNet(nn.Module):
10
- """Modified SRResNet.
11
-
12
- A compact version modified from SRResNet in
13
- "Photo-Realistic Single Image Super-Resolution Using a Generative Adversarial Network"
14
- It uses residual blocks without BN, similar to EDSR.
15
- Currently, it supports x2, x3 and x4 upsampling scale factors.
16
-
17
- Args:
18
- num_in_ch (int): Channel number of inputs. Default: 3.
19
- num_out_ch (int): Channel number of outputs. Default: 3.
20
- num_feat (int): Channel number of intermediate features. Default: 64.
21
- num_block (int): Block number in the body network. Default: 16.
22
- upscale (int): Upsampling factor. Support x2, x3 and x4. Default: 4.
23
- """
24
-
25
- def __init__(self, num_in_ch=3, num_out_ch=3, num_feat=64, num_block=16, upscale=4):
26
- super(MSRResNet, self).__init__()
27
- self.upscale = upscale
28
-
29
- self.conv_first = nn.Conv2d(num_in_ch, num_feat, 3, 1, 1)
30
- self.body = make_layer(ResidualBlockNoBN, num_block, num_feat=num_feat)
31
-
32
- # upsampling
33
- if self.upscale in [2, 3]:
34
- self.upconv1 = nn.Conv2d(num_feat, num_feat * self.upscale * self.upscale, 3, 1, 1)
35
- self.pixel_shuffle = nn.PixelShuffle(self.upscale)
36
- elif self.upscale == 4:
37
- self.upconv1 = nn.Conv2d(num_feat, num_feat * 4, 3, 1, 1)
38
- self.upconv2 = nn.Conv2d(num_feat, num_feat * 4, 3, 1, 1)
39
- self.pixel_shuffle = nn.PixelShuffle(2)
40
-
41
- self.conv_hr = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
42
- self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
43
-
44
- # activation function
45
- self.lrelu = nn.LeakyReLU(negative_slope=0.1, inplace=True)
46
-
47
- # initialization
48
- default_init_weights([self.conv_first, self.upconv1, self.conv_hr, self.conv_last], 0.1)
49
- if self.upscale == 4:
50
- default_init_weights(self.upconv2, 0.1)
51
-
52
- def forward(self, x):
53
- feat = self.lrelu(self.conv_first(x))
54
- out = self.body(feat)
55
-
56
- if self.upscale == 4:
57
- out = self.lrelu(self.pixel_shuffle(self.upconv1(out)))
58
- out = self.lrelu(self.pixel_shuffle(self.upconv2(out)))
59
- elif self.upscale in [2, 3]:
60
- out = self.lrelu(self.pixel_shuffle(self.upconv1(out)))
61
-
62
- out = self.conv_last(self.lrelu(self.conv_hr(out)))
63
- base = F.interpolate(x, scale_factor=self.upscale, mode='bilinear', align_corners=False)
64
- out += base
65
- return out
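A minimal usage sketch for the MSRResNet super-resolution network deleted above (editor's illustration, not part of this commit; assumes basicsr remains importable):

import torch
from basicsr.archs.srresnet_arch import MSRResNet  # assumption: basicsr is installed elsewhere

net = MSRResNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=16, upscale=4).eval()
lr = torch.rand(1, 3, 24, 24)      # dummy low-resolution input
with torch.no_grad():
    sr = net(lr)                   # bilinear-upsampled base + learned residual
print(sr.shape)                    # torch.Size([1, 3, 96, 96])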
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
basicsr/archs/srvgg_arch.py DELETED
@@ -1,70 +0,0 @@
1
- from torch import nn as nn
2
- from torch.nn import functional as F
3
-
4
- from basicsr.utils.registry import ARCH_REGISTRY
5
-
6
-
7
- @ARCH_REGISTRY.register(suffix='basicsr')
8
- class SRVGGNetCompact(nn.Module):
9
- """A compact VGG-style network structure for super-resolution.
10
-
11
- It is a compact network structure, which performs upsampling in the last layer and no convolution is
12
- conducted in the HR feature space.
13
-
14
- Args:
15
- num_in_ch (int): Channel number of inputs. Default: 3.
16
- num_out_ch (int): Channel number of outputs. Default: 3.
17
- num_feat (int): Channel number of intermediate features. Default: 64.
18
- num_conv (int): Number of convolution layers in the body network. Default: 16.
19
- upscale (int): Upsampling factor. Default: 4.
20
- act_type (str): Activation type, options: 'relu', 'prelu', 'leakyrelu'. Default: prelu.
21
- """
22
-
23
- def __init__(self, num_in_ch=3, num_out_ch=3, num_feat=64, num_conv=16, upscale=4, act_type='prelu'):
24
- super(SRVGGNetCompact, self).__init__()
25
- self.num_in_ch = num_in_ch
26
- self.num_out_ch = num_out_ch
27
- self.num_feat = num_feat
28
- self.num_conv = num_conv
29
- self.upscale = upscale
30
- self.act_type = act_type
31
-
32
- self.body = nn.ModuleList()
33
- # the first conv
34
- self.body.append(nn.Conv2d(num_in_ch, num_feat, 3, 1, 1))
35
- # the first activation
36
- if act_type == 'relu':
37
- activation = nn.ReLU(inplace=True)
38
- elif act_type == 'prelu':
39
- activation = nn.PReLU(num_parameters=num_feat)
40
- elif act_type == 'leakyrelu':
41
- activation = nn.LeakyReLU(negative_slope=0.1, inplace=True)
42
- self.body.append(activation)
43
-
44
- # the body structure
45
- for _ in range(num_conv):
46
- self.body.append(nn.Conv2d(num_feat, num_feat, 3, 1, 1))
47
- # activation
48
- if act_type == 'relu':
49
- activation = nn.ReLU(inplace=True)
50
- elif act_type == 'prelu':
51
- activation = nn.PReLU(num_parameters=num_feat)
52
- elif act_type == 'leakyrelu':
53
- activation = nn.LeakyReLU(negative_slope=0.1, inplace=True)
54
- self.body.append(activation)
55
-
56
- # the last conv
57
- self.body.append(nn.Conv2d(num_feat, num_out_ch * upscale * upscale, 3, 1, 1))
58
- # upsample
59
- self.upsampler = nn.PixelShuffle(upscale)
60
-
61
- def forward(self, x):
62
- out = x
63
- for i in range(0, len(self.body)):
64
- out = self.body[i](out)
65
-
66
- out = self.upsampler(out)
67
- # add the nearest upsampled image, so that the network learns the residual
68
- base = F.interpolate(x, scale_factor=self.upscale, mode='nearest')
69
- out += base
70
- return out
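A minimal usage sketch for the compact SRVGG network deleted above (editor's illustration, not part of this commit; assumes basicsr remains importable):

import torch
from basicsr.archs.srvgg_arch import SRVGGNetCompact  # assumption: basicsr is installed elsewhere

net = SRVGGNetCompact(num_in_ch=3, num_out_ch=3, num_feat=64, num_conv=16, upscale=4, act_type='prelu').eval()
lr = torch.rand(1, 3, 32, 32)      # dummy low-resolution input
with torch.no_grad():
    sr = net(lr)                   # pixel-shuffle upsample + nearest-upsampled residual base
print(sr.shape)                    # torch.Size([1, 3, 128, 128])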
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
basicsr/archs/stylegan2_arch.py DELETED
@@ -1,799 +0,0 @@
1
- import math
2
- import random
3
- import torch
4
- from torch import nn
5
- from torch.nn import functional as F
6
-
7
- from basicsr.ops.fused_act import FusedLeakyReLU, fused_leaky_relu
8
- from basicsr.ops.upfirdn2d import upfirdn2d
9
- from basicsr.utils.registry import ARCH_REGISTRY
10
-
11
-
12
- class NormStyleCode(nn.Module):
13
-
14
- def forward(self, x):
15
- """Normalize the style codes.
16
-
17
- Args:
18
- x (Tensor): Style codes with shape (b, c).
19
-
20
- Returns:
21
- Tensor: Normalized tensor.
22
- """
23
- return x * torch.rsqrt(torch.mean(x**2, dim=1, keepdim=True) + 1e-8)
24
-
25
-
26
- def make_resample_kernel(k):
27
- """Make resampling kernel for UpFirDn.
28
-
29
- Args:
30
- k (list[int]): A list indicating the 1D resample kernel magnitude.
31
-
32
- Returns:
33
- Tensor: 2D resampled kernel.
34
- """
35
- k = torch.tensor(k, dtype=torch.float32)
36
- if k.ndim == 1:
37
- k = k[None, :] * k[:, None] # to 2D kernel, outer product
38
- # normalize
39
- k /= k.sum()
40
- return k
41
-
42
-
43
- class UpFirDnUpsample(nn.Module):
44
- """Upsample, FIR filter, and downsample (upsampole version).
45
-
46
- References:
47
- 1. https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.upfirdn.html # noqa: E501
48
- 2. http://www.ece.northwestern.edu/local-apps/matlabhelp/toolbox/signal/upfirdn.html # noqa: E501
49
-
50
- Args:
51
- resample_kernel (list[int]): A list indicating the 1D resample kernel
52
- magnitude.
53
- factor (int): Upsampling scale factor. Default: 2.
54
- """
55
-
56
- def __init__(self, resample_kernel, factor=2):
57
- super(UpFirDnUpsample, self).__init__()
58
- self.kernel = make_resample_kernel(resample_kernel) * (factor**2)
59
- self.factor = factor
60
-
61
- pad = self.kernel.shape[0] - factor
62
- self.pad = ((pad + 1) // 2 + factor - 1, pad // 2)
63
-
64
- def forward(self, x):
65
- out = upfirdn2d(x, self.kernel.type_as(x), up=self.factor, down=1, pad=self.pad)
66
- return out
67
-
68
- def __repr__(self):
69
- return (f'{self.__class__.__name__}(factor={self.factor})')
70
-
71
-
72
- class UpFirDnDownsample(nn.Module):
73
- """Upsample, FIR filter, and downsample (downsampole version).
74
-
75
- Args:
76
- resample_kernel (list[int]): A list indicating the 1D resample kernel
77
- magnitude.
78
- factor (int): Downsampling scale factor. Default: 2.
79
- """
80
-
81
- def __init__(self, resample_kernel, factor=2):
82
- super(UpFirDnDownsample, self).__init__()
83
- self.kernel = make_resample_kernel(resample_kernel)
84
- self.factor = factor
85
-
86
- pad = self.kernel.shape[0] - factor
87
- self.pad = ((pad + 1) // 2, pad // 2)
88
-
89
- def forward(self, x):
90
- out = upfirdn2d(x, self.kernel.type_as(x), up=1, down=self.factor, pad=self.pad)
91
- return out
92
-
93
- def __repr__(self):
94
- return (f'{self.__class__.__name__}(factor={self.factor})')
95
-
96
-
97
- class UpFirDnSmooth(nn.Module):
98
- """Upsample, FIR filter, and downsample (smooth version).
99
-
100
- Args:
101
- resample_kernel (list[int]): A list indicating the 1D resample kernel
102
- magnitude.
103
- upsample_factor (int): Upsampling scale factor. Default: 1.
104
- downsample_factor (int): Downsampling scale factor. Default: 1.
105
- kernel_size (int): Kernel size: Default: 1.
106
- """
107
-
108
- def __init__(self, resample_kernel, upsample_factor=1, downsample_factor=1, kernel_size=1):
109
- super(UpFirDnSmooth, self).__init__()
110
- self.upsample_factor = upsample_factor
111
- self.downsample_factor = downsample_factor
112
- self.kernel = make_resample_kernel(resample_kernel)
113
- if upsample_factor > 1:
114
- self.kernel = self.kernel * (upsample_factor**2)
115
-
116
- if upsample_factor > 1:
117
- pad = (self.kernel.shape[0] - upsample_factor) - (kernel_size - 1)
118
- self.pad = ((pad + 1) // 2 + upsample_factor - 1, pad // 2 + 1)
119
- elif downsample_factor > 1:
120
- pad = (self.kernel.shape[0] - downsample_factor) + (kernel_size - 1)
121
- self.pad = ((pad + 1) // 2, pad // 2)
122
- else:
123
- raise NotImplementedError
124
-
125
- def forward(self, x):
126
- out = upfirdn2d(x, self.kernel.type_as(x), up=1, down=1, pad=self.pad)
127
- return out
128
-
129
- def __repr__(self):
130
- return (f'{self.__class__.__name__}(upsample_factor={self.upsample_factor}'
131
- f', downsample_factor={self.downsample_factor})')
132
-
133
-
134
- class EqualLinear(nn.Module):
135
- """Equalized Linear as StyleGAN2.
136
-
137
- Args:
138
- in_channels (int): Size of each sample.
139
- out_channels (int): Size of each output sample.
140
- bias (bool): If set to ``False``, the layer will not learn an additive
141
- bias. Default: ``True``.
142
- bias_init_val (float): Bias initialized value. Default: 0.
143
- lr_mul (float): Learning rate multiplier. Default: 1.
144
- activation (None | str): The activation after ``linear`` operation.
145
- Supported: 'fused_lrelu', None. Default: None.
146
- """
147
-
148
- def __init__(self, in_channels, out_channels, bias=True, bias_init_val=0, lr_mul=1, activation=None):
149
- super(EqualLinear, self).__init__()
150
- self.in_channels = in_channels
151
- self.out_channels = out_channels
152
- self.lr_mul = lr_mul
153
- self.activation = activation
154
- if self.activation not in ['fused_lrelu', None]:
155
- raise ValueError(f'Wrong activation value in EqualLinear: {activation}'
156
- "Supported ones are: ['fused_lrelu', None].")
157
- self.scale = (1 / math.sqrt(in_channels)) * lr_mul
158
-
159
- self.weight = nn.Parameter(torch.randn(out_channels, in_channels).div_(lr_mul))
160
- if bias:
161
- self.bias = nn.Parameter(torch.zeros(out_channels).fill_(bias_init_val))
162
- else:
163
- self.register_parameter('bias', None)
164
-
165
- def forward(self, x):
166
- if self.bias is None:
167
- bias = None
168
- else:
169
- bias = self.bias * self.lr_mul
170
- if self.activation == 'fused_lrelu':
171
- out = F.linear(x, self.weight * self.scale)
172
- out = fused_leaky_relu(out, bias)
173
- else:
174
- out = F.linear(x, self.weight * self.scale, bias=bias)
175
- return out
176
-
177
- def __repr__(self):
178
- return (f'{self.__class__.__name__}(in_channels={self.in_channels}, '
179
- f'out_channels={self.out_channels}, bias={self.bias is not None})')
180
-
181
-
182
- class ModulatedConv2d(nn.Module):
183
- """Modulated Conv2d used in StyleGAN2.
184
-
185
- There is no bias in ModulatedConv2d.
186
-
187
- Args:
188
- in_channels (int): Channel number of the input.
189
- out_channels (int): Channel number of the output.
190
- kernel_size (int): Size of the convolving kernel.
191
- num_style_feat (int): Channel number of style features.
192
- demodulate (bool): Whether to demodulate in the conv layer.
193
- Default: True.
194
- sample_mode (str | None): Indicating 'upsample', 'downsample' or None.
195
- Default: None.
196
- resample_kernel (list[int]): A list indicating the 1D resample kernel
197
- magnitude. Default: (1, 3, 3, 1).
198
- eps (float): A value added to the denominator for numerical stability.
199
- Default: 1e-8.
200
- """
201
-
202
- def __init__(self,
203
- in_channels,
204
- out_channels,
205
- kernel_size,
206
- num_style_feat,
207
- demodulate=True,
208
- sample_mode=None,
209
- resample_kernel=(1, 3, 3, 1),
210
- eps=1e-8):
211
- super(ModulatedConv2d, self).__init__()
212
- self.in_channels = in_channels
213
- self.out_channels = out_channels
214
- self.kernel_size = kernel_size
215
- self.demodulate = demodulate
216
- self.sample_mode = sample_mode
217
- self.eps = eps
218
-
219
- if self.sample_mode == 'upsample':
220
- self.smooth = UpFirDnSmooth(
221
- resample_kernel, upsample_factor=2, downsample_factor=1, kernel_size=kernel_size)
222
- elif self.sample_mode == 'downsample':
223
- self.smooth = UpFirDnSmooth(
224
- resample_kernel, upsample_factor=1, downsample_factor=2, kernel_size=kernel_size)
225
- elif self.sample_mode is None:
226
- pass
227
- else:
228
- raise ValueError(f'Wrong sample mode {self.sample_mode}, '
229
- "supported ones are ['upsample', 'downsample', None].")
230
-
231
- self.scale = 1 / math.sqrt(in_channels * kernel_size**2)
232
- # modulation inside each modulated conv
233
- self.modulation = EqualLinear(
234
- num_style_feat, in_channels, bias=True, bias_init_val=1, lr_mul=1, activation=None)
235
-
236
- self.weight = nn.Parameter(torch.randn(1, out_channels, in_channels, kernel_size, kernel_size))
237
- self.padding = kernel_size // 2
238
-
239
- def forward(self, x, style):
240
- """Forward function.
241
-
242
- Args:
243
- x (Tensor): Tensor with shape (b, c, h, w).
244
- style (Tensor): Tensor with shape (b, num_style_feat).
245
-
246
- Returns:
247
- Tensor: Modulated tensor after convolution.
248
- """
249
- b, c, h, w = x.shape # c = c_in
250
- # weight modulation
251
- style = self.modulation(style).view(b, 1, c, 1, 1)
252
- # self.weight: (1, c_out, c_in, k, k); style: (b, 1, c, 1, 1)
253
- weight = self.scale * self.weight * style # (b, c_out, c_in, k, k)
254
-
255
- if self.demodulate:
256
- demod = torch.rsqrt(weight.pow(2).sum([2, 3, 4]) + self.eps)
257
- weight = weight * demod.view(b, self.out_channels, 1, 1, 1)
258
-
259
- weight = weight.view(b * self.out_channels, c, self.kernel_size, self.kernel_size)
260
-
261
- if self.sample_mode == 'upsample':
262
- x = x.view(1, b * c, h, w)
263
- weight = weight.view(b, self.out_channels, c, self.kernel_size, self.kernel_size)
264
- weight = weight.transpose(1, 2).reshape(b * c, self.out_channels, self.kernel_size, self.kernel_size)
265
- out = F.conv_transpose2d(x, weight, padding=0, stride=2, groups=b)
266
- out = out.view(b, self.out_channels, *out.shape[2:4])
267
- out = self.smooth(out)
268
- elif self.sample_mode == 'downsample':
269
- x = self.smooth(x)
270
- x = x.view(1, b * c, *x.shape[2:4])
271
- out = F.conv2d(x, weight, padding=0, stride=2, groups=b)
272
- out = out.view(b, self.out_channels, *out.shape[2:4])
273
- else:
274
- x = x.view(1, b * c, h, w)
275
- # weight: (b*c_out, c_in, k, k), groups=b
276
- out = F.conv2d(x, weight, padding=self.padding, groups=b)
277
- out = out.view(b, self.out_channels, *out.shape[2:4])
278
-
279
- return out
280
-
281
- def __repr__(self):
282
- return (f'{self.__class__.__name__}(in_channels={self.in_channels}, '
283
- f'out_channels={self.out_channels}, '
284
- f'kernel_size={self.kernel_size}, '
285
- f'demodulate={self.demodulate}, sample_mode={self.sample_mode})')
286
-
287
-
288
- class StyleConv(nn.Module):
289
- """Style conv.
290
-
291
- Args:
292
- in_channels (int): Channel number of the input.
293
- out_channels (int): Channel number of the output.
294
- kernel_size (int): Size of the convolving kernel.
295
- num_style_feat (int): Channel number of style features.
296
- demodulate (bool): Whether demodulate in the conv layer. Default: True.
297
- sample_mode (str | None): Indicating 'upsample', 'downsample' or None.
298
- Default: None.
299
- resample_kernel (list[int]): A list indicating the 1D resample kernel
300
- magnitude. Default: (1, 3, 3, 1).
301
- """
302
-
303
- def __init__(self,
304
- in_channels,
305
- out_channels,
306
- kernel_size,
307
- num_style_feat,
308
- demodulate=True,
309
- sample_mode=None,
310
- resample_kernel=(1, 3, 3, 1)):
311
- super(StyleConv, self).__init__()
312
- self.modulated_conv = ModulatedConv2d(
313
- in_channels,
314
- out_channels,
315
- kernel_size,
316
- num_style_feat,
317
- demodulate=demodulate,
318
- sample_mode=sample_mode,
319
- resample_kernel=resample_kernel)
320
- self.weight = nn.Parameter(torch.zeros(1)) # for noise injection
321
- self.activate = FusedLeakyReLU(out_channels)
322
-
323
- def forward(self, x, style, noise=None):
324
- # modulate
325
- out = self.modulated_conv(x, style)
326
- # noise injection
327
- if noise is None:
328
- b, _, h, w = out.shape
329
- noise = out.new_empty(b, 1, h, w).normal_()
330
- out = out + self.weight * noise
331
- # activation (with bias)
332
- out = self.activate(out)
333
- return out
334
-
335
-
336
- class ToRGB(nn.Module):
337
- """To RGB from features.
338
-
339
- Args:
340
- in_channels (int): Channel number of input.
341
- num_style_feat (int): Channel number of style features.
342
- upsample (bool): Whether to upsample. Default: True.
343
- resample_kernel (list[int]): A list indicating the 1D resample kernel
344
- magnitude. Default: (1, 3, 3, 1).
345
- """
346
-
347
- def __init__(self, in_channels, num_style_feat, upsample=True, resample_kernel=(1, 3, 3, 1)):
348
- super(ToRGB, self).__init__()
349
- if upsample:
350
- self.upsample = UpFirDnUpsample(resample_kernel, factor=2)
351
- else:
352
- self.upsample = None
353
- self.modulated_conv = ModulatedConv2d(
354
- in_channels, 3, kernel_size=1, num_style_feat=num_style_feat, demodulate=False, sample_mode=None)
355
- self.bias = nn.Parameter(torch.zeros(1, 3, 1, 1))
356
-
357
- def forward(self, x, style, skip=None):
358
- """Forward function.
359
-
360
- Args:
361
- x (Tensor): Feature tensor with shape (b, c, h, w).
362
- style (Tensor): Tensor with shape (b, num_style_feat).
363
- skip (Tensor): Base/skip tensor. Default: None.
364
-
365
- Returns:
366
- Tensor: RGB images.
367
- """
368
- out = self.modulated_conv(x, style)
369
- out = out + self.bias
370
- if skip is not None:
371
- if self.upsample:
372
- skip = self.upsample(skip)
373
- out = out + skip
374
- return out
375
-
376
-
377
- class ConstantInput(nn.Module):
378
- """Constant input.
379
-
380
- Args:
381
- num_channel (int): Channel number of constant input.
382
- size (int): Spatial size of constant input.
383
- """
384
-
385
- def __init__(self, num_channel, size):
386
- super(ConstantInput, self).__init__()
387
- self.weight = nn.Parameter(torch.randn(1, num_channel, size, size))
388
-
389
- def forward(self, batch):
390
- out = self.weight.repeat(batch, 1, 1, 1)
391
- return out
392
-
393
-
394
- @ARCH_REGISTRY.register()
395
- class StyleGAN2Generator(nn.Module):
396
- """StyleGAN2 Generator.
397
-
398
- Args:
399
- out_size (int): The spatial size of outputs.
400
- num_style_feat (int): Channel number of style features. Default: 512.
401
- num_mlp (int): Layer number of MLP style layers. Default: 8.
402
- channel_multiplier (int): Channel multiplier for large networks of
403
- StyleGAN2. Default: 2.
404
- resample_kernel (list[int]): A list indicating the 1D resample kernel
405
- magnitude. An outer product will be applied to extend the 1D resample
406
- kernel to a 2D resample kernel. Default: (1, 3, 3, 1).
407
- lr_mlp (float): Learning rate multiplier for mlp layers. Default: 0.01.
408
- narrow (float): Narrow ratio for channels. Default: 1.0.
409
- """
410
-
411
- def __init__(self,
412
- out_size,
413
- num_style_feat=512,
414
- num_mlp=8,
415
- channel_multiplier=2,
416
- resample_kernel=(1, 3, 3, 1),
417
- lr_mlp=0.01,
418
- narrow=1):
419
- super(StyleGAN2Generator, self).__init__()
420
- # Style MLP layers
421
- self.num_style_feat = num_style_feat
422
- style_mlp_layers = [NormStyleCode()]
423
- for i in range(num_mlp):
424
- style_mlp_layers.append(
425
- EqualLinear(
426
- num_style_feat, num_style_feat, bias=True, bias_init_val=0, lr_mul=lr_mlp,
427
- activation='fused_lrelu'))
428
- self.style_mlp = nn.Sequential(*style_mlp_layers)
429
-
430
- channels = {
431
- '4': int(512 * narrow),
432
- '8': int(512 * narrow),
433
- '16': int(512 * narrow),
434
- '32': int(512 * narrow),
435
- '64': int(256 * channel_multiplier * narrow),
436
- '128': int(128 * channel_multiplier * narrow),
437
- '256': int(64 * channel_multiplier * narrow),
438
- '512': int(32 * channel_multiplier * narrow),
439
- '1024': int(16 * channel_multiplier * narrow)
440
- }
441
- self.channels = channels
442
-
443
- self.constant_input = ConstantInput(channels['4'], size=4)
444
- self.style_conv1 = StyleConv(
445
- channels['4'],
446
- channels['4'],
447
- kernel_size=3,
448
- num_style_feat=num_style_feat,
449
- demodulate=True,
450
- sample_mode=None,
451
- resample_kernel=resample_kernel)
452
- self.to_rgb1 = ToRGB(channels['4'], num_style_feat, upsample=False, resample_kernel=resample_kernel)
453
-
454
- self.log_size = int(math.log(out_size, 2))
455
- self.num_layers = (self.log_size - 2) * 2 + 1
456
- self.num_latent = self.log_size * 2 - 2
457
-
458
- self.style_convs = nn.ModuleList()
459
- self.to_rgbs = nn.ModuleList()
460
- self.noises = nn.Module()
461
-
462
- in_channels = channels['4']
463
- # noise
464
- for layer_idx in range(self.num_layers):
465
- resolution = 2**((layer_idx + 5) // 2)
466
- shape = [1, 1, resolution, resolution]
467
- self.noises.register_buffer(f'noise{layer_idx}', torch.randn(*shape))
468
- # style convs and to_rgbs
469
- for i in range(3, self.log_size + 1):
470
- out_channels = channels[f'{2**i}']
471
- self.style_convs.append(
472
- StyleConv(
473
- in_channels,
474
- out_channels,
475
- kernel_size=3,
476
- num_style_feat=num_style_feat,
477
- demodulate=True,
478
- sample_mode='upsample',
479
- resample_kernel=resample_kernel,
480
- ))
481
- self.style_convs.append(
482
- StyleConv(
483
- out_channels,
484
- out_channels,
485
- kernel_size=3,
486
- num_style_feat=num_style_feat,
487
- demodulate=True,
488
- sample_mode=None,
489
- resample_kernel=resample_kernel))
490
- self.to_rgbs.append(ToRGB(out_channels, num_style_feat, upsample=True, resample_kernel=resample_kernel))
491
- in_channels = out_channels
492
-
493
- def make_noise(self):
494
- """Make noise for noise injection."""
495
- device = self.constant_input.weight.device
496
- noises = [torch.randn(1, 1, 4, 4, device=device)]
497
-
498
- for i in range(3, self.log_size + 1):
499
- for _ in range(2):
500
- noises.append(torch.randn(1, 1, 2**i, 2**i, device=device))
501
-
502
- return noises
503
-
504
- def get_latent(self, x):
505
- return self.style_mlp(x)
506
-
507
- def mean_latent(self, num_latent):
508
- latent_in = torch.randn(num_latent, self.num_style_feat, device=self.constant_input.weight.device)
509
- latent = self.style_mlp(latent_in).mean(0, keepdim=True)
510
- return latent
511
-
512
- def forward(self,
513
- styles,
514
- input_is_latent=False,
515
- noise=None,
516
- randomize_noise=True,
517
- truncation=1,
518
- truncation_latent=None,
519
- inject_index=None,
520
- return_latents=False):
521
- """Forward function for StyleGAN2Generator.
522
-
523
- Args:
524
- styles (list[Tensor]): Sample codes of styles.
525
- input_is_latent (bool): Whether input is latent style.
526
- Default: False.
527
- noise (Tensor | None): Input noise or None. Default: None.
528
- randomize_noise (bool): Randomize noise, used when 'noise' is
529
- None. Default: True.
530
- truncation (float): Truncation ratio for the truncation trick. Default: 1.
531
- truncation_latent (Tensor | None): Mean latent used by the truncation trick. Default: None.
532
- inject_index (int | None): The injection index for mixing noise.
533
- Default: None.
534
- return_latents (bool): Whether to return style latents.
535
- Default: False.
536
- """
537
- # style codes -> latents with Style MLP layer
538
- if not input_is_latent:
539
- styles = [self.style_mlp(s) for s in styles]
540
- # noises
541
- if noise is None:
542
- if randomize_noise:
543
- noise = [None] * self.num_layers # for each style conv layer
544
- else: # use the stored noise
545
- noise = [getattr(self.noises, f'noise{i}') for i in range(self.num_layers)]
546
- # style truncation
547
- if truncation < 1:
548
- style_truncation = []
549
- for style in styles:
550
- style_truncation.append(truncation_latent + truncation * (style - truncation_latent))
551
- styles = style_truncation
552
- # get style latent with injection
553
- if len(styles) == 1:
554
- inject_index = self.num_latent
555
-
556
- if styles[0].ndim < 3:
557
- # repeat latent code for all the layers
558
- latent = styles[0].unsqueeze(1).repeat(1, inject_index, 1)
559
- else: # used for encoder with different latent code for each layer
560
- latent = styles[0]
561
- elif len(styles) == 2: # mixing noises
562
- if inject_index is None:
563
- inject_index = random.randint(1, self.num_latent - 1)
564
- latent1 = styles[0].unsqueeze(1).repeat(1, inject_index, 1)
565
- latent2 = styles[1].unsqueeze(1).repeat(1, self.num_latent - inject_index, 1)
566
- latent = torch.cat([latent1, latent2], 1)
567
-
568
- # main generation
569
- out = self.constant_input(latent.shape[0])
570
- out = self.style_conv1(out, latent[:, 0], noise=noise[0])
571
- skip = self.to_rgb1(out, latent[:, 1])
572
-
573
- i = 1
574
- for conv1, conv2, noise1, noise2, to_rgb in zip(self.style_convs[::2], self.style_convs[1::2], noise[1::2],
575
- noise[2::2], self.to_rgbs):
576
- out = conv1(out, latent[:, i], noise=noise1)
577
- out = conv2(out, latent[:, i + 1], noise=noise2)
578
- skip = to_rgb(out, latent[:, i + 2], skip)
579
- i += 2
580
-
581
- image = skip
582
-
583
- if return_latents:
584
- return image, latent
585
- else:
586
- return image, None
587
-
588
-
589
- class ScaledLeakyReLU(nn.Module):
590
- """Scaled LeakyReLU.
591
-
592
- Args:
593
- negative_slope (float): Negative slope. Default: 0.2.
594
- """
595
-
596
- def __init__(self, negative_slope=0.2):
597
- super(ScaledLeakyReLU, self).__init__()
598
- self.negative_slope = negative_slope
599
-
600
- def forward(self, x):
601
- out = F.leaky_relu(x, negative_slope=self.negative_slope)
602
- return out * math.sqrt(2)
603
-
604
-
605
- class EqualConv2d(nn.Module):
606
- """Equalized Linear as StyleGAN2.
607
-
608
- Args:
609
- in_channels (int): Channel number of the input.
610
- out_channels (int): Channel number of the output.
611
- kernel_size (int): Size of the convolving kernel.
612
- stride (int): Stride of the convolution. Default: 1
613
- padding (int): Zero-padding added to both sides of the input.
614
- Default: 0.
615
- bias (bool): If ``True``, adds a learnable bias to the output.
616
- Default: ``True``.
617
- bias_init_val (float): Bias initialized value. Default: 0.
618
- """
619
-
620
- def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, bias=True, bias_init_val=0):
621
- super(EqualConv2d, self).__init__()
622
- self.in_channels = in_channels
623
- self.out_channels = out_channels
624
- self.kernel_size = kernel_size
625
- self.stride = stride
626
- self.padding = padding
627
- self.scale = 1 / math.sqrt(in_channels * kernel_size**2)
628
-
629
- self.weight = nn.Parameter(torch.randn(out_channels, in_channels, kernel_size, kernel_size))
630
- if bias:
631
- self.bias = nn.Parameter(torch.zeros(out_channels).fill_(bias_init_val))
632
- else:
633
- self.register_parameter('bias', None)
634
-
635
- def forward(self, x):
636
- out = F.conv2d(
637
- x,
638
- self.weight * self.scale,
639
- bias=self.bias,
640
- stride=self.stride,
641
- padding=self.padding,
642
- )
643
-
644
- return out
645
-
646
- def __repr__(self):
647
- return (f'{self.__class__.__name__}(in_channels={self.in_channels}, '
648
- f'out_channels={self.out_channels}, '
649
- f'kernel_size={self.kernel_size},'
650
- f' stride={self.stride}, padding={self.padding}, '
651
- f'bias={self.bias is not None})')
652
-
653
-
654
- class ConvLayer(nn.Sequential):
655
- """Conv Layer used in StyleGAN2 Discriminator.
656
-
657
- Args:
658
- in_channels (int): Channel number of the input.
659
- out_channels (int): Channel number of the output.
660
- kernel_size (int): Kernel size.
661
- downsample (bool): Whether downsample by a factor of 2.
662
- Default: False.
663
- resample_kernel (list[int]): A list indicating the 1D resample
664
- kernel magnitude. An outer product will be applied to
665
- extend the 1D resample kernel to a 2D resample kernel.
666
- Default: (1, 3, 3, 1).
667
- bias (bool): Whether with bias. Default: True.
668
- activate (bool): Whether to use activation. Default: True.
669
- """
670
-
671
- def __init__(self,
672
- in_channels,
673
- out_channels,
674
- kernel_size,
675
- downsample=False,
676
- resample_kernel=(1, 3, 3, 1),
677
- bias=True,
678
- activate=True):
679
- layers = []
680
- # downsample
681
- if downsample:
682
- layers.append(
683
- UpFirDnSmooth(resample_kernel, upsample_factor=1, downsample_factor=2, kernel_size=kernel_size))
684
- stride = 2
685
- self.padding = 0
686
- else:
687
- stride = 1
688
- self.padding = kernel_size // 2
689
- # conv
690
- layers.append(
691
- EqualConv2d(
692
- in_channels, out_channels, kernel_size, stride=stride, padding=self.padding, bias=bias
693
- and not activate))
694
- # activation
695
- if activate:
696
- if bias:
697
- layers.append(FusedLeakyReLU(out_channels))
698
- else:
699
- layers.append(ScaledLeakyReLU(0.2))
700
-
701
- super(ConvLayer, self).__init__(*layers)
702
-
703
-
704
- class ResBlock(nn.Module):
705
- """Residual block used in StyleGAN2 Discriminator.
706
-
707
- Args:
708
- in_channels (int): Channel number of the input.
709
- out_channels (int): Channel number of the output.
710
- resample_kernel (list[int]): A list indicating the 1D resample
711
- kernel magnitude. An outer product will be applied to
712
- extend the 1D resample kernel to a 2D resample kernel.
713
- Default: (1, 3, 3, 1).
714
- """
715
-
716
- def __init__(self, in_channels, out_channels, resample_kernel=(1, 3, 3, 1)):
717
- super(ResBlock, self).__init__()
718
-
719
- self.conv1 = ConvLayer(in_channels, in_channels, 3, bias=True, activate=True)
720
- self.conv2 = ConvLayer(
721
- in_channels, out_channels, 3, downsample=True, resample_kernel=resample_kernel, bias=True, activate=True)
722
- self.skip = ConvLayer(
723
- in_channels, out_channels, 1, downsample=True, resample_kernel=resample_kernel, bias=False, activate=False)
724
-
725
- def forward(self, x):
726
- out = self.conv1(x)
727
- out = self.conv2(out)
728
- skip = self.skip(x)
729
- out = (out + skip) / math.sqrt(2)
730
- return out
731
-
732
-
733
- @ARCH_REGISTRY.register()
734
- class StyleGAN2Discriminator(nn.Module):
735
- """StyleGAN2 Discriminator.
736
-
737
- Args:
738
- out_size (int): The spatial size of outputs.
739
- channel_multiplier (int): Channel multiplier for large networks of
740
- StyleGAN2. Default: 2.
741
- resample_kernel (list[int]): A list indicating the 1D resample kernel
742
- magnitude. An outer product will be applied to extend the 1D resample
743
- kernel to a 2D resample kernel. Default: (1, 3, 3, 1).
744
- stddev_group (int): For group stddev statistics. Default: 4.
745
- narrow (float): Narrow ratio for channels. Default: 1.0.
746
- """
747
-
748
- def __init__(self, out_size, channel_multiplier=2, resample_kernel=(1, 3, 3, 1), stddev_group=4, narrow=1):
749
- super(StyleGAN2Discriminator, self).__init__()
750
-
751
- channels = {
752
- '4': int(512 * narrow),
753
- '8': int(512 * narrow),
754
- '16': int(512 * narrow),
755
- '32': int(512 * narrow),
756
- '64': int(256 * channel_multiplier * narrow),
757
- '128': int(128 * channel_multiplier * narrow),
758
- '256': int(64 * channel_multiplier * narrow),
759
- '512': int(32 * channel_multiplier * narrow),
760
- '1024': int(16 * channel_multiplier * narrow)
761
- }
762
-
763
- log_size = int(math.log(out_size, 2))
764
-
765
- conv_body = [ConvLayer(3, channels[f'{out_size}'], 1, bias=True, activate=True)]
766
-
767
- in_channels = channels[f'{out_size}']
768
- for i in range(log_size, 2, -1):
769
- out_channels = channels[f'{2**(i - 1)}']
770
- conv_body.append(ResBlock(in_channels, out_channels, resample_kernel))
771
- in_channels = out_channels
772
- self.conv_body = nn.Sequential(*conv_body)
773
-
774
- self.final_conv = ConvLayer(in_channels + 1, channels['4'], 3, bias=True, activate=True)
775
- self.final_linear = nn.Sequential(
776
- EqualLinear(
777
- channels['4'] * 4 * 4, channels['4'], bias=True, bias_init_val=0, lr_mul=1, activation='fused_lrelu'),
778
- EqualLinear(channels['4'], 1, bias=True, bias_init_val=0, lr_mul=1, activation=None),
779
- )
780
- self.stddev_group = stddev_group
781
- self.stddev_feat = 1
782
-
783
- def forward(self, x):
784
- out = self.conv_body(x)
785
-
786
- b, c, h, w = out.shape
787
- # concatenate a group stddev statistics to out
788
- group = min(b, self.stddev_group) # Minibatch must be divisible by (or smaller than) group_size
789
- stddev = out.view(group, -1, self.stddev_feat, c // self.stddev_feat, h, w)
790
- stddev = torch.sqrt(stddev.var(0, unbiased=False) + 1e-8)
791
- stddev = stddev.mean([2, 3, 4], keepdims=True).squeeze(2)
792
- stddev = stddev.repeat(group, 1, h, w)
793
- out = torch.cat([out, stddev], 1)
794
-
795
- out = self.final_conv(out)
796
- out = out.view(b, -1)
797
- out = self.final_linear(out)
798
-
799
- return out
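A minimal usage sketch for the StyleGAN2 generator/discriminator pair deleted above (editor's illustration, not part of this commit; both classes depend on the custom fused_act and upfirdn2d ops shipped with basicsr, so this assumes those CUDA extensions are built and basicsr remains importable):

import torch
from basicsr.archs.stylegan2_arch import StyleGAN2Generator, StyleGAN2Discriminator

gen = StyleGAN2Generator(out_size=256, num_style_feat=512, num_mlp=8).eval()
disc = StyleGAN2Discriminator(out_size=256).eval()
z = torch.randn(1, 512)                         # a single style code
with torch.no_grad():
    image, _ = gen([z], input_is_latent=False)  # forward returns (image, latents-or-None)
    score = disc(image)                         # realness logit, shape (1, 1)
print(image.shape, score.shape)                 # torch.Size([1, 3, 256, 256]) torch.Size([1, 1])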
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
basicsr/archs/stylegan2_bilinear_arch.py DELETED
@@ -1,614 +0,0 @@
1
- import math
2
- import random
3
- import torch
4
- from torch import nn
5
- from torch.nn import functional as F
6
-
7
- from basicsr.ops.fused_act import FusedLeakyReLU, fused_leaky_relu
8
- from basicsr.utils.registry import ARCH_REGISTRY
9
-
10
-
11
- class NormStyleCode(nn.Module):
12
-
13
- def forward(self, x):
14
- """Normalize the style codes.
15
-
16
- Args:
17
- x (Tensor): Style codes with shape (b, c).
18
-
19
- Returns:
20
- Tensor: Normalized tensor.
21
- """
22
- return x * torch.rsqrt(torch.mean(x**2, dim=1, keepdim=True) + 1e-8)
23
-
24
-
25
- class EqualLinear(nn.Module):
26
- """Equalized Linear as StyleGAN2.
27
-
28
- Args:
29
- in_channels (int): Size of each sample.
30
- out_channels (int): Size of each output sample.
31
- bias (bool): If set to ``False``, the layer will not learn an additive
32
- bias. Default: ``True``.
33
- bias_init_val (float): Bias initialized value. Default: 0.
34
- lr_mul (float): Learning rate multiplier. Default: 1.
35
- activation (None | str): The activation after ``linear`` operation.
36
- Supported: 'fused_lrelu', None. Default: None.
37
- """
38
-
39
- def __init__(self, in_channels, out_channels, bias=True, bias_init_val=0, lr_mul=1, activation=None):
40
- super(EqualLinear, self).__init__()
41
- self.in_channels = in_channels
42
- self.out_channels = out_channels
43
- self.lr_mul = lr_mul
44
- self.activation = activation
45
- if self.activation not in ['fused_lrelu', None]:
46
- raise ValueError(f'Wrong activation value in EqualLinear: {activation}'
47
- "Supported ones are: ['fused_lrelu', None].")
48
- self.scale = (1 / math.sqrt(in_channels)) * lr_mul
49
-
50
- self.weight = nn.Parameter(torch.randn(out_channels, in_channels).div_(lr_mul))
51
- if bias:
52
- self.bias = nn.Parameter(torch.zeros(out_channels).fill_(bias_init_val))
53
- else:
54
- self.register_parameter('bias', None)
55
-
56
- def forward(self, x):
57
- if self.bias is None:
58
- bias = None
59
- else:
60
- bias = self.bias * self.lr_mul
61
- if self.activation == 'fused_lrelu':
62
- out = F.linear(x, self.weight * self.scale)
63
- out = fused_leaky_relu(out, bias)
64
- else:
65
- out = F.linear(x, self.weight * self.scale, bias=bias)
66
- return out
67
-
68
- def __repr__(self):
69
- return (f'{self.__class__.__name__}(in_channels={self.in_channels}, '
70
- f'out_channels={self.out_channels}, bias={self.bias is not None})')
71
-
72
-
73
- class ModulatedConv2d(nn.Module):
74
- """Modulated Conv2d used in StyleGAN2.
75
-
76
- There is no bias in ModulatedConv2d.
77
-
78
- Args:
79
- in_channels (int): Channel number of the input.
80
- out_channels (int): Channel number of the output.
81
- kernel_size (int): Size of the convolving kernel.
82
- num_style_feat (int): Channel number of style features.
83
- demodulate (bool): Whether to demodulate in the conv layer.
84
- Default: True.
85
- sample_mode (str | None): Indicating 'upsample', 'downsample' or None.
86
- Default: None.
87
- eps (float): A value added to the denominator for numerical stability.
88
- Default: 1e-8.
89
- """
90
-
91
- def __init__(self,
92
- in_channels,
93
- out_channels,
94
- kernel_size,
95
- num_style_feat,
96
- demodulate=True,
97
- sample_mode=None,
98
- eps=1e-8,
99
- interpolation_mode='bilinear'):
100
- super(ModulatedConv2d, self).__init__()
101
- self.in_channels = in_channels
102
- self.out_channels = out_channels
103
- self.kernel_size = kernel_size
104
- self.demodulate = demodulate
105
- self.sample_mode = sample_mode
106
- self.eps = eps
107
- self.interpolation_mode = interpolation_mode
108
- if self.interpolation_mode == 'nearest':
109
- self.align_corners = None
110
- else:
111
- self.align_corners = False
112
-
113
- self.scale = 1 / math.sqrt(in_channels * kernel_size**2)
114
- # modulation inside each modulated conv
115
- self.modulation = EqualLinear(
116
- num_style_feat, in_channels, bias=True, bias_init_val=1, lr_mul=1, activation=None)
117
-
118
- self.weight = nn.Parameter(torch.randn(1, out_channels, in_channels, kernel_size, kernel_size))
119
- self.padding = kernel_size // 2
120
-
121
- def forward(self, x, style):
122
- """Forward function.
123
-
124
- Args:
125
- x (Tensor): Tensor with shape (b, c, h, w).
126
- style (Tensor): Tensor with shape (b, num_style_feat).
127
-
128
- Returns:
129
- Tensor: Modulated tensor after convolution.
130
- """
131
- b, c, h, w = x.shape # c = c_in
132
- # weight modulation
133
- style = self.modulation(style).view(b, 1, c, 1, 1)
134
- # self.weight: (1, c_out, c_in, k, k); style: (b, 1, c, 1, 1)
135
- weight = self.scale * self.weight * style # (b, c_out, c_in, k, k)
136
-
137
- if self.demodulate:
138
- demod = torch.rsqrt(weight.pow(2).sum([2, 3, 4]) + self.eps)
139
- weight = weight * demod.view(b, self.out_channels, 1, 1, 1)
140
-
141
- weight = weight.view(b * self.out_channels, c, self.kernel_size, self.kernel_size)
142
-
143
- if self.sample_mode == 'upsample':
144
- x = F.interpolate(x, scale_factor=2, mode=self.interpolation_mode, align_corners=self.align_corners)
145
- elif self.sample_mode == 'downsample':
146
- x = F.interpolate(x, scale_factor=0.5, mode=self.interpolation_mode, align_corners=self.align_corners)
147
-
148
- b, c, h, w = x.shape
149
- x = x.view(1, b * c, h, w)
150
- # weight: (b*c_out, c_in, k, k), groups=b
151
- out = F.conv2d(x, weight, padding=self.padding, groups=b)
152
- out = out.view(b, self.out_channels, *out.shape[2:4])
153
-
154
- return out
155
-
156
- def __repr__(self):
157
- return (f'{self.__class__.__name__}(in_channels={self.in_channels}, '
158
- f'out_channels={self.out_channels}, '
159
- f'kernel_size={self.kernel_size}, '
160
- f'demodulate={self.demodulate}, sample_mode={self.sample_mode})')
161
-
162
-
163
- class StyleConv(nn.Module):
164
- """Style conv.
165
-
166
- Args:
167
- in_channels (int): Channel number of the input.
168
- out_channels (int): Channel number of the output.
169
- kernel_size (int): Size of the convolving kernel.
170
- num_style_feat (int): Channel number of style features.
171
- demodulate (bool): Whether demodulate in the conv layer. Default: True.
172
- sample_mode (str | None): Indicating 'upsample', 'downsample' or None.
173
- Default: None.
174
- """
175
-
176
- def __init__(self,
177
- in_channels,
178
- out_channels,
179
- kernel_size,
180
- num_style_feat,
181
- demodulate=True,
182
- sample_mode=None,
183
- interpolation_mode='bilinear'):
184
- super(StyleConv, self).__init__()
185
- self.modulated_conv = ModulatedConv2d(
186
- in_channels,
187
- out_channels,
188
- kernel_size,
189
- num_style_feat,
190
- demodulate=demodulate,
191
- sample_mode=sample_mode,
192
- interpolation_mode=interpolation_mode)
193
- self.weight = nn.Parameter(torch.zeros(1)) # for noise injection
194
- self.activate = FusedLeakyReLU(out_channels)
195
-
196
- def forward(self, x, style, noise=None):
197
- # modulate
198
- out = self.modulated_conv(x, style)
199
- # noise injection
200
- if noise is None:
201
- b, _, h, w = out.shape
202
- noise = out.new_empty(b, 1, h, w).normal_()
203
- out = out + self.weight * noise
204
- # activation (with bias)
205
- out = self.activate(out)
206
- return out
207
-
208
-
209
- class ToRGB(nn.Module):
210
- """To RGB from features.
211
-
212
- Args:
213
- in_channels (int): Channel number of input.
214
- num_style_feat (int): Channel number of style features.
215
- upsample (bool): Whether to upsample. Default: True.
216
- """
217
-
218
- def __init__(self, in_channels, num_style_feat, upsample=True, interpolation_mode='bilinear'):
219
- super(ToRGB, self).__init__()
220
- self.upsample = upsample
221
- self.interpolation_mode = interpolation_mode
222
- if self.interpolation_mode == 'nearest':
223
- self.align_corners = None
224
- else:
225
- self.align_corners = False
226
- self.modulated_conv = ModulatedConv2d(
227
- in_channels,
228
- 3,
229
- kernel_size=1,
230
- num_style_feat=num_style_feat,
231
- demodulate=False,
232
- sample_mode=None,
233
- interpolation_mode=interpolation_mode)
234
- self.bias = nn.Parameter(torch.zeros(1, 3, 1, 1))
235
-
236
- def forward(self, x, style, skip=None):
237
- """Forward function.
238
-
239
- Args:
240
- x (Tensor): Feature tensor with shape (b, c, h, w).
241
- style (Tensor): Tensor with shape (b, num_style_feat).
242
- skip (Tensor): Base/skip tensor. Default: None.
243
-
244
- Returns:
245
- Tensor: RGB images.
246
- """
247
- out = self.modulated_conv(x, style)
248
- out = out + self.bias
249
- if skip is not None:
250
- if self.upsample:
251
- skip = F.interpolate(
252
- skip, scale_factor=2, mode=self.interpolation_mode, align_corners=self.align_corners)
253
- out = out + skip
254
- return out
255
-
256
-
257
- class ConstantInput(nn.Module):
258
- """Constant input.
259
-
260
- Args:
261
- num_channel (int): Channel number of constant input.
262
- size (int): Spatial size of constant input.
263
- """
264
-
265
- def __init__(self, num_channel, size):
266
- super(ConstantInput, self).__init__()
267
- self.weight = nn.Parameter(torch.randn(1, num_channel, size, size))
268
-
269
- def forward(self, batch):
270
- out = self.weight.repeat(batch, 1, 1, 1)
271
- return out
272
-
273
-
274
- @ARCH_REGISTRY.register(suffix='basicsr')
275
- class StyleGAN2GeneratorBilinear(nn.Module):
276
- """StyleGAN2 Generator.
277
-
278
- Args:
279
- out_size (int): The spatial size of outputs.
280
- num_style_feat (int): Channel number of style features. Default: 512.
281
- num_mlp (int): Layer number of MLP style layers. Default: 8.
282
- channel_multiplier (int): Channel multiplier for large networks of
283
- StyleGAN2. Default: 2.
284
- lr_mlp (float): Learning rate multiplier for mlp layers. Default: 0.01.
285
- narrow (float): Narrow ratio for channels. Default: 1.0.
286
- """
287
-
288
- def __init__(self,
289
- out_size,
290
- num_style_feat=512,
291
- num_mlp=8,
292
- channel_multiplier=2,
293
- lr_mlp=0.01,
294
- narrow=1,
295
- interpolation_mode='bilinear'):
296
- super(StyleGAN2GeneratorBilinear, self).__init__()
297
- # Style MLP layers
298
- self.num_style_feat = num_style_feat
299
- style_mlp_layers = [NormStyleCode()]
300
- for i in range(num_mlp):
301
- style_mlp_layers.append(
302
- EqualLinear(
303
- num_style_feat, num_style_feat, bias=True, bias_init_val=0, lr_mul=lr_mlp,
304
- activation='fused_lrelu'))
305
- self.style_mlp = nn.Sequential(*style_mlp_layers)
306
-
307
- channels = {
308
- '4': int(512 * narrow),
309
- '8': int(512 * narrow),
310
- '16': int(512 * narrow),
311
- '32': int(512 * narrow),
312
- '64': int(256 * channel_multiplier * narrow),
313
- '128': int(128 * channel_multiplier * narrow),
314
- '256': int(64 * channel_multiplier * narrow),
315
- '512': int(32 * channel_multiplier * narrow),
316
- '1024': int(16 * channel_multiplier * narrow)
317
- }
318
- self.channels = channels
319
-
320
- self.constant_input = ConstantInput(channels['4'], size=4)
321
- self.style_conv1 = StyleConv(
322
- channels['4'],
323
- channels['4'],
324
- kernel_size=3,
325
- num_style_feat=num_style_feat,
326
- demodulate=True,
327
- sample_mode=None,
328
- interpolation_mode=interpolation_mode)
329
- self.to_rgb1 = ToRGB(channels['4'], num_style_feat, upsample=False, interpolation_mode=interpolation_mode)
330
-
331
- self.log_size = int(math.log(out_size, 2))
332
- self.num_layers = (self.log_size - 2) * 2 + 1
333
- self.num_latent = self.log_size * 2 - 2
334
-
335
- self.style_convs = nn.ModuleList()
336
- self.to_rgbs = nn.ModuleList()
337
- self.noises = nn.Module()
338
-
339
- in_channels = channels['4']
340
- # noise
341
- for layer_idx in range(self.num_layers):
342
- resolution = 2**((layer_idx + 5) // 2)
343
- shape = [1, 1, resolution, resolution]
344
- self.noises.register_buffer(f'noise{layer_idx}', torch.randn(*shape))
345
- # style convs and to_rgbs
346
- for i in range(3, self.log_size + 1):
347
- out_channels = channels[f'{2**i}']
348
- self.style_convs.append(
349
- StyleConv(
350
- in_channels,
351
- out_channels,
352
- kernel_size=3,
353
- num_style_feat=num_style_feat,
354
- demodulate=True,
355
- sample_mode='upsample',
356
- interpolation_mode=interpolation_mode))
357
- self.style_convs.append(
358
- StyleConv(
359
- out_channels,
360
- out_channels,
361
- kernel_size=3,
362
- num_style_feat=num_style_feat,
363
- demodulate=True,
364
- sample_mode=None,
365
- interpolation_mode=interpolation_mode))
366
- self.to_rgbs.append(
367
- ToRGB(out_channels, num_style_feat, upsample=True, interpolation_mode=interpolation_mode))
368
- in_channels = out_channels
369
-
370
- def make_noise(self):
371
- """Make noise for noise injection."""
372
- device = self.constant_input.weight.device
373
- noises = [torch.randn(1, 1, 4, 4, device=device)]
374
-
375
- for i in range(3, self.log_size + 1):
376
- for _ in range(2):
377
- noises.append(torch.randn(1, 1, 2**i, 2**i, device=device))
378
-
379
- return noises
380
-
381
- def get_latent(self, x):
382
- return self.style_mlp(x)
383
-
384
- def mean_latent(self, num_latent):
385
- latent_in = torch.randn(num_latent, self.num_style_feat, device=self.constant_input.weight.device)
386
- latent = self.style_mlp(latent_in).mean(0, keepdim=True)
387
- return latent
388
-
389
- def forward(self,
390
- styles,
391
- input_is_latent=False,
392
- noise=None,
393
- randomize_noise=True,
394
- truncation=1,
395
- truncation_latent=None,
396
- inject_index=None,
397
- return_latents=False):
398
- """Forward function for StyleGAN2Generator.
399
-
400
- Args:
401
- styles (list[Tensor]): Sample codes of styles.
402
- input_is_latent (bool): Whether input is latent style.
403
- Default: False.
404
- noise (Tensor | None): Input noise or None. Default: None.
405
- randomize_noise (bool): Randomize noise, used when 'noise' is
406
- None. Default: True.
407
- truncation (float): Truncation ratio for the truncation trick. Default: 1.
408
- truncation_latent (Tensor | None): Mean latent code used as the truncation center. Default: None.
409
- inject_index (int | None): The injection index for mixing noise.
410
- Default: None.
411
- return_latents (bool): Whether to return style latents.
412
- Default: False.
413
- """
414
- # style codes -> latents with Style MLP layer
415
- if not input_is_latent:
416
- styles = [self.style_mlp(s) for s in styles]
417
- # noises
418
- if noise is None:
419
- if randomize_noise:
420
- noise = [None] * self.num_layers # for each style conv layer
421
- else: # use the stored noise
422
- noise = [getattr(self.noises, f'noise{i}') for i in range(self.num_layers)]
423
- # style truncation
424
- if truncation < 1:
425
- style_truncation = []
426
- for style in styles:
427
- style_truncation.append(truncation_latent + truncation * (style - truncation_latent))
428
- styles = style_truncation
429
- # get style latent with injection
430
- if len(styles) == 1:
431
- inject_index = self.num_latent
432
-
433
- if styles[0].ndim < 3:
434
- # repeat latent code for all the layers
435
- latent = styles[0].unsqueeze(1).repeat(1, inject_index, 1)
436
- else: # used for encoder with different latent code for each layer
437
- latent = styles[0]
438
- elif len(styles) == 2: # mixing noises
439
- if inject_index is None:
440
- inject_index = random.randint(1, self.num_latent - 1)
441
- latent1 = styles[0].unsqueeze(1).repeat(1, inject_index, 1)
442
- latent2 = styles[1].unsqueeze(1).repeat(1, self.num_latent - inject_index, 1)
443
- latent = torch.cat([latent1, latent2], 1)
444
-
445
- # main generation
446
- out = self.constant_input(latent.shape[0])
447
- out = self.style_conv1(out, latent[:, 0], noise=noise[0])
448
- skip = self.to_rgb1(out, latent[:, 1])
449
-
450
- i = 1
451
- for conv1, conv2, noise1, noise2, to_rgb in zip(self.style_convs[::2], self.style_convs[1::2], noise[1::2],
452
- noise[2::2], self.to_rgbs):
453
- out = conv1(out, latent[:, i], noise=noise1)
454
- out = conv2(out, latent[:, i + 1], noise=noise2)
455
- skip = to_rgb(out, latent[:, i + 2], skip)
456
- i += 2
457
-
458
- image = skip
459
-
460
- if return_latents:
461
- return image, latent
462
- else:
463
- return image, None
464
-
465
-
466
- class ScaledLeakyReLU(nn.Module):
467
- """Scaled LeakyReLU.
468
-
469
- Args:
470
- negative_slope (float): Negative slope. Default: 0.2.
471
- """
472
-
473
- def __init__(self, negative_slope=0.2):
474
- super(ScaledLeakyReLU, self).__init__()
475
- self.negative_slope = negative_slope
476
-
477
- def forward(self, x):
478
- out = F.leaky_relu(x, negative_slope=self.negative_slope)
479
- return out * math.sqrt(2)
480
-
481
-
482
- class EqualConv2d(nn.Module):
483
- """Equalized Linear as StyleGAN2.
484
-
485
- Args:
486
- in_channels (int): Channel number of the input.
487
- out_channels (int): Channel number of the output.
488
- kernel_size (int): Size of the convolving kernel.
489
- stride (int): Stride of the convolution. Default: 1
490
- padding (int): Zero-padding added to both sides of the input.
491
- Default: 0.
492
- bias (bool): If ``True``, adds a learnable bias to the output.
493
- Default: ``True``.
494
- bias_init_val (float): Bias initialized value. Default: 0.
495
- """
496
-
497
- def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, bias=True, bias_init_val=0):
498
- super(EqualConv2d, self).__init__()
499
- self.in_channels = in_channels
500
- self.out_channels = out_channels
501
- self.kernel_size = kernel_size
502
- self.stride = stride
503
- self.padding = padding
504
- self.scale = 1 / math.sqrt(in_channels * kernel_size**2)
505
-
506
- self.weight = nn.Parameter(torch.randn(out_channels, in_channels, kernel_size, kernel_size))
507
- if bias:
508
- self.bias = nn.Parameter(torch.zeros(out_channels).fill_(bias_init_val))
509
- else:
510
- self.register_parameter('bias', None)
511
-
512
- def forward(self, x):
513
- out = F.conv2d(
514
- x,
515
- self.weight * self.scale,
516
- bias=self.bias,
517
- stride=self.stride,
518
- padding=self.padding,
519
- )
520
-
521
- return out
522
-
523
- def __repr__(self):
524
- return (f'{self.__class__.__name__}(in_channels={self.in_channels}, '
525
- f'out_channels={self.out_channels}, '
526
- f'kernel_size={self.kernel_size},'
527
- f' stride={self.stride}, padding={self.padding}, '
528
- f'bias={self.bias is not None})')
529
-
530
-
531
- class ConvLayer(nn.Sequential):
532
- """Conv Layer used in StyleGAN2 Discriminator.
533
-
534
- Args:
535
- in_channels (int): Channel number of the input.
536
- out_channels (int): Channel number of the output.
537
- kernel_size (int): Kernel size.
538
- downsample (bool): Whether downsample by a factor of 2.
539
- Default: False.
540
- bias (bool): Whether with bias. Default: True.
541
- activate (bool): Whether to use activation. Default: True.
542
- """
543
-
544
- def __init__(self,
545
- in_channels,
546
- out_channels,
547
- kernel_size,
548
- downsample=False,
549
- bias=True,
550
- activate=True,
551
- interpolation_mode='bilinear'):
552
- layers = []
553
- self.interpolation_mode = interpolation_mode
554
- # downsample
555
- if downsample:
556
- if self.interpolation_mode == 'nearest':
557
- self.align_corners = None
558
- else:
559
- self.align_corners = False
560
-
561
- layers.append(
562
- torch.nn.Upsample(scale_factor=0.5, mode=interpolation_mode, align_corners=self.align_corners))
563
- stride = 1
564
- self.padding = kernel_size // 2
565
- # conv
566
- layers.append(
567
- EqualConv2d(
568
- in_channels, out_channels, kernel_size, stride=stride, padding=self.padding, bias=bias
569
- and not activate))
570
- # activation
571
- if activate:
572
- if bias:
573
- layers.append(FusedLeakyReLU(out_channels))
574
- else:
575
- layers.append(ScaledLeakyReLU(0.2))
576
-
577
- super(ConvLayer, self).__init__(*layers)
578
-
579
-
580
- class ResBlock(nn.Module):
581
- """Residual block used in StyleGAN2 Discriminator.
582
-
583
- Args:
584
- in_channels (int): Channel number of the input.
585
- out_channels (int): Channel number of the output.
586
- """
587
-
588
- def __init__(self, in_channels, out_channels, interpolation_mode='bilinear'):
589
- super(ResBlock, self).__init__()
590
-
591
- self.conv1 = ConvLayer(in_channels, in_channels, 3, bias=True, activate=True)
592
- self.conv2 = ConvLayer(
593
- in_channels,
594
- out_channels,
595
- 3,
596
- downsample=True,
597
- interpolation_mode=interpolation_mode,
598
- bias=True,
599
- activate=True)
600
- self.skip = ConvLayer(
601
- in_channels,
602
- out_channels,
603
- 1,
604
- downsample=True,
605
- interpolation_mode=interpolation_mode,
606
- bias=False,
607
- activate=False)
608
-
609
- def forward(self, x):
610
- out = self.conv1(x)
611
- out = self.conv2(out)
612
- skip = self.skip(x)
613
- out = (out + skip) / math.sqrt(2)
614
- return out
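
The block above closes `basicsr/archs/stylegan2_bilinear_arch.py`. The generator's depth is driven entirely by `out_size`: `__init__` derives the number of style-conv layers, the number of latent slots consumed per forward pass, and one noise buffer per layer. A small sketch of that bookkeeping follows; the `out_size` of 256 is an illustrative assumption, not a value taken from this repo.

```python
# Sketch of the layer/latent arithmetic from StyleGAN2GeneratorBilinear.__init__,
# evaluated for an illustrative out_size of 256 (assumption, not a repo setting).
import math

out_size = 256
log_size = int(math.log(out_size, 2))        # 8
num_layers = (log_size - 2) * 2 + 1          # 13 style-conv layers
num_latent = log_size * 2 - 2                # 14 latent slots per forward pass

# One registered noise buffer per layer, doubling in resolution every two layers.
noise_resolutions = [2 ** ((layer_idx + 5) // 2) for layer_idx in range(num_layers)]
print(num_layers, num_latent)
print(noise_resolutions)   # [4, 8, 8, 16, 16, 32, 32, 64, 64, 128, 128, 256, 256]
```

This matches the mixing logic in `forward`: with two style codes, `inject_index` splits those latent slots between them.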
basicsr/archs/swinir_arch.py DELETED
@@ -1,956 +0,0 @@
1
- # Modified from https://github.com/JingyunLiang/SwinIR
2
- # SwinIR: Image Restoration Using Swin Transformer, https://arxiv.org/abs/2108.10257
3
- # Originally Written by Ze Liu, Modified by Jingyun Liang.
4
-
5
- import math
6
- import torch
7
- import torch.nn as nn
8
- import torch.utils.checkpoint as checkpoint
9
-
10
- from basicsr.utils.registry import ARCH_REGISTRY
11
- from .arch_util import to_2tuple, trunc_normal_
12
-
13
-
14
- def drop_path(x, drop_prob: float = 0., training: bool = False):
15
- """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
16
-
17
- From: https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py
18
- """
19
- if drop_prob == 0. or not training:
20
- return x
21
- keep_prob = 1 - drop_prob
22
- shape = (x.shape[0], ) + (1, ) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
23
- random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
24
- random_tensor.floor_() # binarize
25
- output = x.div(keep_prob) * random_tensor
26
- return output
27
-
28
-
29
- class DropPath(nn.Module):
30
- """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
31
-
32
- From: https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py
33
- """
34
-
35
- def __init__(self, drop_prob=None):
36
- super(DropPath, self).__init__()
37
- self.drop_prob = drop_prob
38
-
39
- def forward(self, x):
40
- return drop_path(x, self.drop_prob, self.training)
41
-
42
-
43
- class Mlp(nn.Module):
44
-
45
- def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
46
- super().__init__()
47
- out_features = out_features or in_features
48
- hidden_features = hidden_features or in_features
49
- self.fc1 = nn.Linear(in_features, hidden_features)
50
- self.act = act_layer()
51
- self.fc2 = nn.Linear(hidden_features, out_features)
52
- self.drop = nn.Dropout(drop)
53
-
54
- def forward(self, x):
55
- x = self.fc1(x)
56
- x = self.act(x)
57
- x = self.drop(x)
58
- x = self.fc2(x)
59
- x = self.drop(x)
60
- return x
61
-
62
-
63
- def window_partition(x, window_size):
64
- """
65
- Args:
66
- x: (b, h, w, c)
67
- window_size (int): window size
68
-
69
- Returns:
70
- windows: (num_windows*b, window_size, window_size, c)
71
- """
72
- b, h, w, c = x.shape
73
- x = x.view(b, h // window_size, window_size, w // window_size, window_size, c)
74
- windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, c)
75
- return windows
76
-
77
-
78
- def window_reverse(windows, window_size, h, w):
79
- """
80
- Args:
81
- windows: (num_windows*b, window_size, window_size, c)
82
- window_size (int): Window size
83
- h (int): Height of image
84
- w (int): Width of image
85
-
86
- Returns:
87
- x: (b, h, w, c)
88
- """
89
- b = int(windows.shape[0] / (h * w / window_size / window_size))
90
- x = windows.view(b, h // window_size, w // window_size, window_size, window_size, -1)
91
- x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(b, h, w, -1)
92
- return x
93
-
94
-
95
- class WindowAttention(nn.Module):
96
- r""" Window based multi-head self attention (W-MSA) module with relative position bias.
97
- It supports both shifted and non-shifted windows.
98
-
99
- Args:
100
- dim (int): Number of input channels.
101
- window_size (tuple[int]): The height and width of the window.
102
- num_heads (int): Number of attention heads.
103
- qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
104
- qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
105
- attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
106
- proj_drop (float, optional): Dropout ratio of output. Default: 0.0
107
- """
108
-
109
- def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):
110
-
111
- super().__init__()
112
- self.dim = dim
113
- self.window_size = window_size # Wh, Ww
114
- self.num_heads = num_heads
115
- head_dim = dim // num_heads
116
- self.scale = qk_scale or head_dim**-0.5
117
-
118
- # define a parameter table of relative position bias
119
- self.relative_position_bias_table = nn.Parameter(
120
- torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH
121
-
122
- # get pair-wise relative position index for each token inside the window
123
- coords_h = torch.arange(self.window_size[0])
124
- coords_w = torch.arange(self.window_size[1])
125
- coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
126
- coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
127
- relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
128
- relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
129
- relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0
130
- relative_coords[:, :, 1] += self.window_size[1] - 1
131
- relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
132
- relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
133
- self.register_buffer('relative_position_index', relative_position_index)
134
-
135
- self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
136
- self.attn_drop = nn.Dropout(attn_drop)
137
- self.proj = nn.Linear(dim, dim)
138
-
139
- self.proj_drop = nn.Dropout(proj_drop)
140
-
141
- trunc_normal_(self.relative_position_bias_table, std=.02)
142
- self.softmax = nn.Softmax(dim=-1)
143
-
144
- def forward(self, x, mask=None):
145
- """
146
- Args:
147
- x: input features with shape of (num_windows*b, n, c)
148
- mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
149
- """
150
- b_, n, c = x.shape
151
- qkv = self.qkv(x).reshape(b_, n, 3, self.num_heads, c // self.num_heads).permute(2, 0, 3, 1, 4)
152
- q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
153
-
154
- q = q * self.scale
155
- attn = (q @ k.transpose(-2, -1))
156
-
157
- relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
158
- self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH
159
- relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
160
- attn = attn + relative_position_bias.unsqueeze(0)
161
-
162
- if mask is not None:
163
- nw = mask.shape[0]
164
- attn = attn.view(b_ // nw, nw, self.num_heads, n, n) + mask.unsqueeze(1).unsqueeze(0)
165
- attn = attn.view(-1, self.num_heads, n, n)
166
- attn = self.softmax(attn)
167
- else:
168
- attn = self.softmax(attn)
169
-
170
- attn = self.attn_drop(attn)
171
-
172
- x = (attn @ v).transpose(1, 2).reshape(b_, n, c)
173
- x = self.proj(x)
174
- x = self.proj_drop(x)
175
- return x
176
-
177
- def extra_repr(self) -> str:
178
- return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}'
179
-
180
- def flops(self, n):
181
- # calculate flops for 1 window with token length of n
182
- flops = 0
183
- # qkv = self.qkv(x)
184
- flops += n * self.dim * 3 * self.dim
185
- # attn = (q @ k.transpose(-2, -1))
186
- flops += self.num_heads * n * (self.dim // self.num_heads) * n
187
- # x = (attn @ v)
188
- flops += self.num_heads * n * n * (self.dim // self.num_heads)
189
- # x = self.proj(x)
190
- flops += n * self.dim * self.dim
191
- return flops
192
-
193
-
194
- class SwinTransformerBlock(nn.Module):
195
- r""" Swin Transformer Block.
196
-
197
- Args:
198
- dim (int): Number of input channels.
199
- input_resolution (tuple[int]): Input resolution.
200
- num_heads (int): Number of attention heads.
201
- window_size (int): Window size.
202
- shift_size (int): Shift size for SW-MSA.
203
- mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
204
- qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
205
- qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
206
- drop (float, optional): Dropout rate. Default: 0.0
207
- attn_drop (float, optional): Attention dropout rate. Default: 0.0
208
- drop_path (float, optional): Stochastic depth rate. Default: 0.0
209
- act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
210
- norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
211
- """
212
-
213
- def __init__(self,
214
- dim,
215
- input_resolution,
216
- num_heads,
217
- window_size=7,
218
- shift_size=0,
219
- mlp_ratio=4.,
220
- qkv_bias=True,
221
- qk_scale=None,
222
- drop=0.,
223
- attn_drop=0.,
224
- drop_path=0.,
225
- act_layer=nn.GELU,
226
- norm_layer=nn.LayerNorm):
227
- super().__init__()
228
- self.dim = dim
229
- self.input_resolution = input_resolution
230
- self.num_heads = num_heads
231
- self.window_size = window_size
232
- self.shift_size = shift_size
233
- self.mlp_ratio = mlp_ratio
234
- if min(self.input_resolution) <= self.window_size:
235
- # if window size is larger than input resolution, we don't partition windows
236
- self.shift_size = 0
237
- self.window_size = min(self.input_resolution)
238
- assert 0 <= self.shift_size < self.window_size, 'shift_size must in 0-window_size'
239
-
240
- self.norm1 = norm_layer(dim)
241
- self.attn = WindowAttention(
242
- dim,
243
- window_size=to_2tuple(self.window_size),
244
- num_heads=num_heads,
245
- qkv_bias=qkv_bias,
246
- qk_scale=qk_scale,
247
- attn_drop=attn_drop,
248
- proj_drop=drop)
249
-
250
- self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
251
- self.norm2 = norm_layer(dim)
252
- mlp_hidden_dim = int(dim * mlp_ratio)
253
- self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
254
-
255
- if self.shift_size > 0:
256
- attn_mask = self.calculate_mask(self.input_resolution)
257
- else:
258
- attn_mask = None
259
-
260
- self.register_buffer('attn_mask', attn_mask)
261
-
262
- def calculate_mask(self, x_size):
263
- # calculate attention mask for SW-MSA
264
- h, w = x_size
265
- img_mask = torch.zeros((1, h, w, 1)) # 1 h w 1
266
- h_slices = (slice(0, -self.window_size), slice(-self.window_size,
267
- -self.shift_size), slice(-self.shift_size, None))
268
- w_slices = (slice(0, -self.window_size), slice(-self.window_size,
269
- -self.shift_size), slice(-self.shift_size, None))
270
- cnt = 0
271
- for h in h_slices:
272
- for w in w_slices:
273
- img_mask[:, h, w, :] = cnt
274
- cnt += 1
275
-
276
- mask_windows = window_partition(img_mask, self.window_size) # nw, window_size, window_size, 1
277
- mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
278
- attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
279
- attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
280
-
281
- return attn_mask
282
-
283
- def forward(self, x, x_size):
284
- h, w = x_size
285
- b, _, c = x.shape
286
- # assert seq_len == h * w, "input feature has wrong size"
287
-
288
- shortcut = x
289
- x = self.norm1(x)
290
- x = x.view(b, h, w, c)
291
-
292
- # cyclic shift
293
- if self.shift_size > 0:
294
- shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
295
- else:
296
- shifted_x = x
297
-
298
- # partition windows
299
- x_windows = window_partition(shifted_x, self.window_size) # nw*b, window_size, window_size, c
300
- x_windows = x_windows.view(-1, self.window_size * self.window_size, c) # nw*b, window_size*window_size, c
301
-
302
- # W-MSA/SW-MSA (to be compatible for testing on images whose shapes are the multiple of window size
303
- if self.input_resolution == x_size:
304
- attn_windows = self.attn(x_windows, mask=self.attn_mask) # nw*b, window_size*window_size, c
305
- else:
306
- attn_windows = self.attn(x_windows, mask=self.calculate_mask(x_size).to(x.device))
307
-
308
- # merge windows
309
- attn_windows = attn_windows.view(-1, self.window_size, self.window_size, c)
310
- shifted_x = window_reverse(attn_windows, self.window_size, h, w) # b h' w' c
311
-
312
- # reverse cyclic shift
313
- if self.shift_size > 0:
314
- x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
315
- else:
316
- x = shifted_x
317
- x = x.view(b, h * w, c)
318
-
319
- # FFN
320
- x = shortcut + self.drop_path(x)
321
- x = x + self.drop_path(self.mlp(self.norm2(x)))
322
-
323
- return x
324
-
325
- def extra_repr(self) -> str:
326
- return (f'dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, '
327
- f'window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}')
328
-
329
- def flops(self):
330
- flops = 0
331
- h, w = self.input_resolution
332
- # norm1
333
- flops += self.dim * h * w
334
- # W-MSA/SW-MSA
335
- nw = h * w / self.window_size / self.window_size
336
- flops += nw * self.attn.flops(self.window_size * self.window_size)
337
- # mlp
338
- flops += 2 * h * w * self.dim * self.dim * self.mlp_ratio
339
- # norm2
340
- flops += self.dim * h * w
341
- return flops
342
-
343
-
344
- class PatchMerging(nn.Module):
345
- r""" Patch Merging Layer.
346
-
347
- Args:
348
- input_resolution (tuple[int]): Resolution of input feature.
349
- dim (int): Number of input channels.
350
- norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
351
- """
352
-
353
- def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm):
354
- super().__init__()
355
- self.input_resolution = input_resolution
356
- self.dim = dim
357
- self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
358
- self.norm = norm_layer(4 * dim)
359
-
360
- def forward(self, x):
361
- """
362
- x: b, h*w, c
363
- """
364
- h, w = self.input_resolution
365
- b, seq_len, c = x.shape
366
- assert seq_len == h * w, 'input feature has wrong size'
367
- assert h % 2 == 0 and w % 2 == 0, f'x size ({h}*{w}) are not even.'
368
-
369
- x = x.view(b, h, w, c)
370
-
371
- x0 = x[:, 0::2, 0::2, :] # b h/2 w/2 c
372
- x1 = x[:, 1::2, 0::2, :] # b h/2 w/2 c
373
- x2 = x[:, 0::2, 1::2, :] # b h/2 w/2 c
374
- x3 = x[:, 1::2, 1::2, :] # b h/2 w/2 c
375
- x = torch.cat([x0, x1, x2, x3], -1) # b h/2 w/2 4*c
376
- x = x.view(b, -1, 4 * c) # b h/2*w/2 4*c
377
-
378
- x = self.norm(x)
379
- x = self.reduction(x)
380
-
381
- return x
382
-
383
- def extra_repr(self) -> str:
384
- return f'input_resolution={self.input_resolution}, dim={self.dim}'
385
-
386
- def flops(self):
387
- h, w = self.input_resolution
388
- flops = h * w * self.dim
389
- flops += (h // 2) * (w // 2) * 4 * self.dim * 2 * self.dim
390
- return flops
391
-
392
-
393
- class BasicLayer(nn.Module):
394
- """ A basic Swin Transformer layer for one stage.
395
-
396
- Args:
397
- dim (int): Number of input channels.
398
- input_resolution (tuple[int]): Input resolution.
399
- depth (int): Number of blocks.
400
- num_heads (int): Number of attention heads.
401
- window_size (int): Local window size.
402
- mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
403
- qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
404
- qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
405
- drop (float, optional): Dropout rate. Default: 0.0
406
- attn_drop (float, optional): Attention dropout rate. Default: 0.0
407
- drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
408
- norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
409
- downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
410
- use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
411
- """
412
-
413
- def __init__(self,
414
- dim,
415
- input_resolution,
416
- depth,
417
- num_heads,
418
- window_size,
419
- mlp_ratio=4.,
420
- qkv_bias=True,
421
- qk_scale=None,
422
- drop=0.,
423
- attn_drop=0.,
424
- drop_path=0.,
425
- norm_layer=nn.LayerNorm,
426
- downsample=None,
427
- use_checkpoint=False):
428
-
429
- super().__init__()
430
- self.dim = dim
431
- self.input_resolution = input_resolution
432
- self.depth = depth
433
- self.use_checkpoint = use_checkpoint
434
-
435
- # build blocks
436
- self.blocks = nn.ModuleList([
437
- SwinTransformerBlock(
438
- dim=dim,
439
- input_resolution=input_resolution,
440
- num_heads=num_heads,
441
- window_size=window_size,
442
- shift_size=0 if (i % 2 == 0) else window_size // 2,
443
- mlp_ratio=mlp_ratio,
444
- qkv_bias=qkv_bias,
445
- qk_scale=qk_scale,
446
- drop=drop,
447
- attn_drop=attn_drop,
448
- drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
449
- norm_layer=norm_layer) for i in range(depth)
450
- ])
451
-
452
- # patch merging layer
453
- if downsample is not None:
454
- self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer)
455
- else:
456
- self.downsample = None
457
-
458
- def forward(self, x, x_size):
459
- for blk in self.blocks:
460
- if self.use_checkpoint:
461
- x = checkpoint.checkpoint(blk, x)
462
- else:
463
- x = blk(x, x_size)
464
- if self.downsample is not None:
465
- x = self.downsample(x)
466
- return x
467
-
468
- def extra_repr(self) -> str:
469
- return f'dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}'
470
-
471
- def flops(self):
472
- flops = 0
473
- for blk in self.blocks:
474
- flops += blk.flops()
475
- if self.downsample is not None:
476
- flops += self.downsample.flops()
477
- return flops
478
-
479
-
480
- class RSTB(nn.Module):
481
- """Residual Swin Transformer Block (RSTB).
482
-
483
- Args:
484
- dim (int): Number of input channels.
485
- input_resolution (tuple[int]): Input resolution.
486
- depth (int): Number of blocks.
487
- num_heads (int): Number of attention heads.
488
- window_size (int): Local window size.
489
- mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
490
- qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
491
- qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
492
- drop (float, optional): Dropout rate. Default: 0.0
493
- attn_drop (float, optional): Attention dropout rate. Default: 0.0
494
- drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
495
- norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
496
- downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
497
- use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
498
- img_size: Input image size.
499
- patch_size: Patch size.
500
- resi_connection: The convolutional block before residual connection.
501
- """
502
-
503
- def __init__(self,
504
- dim,
505
- input_resolution,
506
- depth,
507
- num_heads,
508
- window_size,
509
- mlp_ratio=4.,
510
- qkv_bias=True,
511
- qk_scale=None,
512
- drop=0.,
513
- attn_drop=0.,
514
- drop_path=0.,
515
- norm_layer=nn.LayerNorm,
516
- downsample=None,
517
- use_checkpoint=False,
518
- img_size=224,
519
- patch_size=4,
520
- resi_connection='1conv'):
521
- super(RSTB, self).__init__()
522
-
523
- self.dim = dim
524
- self.input_resolution = input_resolution
525
-
526
- self.residual_group = BasicLayer(
527
- dim=dim,
528
- input_resolution=input_resolution,
529
- depth=depth,
530
- num_heads=num_heads,
531
- window_size=window_size,
532
- mlp_ratio=mlp_ratio,
533
- qkv_bias=qkv_bias,
534
- qk_scale=qk_scale,
535
- drop=drop,
536
- attn_drop=attn_drop,
537
- drop_path=drop_path,
538
- norm_layer=norm_layer,
539
- downsample=downsample,
540
- use_checkpoint=use_checkpoint)
541
-
542
- if resi_connection == '1conv':
543
- self.conv = nn.Conv2d(dim, dim, 3, 1, 1)
544
- elif resi_connection == '3conv':
545
- # to save parameters and memory
546
- self.conv = nn.Sequential(
547
- nn.Conv2d(dim, dim // 4, 3, 1, 1), nn.LeakyReLU(negative_slope=0.2, inplace=True),
548
- nn.Conv2d(dim // 4, dim // 4, 1, 1, 0), nn.LeakyReLU(negative_slope=0.2, inplace=True),
549
- nn.Conv2d(dim // 4, dim, 3, 1, 1))
550
-
551
- self.patch_embed = PatchEmbed(
552
- img_size=img_size, patch_size=patch_size, in_chans=0, embed_dim=dim, norm_layer=None)
553
-
554
- self.patch_unembed = PatchUnEmbed(
555
- img_size=img_size, patch_size=patch_size, in_chans=0, embed_dim=dim, norm_layer=None)
556
-
557
- def forward(self, x, x_size):
558
- return self.patch_embed(self.conv(self.patch_unembed(self.residual_group(x, x_size), x_size))) + x
559
-
560
- def flops(self):
561
- flops = 0
562
- flops += self.residual_group.flops()
563
- h, w = self.input_resolution
564
- flops += h * w * self.dim * self.dim * 9
565
- flops += self.patch_embed.flops()
566
- flops += self.patch_unembed.flops()
567
-
568
- return flops
569
-
570
-
571
- class PatchEmbed(nn.Module):
572
- r""" Image to Patch Embedding
573
-
574
- Args:
575
- img_size (int): Image size. Default: 224.
576
- patch_size (int): Patch token size. Default: 4.
577
- in_chans (int): Number of input image channels. Default: 3.
578
- embed_dim (int): Number of linear projection output channels. Default: 96.
579
- norm_layer (nn.Module, optional): Normalization layer. Default: None
580
- """
581
-
582
- def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
583
- super().__init__()
584
- img_size = to_2tuple(img_size)
585
- patch_size = to_2tuple(patch_size)
586
- patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
587
- self.img_size = img_size
588
- self.patch_size = patch_size
589
- self.patches_resolution = patches_resolution
590
- self.num_patches = patches_resolution[0] * patches_resolution[1]
591
-
592
- self.in_chans = in_chans
593
- self.embed_dim = embed_dim
594
-
595
- if norm_layer is not None:
596
- self.norm = norm_layer(embed_dim)
597
- else:
598
- self.norm = None
599
-
600
- def forward(self, x):
601
- x = x.flatten(2).transpose(1, 2) # b Ph*Pw c
602
- if self.norm is not None:
603
- x = self.norm(x)
604
- return x
605
-
606
- def flops(self):
607
- flops = 0
608
- h, w = self.img_size
609
- if self.norm is not None:
610
- flops += h * w * self.embed_dim
611
- return flops
612
-
613
-
614
- class PatchUnEmbed(nn.Module):
615
- r""" Image to Patch Unembedding
616
-
617
- Args:
618
- img_size (int): Image size. Default: 224.
619
- patch_size (int): Patch token size. Default: 4.
620
- in_chans (int): Number of input image channels. Default: 3.
621
- embed_dim (int): Number of linear projection output channels. Default: 96.
622
- norm_layer (nn.Module, optional): Normalization layer. Default: None
623
- """
624
-
625
- def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
626
- super().__init__()
627
- img_size = to_2tuple(img_size)
628
- patch_size = to_2tuple(patch_size)
629
- patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
630
- self.img_size = img_size
631
- self.patch_size = patch_size
632
- self.patches_resolution = patches_resolution
633
- self.num_patches = patches_resolution[0] * patches_resolution[1]
634
-
635
- self.in_chans = in_chans
636
- self.embed_dim = embed_dim
637
-
638
- def forward(self, x, x_size):
639
- x = x.transpose(1, 2).view(x.shape[0], self.embed_dim, x_size[0], x_size[1]) # b Ph*Pw c
640
- return x
641
-
642
- def flops(self):
643
- flops = 0
644
- return flops
645
-
646
-
647
- class Upsample(nn.Sequential):
648
- """Upsample module.
649
-
650
- Args:
651
- scale (int): Scale factor. Supported scales: 2^n and 3.
652
- num_feat (int): Channel number of intermediate features.
653
- """
654
-
655
- def __init__(self, scale, num_feat):
656
- m = []
657
- if (scale & (scale - 1)) == 0: # scale = 2^n
658
- for _ in range(int(math.log(scale, 2))):
659
- m.append(nn.Conv2d(num_feat, 4 * num_feat, 3, 1, 1))
660
- m.append(nn.PixelShuffle(2))
661
- elif scale == 3:
662
- m.append(nn.Conv2d(num_feat, 9 * num_feat, 3, 1, 1))
663
- m.append(nn.PixelShuffle(3))
664
- else:
665
- raise ValueError(f'scale {scale} is not supported. Supported scales: 2^n and 3.')
666
- super(Upsample, self).__init__(*m)
667
-
668
-
669
- class UpsampleOneStep(nn.Sequential):
670
- """UpsampleOneStep module (the difference with Upsample is that it always only has 1conv + 1pixelshuffle)
671
- Used in lightweight SR to save parameters.
672
-
673
- Args:
674
- scale (int): Scale factor. Supported scales: 2^n and 3.
675
- num_feat (int): Channel number of intermediate features.
676
-
677
- """
678
-
679
- def __init__(self, scale, num_feat, num_out_ch, input_resolution=None):
680
- self.num_feat = num_feat
681
- self.input_resolution = input_resolution
682
- m = []
683
- m.append(nn.Conv2d(num_feat, (scale**2) * num_out_ch, 3, 1, 1))
684
- m.append(nn.PixelShuffle(scale))
685
- super(UpsampleOneStep, self).__init__(*m)
686
-
687
- def flops(self):
688
- h, w = self.input_resolution
689
- flops = h * w * self.num_feat * 3 * 9
690
- return flops
691
-
692
-
693
- @ARCH_REGISTRY.register()
694
- class SwinIR(nn.Module):
695
- r""" SwinIR
696
- A PyTorch implementation of `SwinIR: Image Restoration Using Swin Transformer`, based on the Swin Transformer.
697
-
698
- Args:
699
- img_size (int | tuple(int)): Input image size. Default 64
700
- patch_size (int | tuple(int)): Patch size. Default: 1
701
- in_chans (int): Number of input image channels. Default: 3
702
- embed_dim (int): Patch embedding dimension. Default: 96
703
- depths (tuple(int)): Depth of each Swin Transformer layer.
704
- num_heads (tuple(int)): Number of attention heads in different layers.
705
- window_size (int): Window size. Default: 7
706
- mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
707
- qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
708
- qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None
709
- drop_rate (float): Dropout rate. Default: 0
710
- attn_drop_rate (float): Attention dropout rate. Default: 0
711
- drop_path_rate (float): Stochastic depth rate. Default: 0.1
712
- norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
713
- ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
714
- patch_norm (bool): If True, add normalization after patch embedding. Default: True
715
- use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False
716
- upscale: Upscale factor. 2/3/4/8 for image SR, 1 for denoising and compression artifact reduction
717
- img_range: Image range. 1. or 255.
718
- upsampler: The reconstruction module. 'pixelshuffle'/'pixelshuffledirect'/'nearest+conv'/None
719
- resi_connection: The convolutional block before residual connection. '1conv'/'3conv'
720
- """
721
-
722
- def __init__(self,
723
- img_size=64,
724
- patch_size=1,
725
- in_chans=3,
726
- embed_dim=96,
727
- depths=(6, 6, 6, 6),
728
- num_heads=(6, 6, 6, 6),
729
- window_size=7,
730
- mlp_ratio=4.,
731
- qkv_bias=True,
732
- qk_scale=None,
733
- drop_rate=0.,
734
- attn_drop_rate=0.,
735
- drop_path_rate=0.1,
736
- norm_layer=nn.LayerNorm,
737
- ape=False,
738
- patch_norm=True,
739
- use_checkpoint=False,
740
- upscale=2,
741
- img_range=1.,
742
- upsampler='',
743
- resi_connection='1conv',
744
- **kwargs):
745
- super(SwinIR, self).__init__()
746
- num_in_ch = in_chans
747
- num_out_ch = in_chans
748
- num_feat = 64
749
- self.img_range = img_range
750
- if in_chans == 3:
751
- rgb_mean = (0.4488, 0.4371, 0.4040)
752
- self.mean = torch.Tensor(rgb_mean).view(1, 3, 1, 1)
753
- else:
754
- self.mean = torch.zeros(1, 1, 1, 1)
755
- self.upscale = upscale
756
- self.upsampler = upsampler
757
-
758
- # ------------------------- 1, shallow feature extraction ------------------------- #
759
- self.conv_first = nn.Conv2d(num_in_ch, embed_dim, 3, 1, 1)
760
-
761
- # ------------------------- 2, deep feature extraction ------------------------- #
762
- self.num_layers = len(depths)
763
- self.embed_dim = embed_dim
764
- self.ape = ape
765
- self.patch_norm = patch_norm
766
- self.num_features = embed_dim
767
- self.mlp_ratio = mlp_ratio
768
-
769
- # split image into non-overlapping patches
770
- self.patch_embed = PatchEmbed(
771
- img_size=img_size,
772
- patch_size=patch_size,
773
- in_chans=embed_dim,
774
- embed_dim=embed_dim,
775
- norm_layer=norm_layer if self.patch_norm else None)
776
- num_patches = self.patch_embed.num_patches
777
- patches_resolution = self.patch_embed.patches_resolution
778
- self.patches_resolution = patches_resolution
779
-
780
- # merge non-overlapping patches into image
781
- self.patch_unembed = PatchUnEmbed(
782
- img_size=img_size,
783
- patch_size=patch_size,
784
- in_chans=embed_dim,
785
- embed_dim=embed_dim,
786
- norm_layer=norm_layer if self.patch_norm else None)
787
-
788
- # absolute position embedding
789
- if self.ape:
790
- self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
791
- trunc_normal_(self.absolute_pos_embed, std=.02)
792
-
793
- self.pos_drop = nn.Dropout(p=drop_rate)
794
-
795
- # stochastic depth
796
- dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule
797
-
798
- # build Residual Swin Transformer blocks (RSTB)
799
- self.layers = nn.ModuleList()
800
- for i_layer in range(self.num_layers):
801
- layer = RSTB(
802
- dim=embed_dim,
803
- input_resolution=(patches_resolution[0], patches_resolution[1]),
804
- depth=depths[i_layer],
805
- num_heads=num_heads[i_layer],
806
- window_size=window_size,
807
- mlp_ratio=self.mlp_ratio,
808
- qkv_bias=qkv_bias,
809
- qk_scale=qk_scale,
810
- drop=drop_rate,
811
- attn_drop=attn_drop_rate,
812
- drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], # no impact on SR results
813
- norm_layer=norm_layer,
814
- downsample=None,
815
- use_checkpoint=use_checkpoint,
816
- img_size=img_size,
817
- patch_size=patch_size,
818
- resi_connection=resi_connection)
819
- self.layers.append(layer)
820
- self.norm = norm_layer(self.num_features)
821
-
822
- # build the last conv layer in deep feature extraction
823
- if resi_connection == '1conv':
824
- self.conv_after_body = nn.Conv2d(embed_dim, embed_dim, 3, 1, 1)
825
- elif resi_connection == '3conv':
826
- # to save parameters and memory
827
- self.conv_after_body = nn.Sequential(
828
- nn.Conv2d(embed_dim, embed_dim // 4, 3, 1, 1), nn.LeakyReLU(negative_slope=0.2, inplace=True),
829
- nn.Conv2d(embed_dim // 4, embed_dim // 4, 1, 1, 0), nn.LeakyReLU(negative_slope=0.2, inplace=True),
830
- nn.Conv2d(embed_dim // 4, embed_dim, 3, 1, 1))
831
-
832
- # ------------------------- 3, high quality image reconstruction ------------------------- #
833
- if self.upsampler == 'pixelshuffle':
834
- # for classical SR
835
- self.conv_before_upsample = nn.Sequential(
836
- nn.Conv2d(embed_dim, num_feat, 3, 1, 1), nn.LeakyReLU(inplace=True))
837
- self.upsample = Upsample(upscale, num_feat)
838
- self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
839
- elif self.upsampler == 'pixelshuffledirect':
840
- # for lightweight SR (to save parameters)
841
- self.upsample = UpsampleOneStep(upscale, embed_dim, num_out_ch,
842
- (patches_resolution[0], patches_resolution[1]))
843
- elif self.upsampler == 'nearest+conv':
844
- # for real-world SR (less artifacts)
845
- assert self.upscale == 4, 'only support x4 now.'
846
- self.conv_before_upsample = nn.Sequential(
847
- nn.Conv2d(embed_dim, num_feat, 3, 1, 1), nn.LeakyReLU(inplace=True))
848
- self.conv_up1 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
849
- self.conv_up2 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
850
- self.conv_hr = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
851
- self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
852
- self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)
853
- else:
854
- # for image denoising and JPEG compression artifact reduction
855
- self.conv_last = nn.Conv2d(embed_dim, num_out_ch, 3, 1, 1)
856
-
857
- self.apply(self._init_weights)
858
-
859
- def _init_weights(self, m):
860
- if isinstance(m, nn.Linear):
861
- trunc_normal_(m.weight, std=.02)
862
- if isinstance(m, nn.Linear) and m.bias is not None:
863
- nn.init.constant_(m.bias, 0)
864
- elif isinstance(m, nn.LayerNorm):
865
- nn.init.constant_(m.bias, 0)
866
- nn.init.constant_(m.weight, 1.0)
867
-
868
- @torch.jit.ignore
869
- def no_weight_decay(self):
870
- return {'absolute_pos_embed'}
871
-
872
- @torch.jit.ignore
873
- def no_weight_decay_keywords(self):
874
- return {'relative_position_bias_table'}
875
-
876
- def forward_features(self, x):
877
- x_size = (x.shape[2], x.shape[3])
878
- x = self.patch_embed(x)
879
- if self.ape:
880
- x = x + self.absolute_pos_embed
881
- x = self.pos_drop(x)
882
-
883
- for layer in self.layers:
884
- x = layer(x, x_size)
885
-
886
- x = self.norm(x) # b seq_len c
887
- x = self.patch_unembed(x, x_size)
888
-
889
- return x
890
-
891
- def forward(self, x):
892
- self.mean = self.mean.type_as(x)
893
- x = (x - self.mean) * self.img_range
894
-
895
- if self.upsampler == 'pixelshuffle':
896
- # for classical SR
897
- x = self.conv_first(x)
898
- x = self.conv_after_body(self.forward_features(x)) + x
899
- x = self.conv_before_upsample(x)
900
- x = self.conv_last(self.upsample(x))
901
- elif self.upsampler == 'pixelshuffledirect':
902
- # for lightweight SR
903
- x = self.conv_first(x)
904
- x = self.conv_after_body(self.forward_features(x)) + x
905
- x = self.upsample(x)
906
- elif self.upsampler == 'nearest+conv':
907
- # for real-world SR
908
- x = self.conv_first(x)
909
- x = self.conv_after_body(self.forward_features(x)) + x
910
- x = self.conv_before_upsample(x)
911
- x = self.lrelu(self.conv_up1(torch.nn.functional.interpolate(x, scale_factor=2, mode='nearest')))
912
- x = self.lrelu(self.conv_up2(torch.nn.functional.interpolate(x, scale_factor=2, mode='nearest')))
913
- x = self.conv_last(self.lrelu(self.conv_hr(x)))
914
- else:
915
- # for image denoising and JPEG compression artifact reduction
916
- x_first = self.conv_first(x)
917
- res = self.conv_after_body(self.forward_features(x_first)) + x_first
918
- x = x + self.conv_last(res)
919
-
920
- x = x / self.img_range + self.mean
921
-
922
- return x
923
-
924
- def flops(self):
925
- flops = 0
926
- h, w = self.patches_resolution
927
- flops += h * w * 3 * self.embed_dim * 9
928
- flops += self.patch_embed.flops()
929
- for layer in self.layers:
930
- flops += layer.flops()
931
- flops += h * w * 3 * self.embed_dim * self.embed_dim
932
- flops += self.upsample.flops()
933
- return flops
934
-
935
-
936
- if __name__ == '__main__':
937
- upscale = 4
938
- window_size = 8
939
- height = (1024 // upscale // window_size + 1) * window_size
940
- width = (720 // upscale // window_size + 1) * window_size
941
- model = SwinIR(
942
- upscale=2,
943
- img_size=(height, width),
944
- window_size=window_size,
945
- img_range=1.,
946
- depths=[6, 6, 6, 6],
947
- embed_dim=60,
948
- num_heads=[6, 6, 6, 6],
949
- mlp_ratio=2,
950
- upsampler='pixelshuffledirect')
951
- print(model)
952
- print(height, width, model.flops() / 1e9)
953
-
954
- x = torch.randn((1, 3, height, width))
955
- x = model(x)
956
- print(x.shape)
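
The deleted `swinir_arch.py` ends with its original `__main__` smoke test (above). The reshaping that everything in the file relies on is the `window_partition` / `window_reverse` pair, which move between image-shaped tensors and batches of local windows for (shifted) window attention. The sketch below copies the two helpers (minus docstrings) and checks that they are exact inverses; the tensor sizes are illustrative.

```python
# Round-trip check for the window helpers copied from the deleted swinir_arch.py.
import torch


def window_partition(x, window_size):
    # (b, h, w, c) -> (num_windows * b, window_size, window_size, c)
    b, h, w, c = x.shape
    x = x.view(b, h // window_size, window_size, w // window_size, window_size, c)
    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, c)
    return windows


def window_reverse(windows, window_size, h, w):
    # (num_windows * b, window_size, window_size, c) -> (b, h, w, c)
    b = int(windows.shape[0] / (h * w / window_size / window_size))
    x = windows.view(b, h // window_size, w // window_size, window_size, window_size, -1)
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(b, h, w, -1)
    return x


x = torch.randn(2, 16, 16, 8)              # (b, h, w, c), h and w divisible by the window size
windows = window_partition(x, 8)           # 2 images * 4 windows each -> (8, 8, 8, 8)
assert torch.equal(window_reverse(windows, 8, 16, 16), x)
```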
basicsr/archs/tof_arch.py DELETED
@@ -1,172 +0,0 @@
1
- import torch
2
- from torch import nn as nn
3
- from torch.nn import functional as F
4
-
5
- from basicsr.utils.registry import ARCH_REGISTRY
6
- from .arch_util import flow_warp
7
-
8
-
9
- class BasicModule(nn.Module):
10
- """Basic module of SPyNet.
11
-
12
- Note that unlike the architecture in spynet_arch.py, the basic module
13
- here contains batch normalization.
14
- """
15
-
16
- def __init__(self):
17
- super(BasicModule, self).__init__()
18
- self.basic_module = nn.Sequential(
19
- nn.Conv2d(in_channels=8, out_channels=32, kernel_size=7, stride=1, padding=3, bias=False),
20
- nn.BatchNorm2d(32), nn.ReLU(inplace=True),
21
- nn.Conv2d(in_channels=32, out_channels=64, kernel_size=7, stride=1, padding=3, bias=False),
22
- nn.BatchNorm2d(64), nn.ReLU(inplace=True),
23
- nn.Conv2d(in_channels=64, out_channels=32, kernel_size=7, stride=1, padding=3, bias=False),
24
- nn.BatchNorm2d(32), nn.ReLU(inplace=True),
25
- nn.Conv2d(in_channels=32, out_channels=16, kernel_size=7, stride=1, padding=3, bias=False),
26
- nn.BatchNorm2d(16), nn.ReLU(inplace=True),
27
- nn.Conv2d(in_channels=16, out_channels=2, kernel_size=7, stride=1, padding=3))
28
-
29
- def forward(self, tensor_input):
30
- """
31
- Args:
32
- tensor_input (Tensor): Input tensor with shape (b, 8, h, w).
33
- 8 channels contain:
34
- [reference image (3), neighbor image (3), initial flow (2)].
35
-
36
- Returns:
37
- Tensor: Estimated flow with shape (b, 2, h, w)
38
- """
39
- return self.basic_module(tensor_input)
40
-
41
-
42
- class SPyNetTOF(nn.Module):
43
- """SPyNet architecture for TOF.
44
-
45
- Note that this implementation is specifically for TOFlow. Please use :file:`spynet_arch.py` for general use.
46
- They differ in the following aspects:
47
-
48
- 1. The basic modules here contain BatchNorm.
49
- 2. Normalization and denormalization are not done here, as they are done in TOFlow.
50
-
51
- ``Paper: Optical Flow Estimation using a Spatial Pyramid Network``
52
-
53
- Reference: https://github.com/Coldog2333/pytoflow
54
-
55
- Args:
56
- load_path (str): Path for pretrained SPyNet. Default: None.
57
- """
58
-
59
- def __init__(self, load_path=None):
60
- super(SPyNetTOF, self).__init__()
61
-
62
- self.basic_module = nn.ModuleList([BasicModule() for _ in range(4)])
63
- if load_path:
64
- self.load_state_dict(torch.load(load_path, map_location=lambda storage, loc: storage)['params'])
65
-
66
- def forward(self, ref, supp):
67
- """
68
- Args:
69
- ref (Tensor): Reference image with shape of (b, 3, h, w).
70
- supp: The supporting image to be warped: (b, 3, h, w).
71
-
72
- Returns:
73
- Tensor: Estimated optical flow: (b, 2, h, w).
74
- """
75
- num_batches, _, h, w = ref.size()
76
- ref = [ref]
77
- supp = [supp]
78
-
79
- # generate downsampled frames
80
- for _ in range(3):
81
- ref.insert(0, F.avg_pool2d(input=ref[0], kernel_size=2, stride=2, count_include_pad=False))
82
- supp.insert(0, F.avg_pool2d(input=supp[0], kernel_size=2, stride=2, count_include_pad=False))
83
-
84
- # flow computation
85
- flow = ref[0].new_zeros(num_batches, 2, h // 16, w // 16)
86
- for i in range(4):
87
- flow_up = F.interpolate(input=flow, scale_factor=2, mode='bilinear', align_corners=True) * 2.0
88
- flow = flow_up + self.basic_module[i](
89
- torch.cat([ref[i], flow_warp(supp[i], flow_up.permute(0, 2, 3, 1)), flow_up], 1))
90
- return flow
91
-
92
-
93
- @ARCH_REGISTRY.register()
94
- class TOFlow(nn.Module):
95
- """PyTorch implementation of TOFlow.
96
-
97
- In TOFlow, the LR frames are pre-upsampled and have the same size as the GT frames.
98
-
99
- ``Paper: Video Enhancement with Task-Oriented Flow``
100
-
101
- Reference: https://github.com/anchen1011/toflow
102
-
103
- Reference: https://github.com/Coldog2333/pytoflow
104
-
105
- Args:
106
- adapt_official_weights (bool): Whether to adapt the weights translated
107
- from the official implementation. Set to false if you want to
108
- train from scratch. Default: False
109
- """
110
-
111
- def __init__(self, adapt_official_weights=False):
112
- super(TOFlow, self).__init__()
113
- self.adapt_official_weights = adapt_official_weights
114
- self.ref_idx = 0 if adapt_official_weights else 3
115
-
116
- self.register_buffer('mean', torch.Tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1))
117
- self.register_buffer('std', torch.Tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1))
118
-
119
- # flow estimation module
120
- self.spynet = SPyNetTOF()
121
-
122
- # reconstruction module
123
- self.conv_1 = nn.Conv2d(3 * 7, 64, 9, 1, 4)
124
- self.conv_2 = nn.Conv2d(64, 64, 9, 1, 4)
125
- self.conv_3 = nn.Conv2d(64, 64, 1)
126
- self.conv_4 = nn.Conv2d(64, 3, 1)
127
-
128
- # activation function
129
- self.relu = nn.ReLU(inplace=True)
130
-
131
- def normalize(self, img):
132
- return (img - self.mean) / self.std
133
-
134
- def denormalize(self, img):
135
- return img * self.std + self.mean
136
-
137
- def forward(self, lrs):
138
- """
139
- Args:
140
- lrs: Input lr frames: (b, 7, 3, h, w).
141
-
142
- Returns:
143
- Tensor: SR frame: (b, 3, h, w).
144
- """
145
- # In the official implementation, the 0-th frame is the reference frame
146
- if self.adapt_official_weights:
147
- lrs = lrs[:, [3, 0, 1, 2, 4, 5, 6], :, :, :]
148
-
149
- num_batches, num_lrs, _, h, w = lrs.size()
150
-
151
- lrs = self.normalize(lrs.view(-1, 3, h, w))
152
- lrs = lrs.view(num_batches, num_lrs, 3, h, w)
153
-
154
- lr_ref = lrs[:, self.ref_idx, :, :, :]
155
- lr_aligned = []
156
- for i in range(7): # 7 frames
157
- if i == self.ref_idx:
158
- lr_aligned.append(lr_ref)
159
- else:
160
- lr_supp = lrs[:, i, :, :, :]
161
- flow = self.spynet(lr_ref, lr_supp)
162
- lr_aligned.append(flow_warp(lr_supp, flow.permute(0, 2, 3, 1)))
163
-
164
- # reconstruction
165
- hr = torch.stack(lr_aligned, dim=1)
166
- hr = hr.view(num_batches, -1, h, w)
167
- hr = self.relu(self.conv_1(hr))
168
- hr = self.relu(self.conv_2(hr))
169
- hr = self.relu(self.conv_3(hr))
170
- hr = self.conv_4(hr) + lr_ref
171
-
172
- return self.denormalize(hr)
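
`tof_arch.py` (removed above) chains a BatchNorm variant of SPyNet with a small fusion network: flows are estimated coarse-to-fine over a four-level pyramid, the six neighbouring frames are warped onto the reference frame, and four conv layers reconstruct the output. A hedged sketch of a dummy forward pass follows; it assumes an environment where `basicsr` is still importable with this file present (e.g. the upstream pip package, which ships the same module), and the frame size of 64 is chosen only so the pyramid divides evenly.

```python
# Dummy forward pass through TOFlow (assumes basicsr with tof_arch.py is importable).
import torch
from basicsr.archs.tof_arch import TOFlow

model = TOFlow(adapt_official_weights=False).eval()

# 7 pre-upsampled LR frames per sample; h and w must be divisible by 16
# so the 4-level SPyNet pyramid produces matching shapes.
lrs = torch.rand(1, 7, 3, 64, 64)
with torch.no_grad():
    sr = model(lrs)
print(sr.shape)   # torch.Size([1, 3, 64, 64])
```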
basicsr/archs/vgg_arch.py DELETED
@@ -1,161 +0,0 @@
1
- import os
2
- import torch
3
- from collections import OrderedDict
4
- from torch import nn as nn
5
- from torchvision.models import vgg as vgg
6
-
7
- from basicsr.utils.registry import ARCH_REGISTRY
8
-
9
- VGG_PRETRAIN_PATH = 'experiments/pretrained_models/vgg19-dcbb9e9d.pth'
10
- NAMES = {
11
- 'vgg11': [
12
- 'conv1_1', 'relu1_1', 'pool1', 'conv2_1', 'relu2_1', 'pool2', 'conv3_1', 'relu3_1', 'conv3_2', 'relu3_2',
13
- 'pool3', 'conv4_1', 'relu4_1', 'conv4_2', 'relu4_2', 'pool4', 'conv5_1', 'relu5_1', 'conv5_2', 'relu5_2',
14
- 'pool5'
15
- ],
16
- 'vgg13': [
17
- 'conv1_1', 'relu1_1', 'conv1_2', 'relu1_2', 'pool1', 'conv2_1', 'relu2_1', 'conv2_2', 'relu2_2', 'pool2',
18
- 'conv3_1', 'relu3_1', 'conv3_2', 'relu3_2', 'pool3', 'conv4_1', 'relu4_1', 'conv4_2', 'relu4_2', 'pool4',
19
- 'conv5_1', 'relu5_1', 'conv5_2', 'relu5_2', 'pool5'
20
- ],
21
- 'vgg16': [
22
- 'conv1_1', 'relu1_1', 'conv1_2', 'relu1_2', 'pool1', 'conv2_1', 'relu2_1', 'conv2_2', 'relu2_2', 'pool2',
23
- 'conv3_1', 'relu3_1', 'conv3_2', 'relu3_2', 'conv3_3', 'relu3_3', 'pool3', 'conv4_1', 'relu4_1', 'conv4_2',
24
- 'relu4_2', 'conv4_3', 'relu4_3', 'pool4', 'conv5_1', 'relu5_1', 'conv5_2', 'relu5_2', 'conv5_3', 'relu5_3',
25
- 'pool5'
26
- ],
27
- 'vgg19': [
28
- 'conv1_1', 'relu1_1', 'conv1_2', 'relu1_2', 'pool1', 'conv2_1', 'relu2_1', 'conv2_2', 'relu2_2', 'pool2',
29
- 'conv3_1', 'relu3_1', 'conv3_2', 'relu3_2', 'conv3_3', 'relu3_3', 'conv3_4', 'relu3_4', 'pool3', 'conv4_1',
30
- 'relu4_1', 'conv4_2', 'relu4_2', 'conv4_3', 'relu4_3', 'conv4_4', 'relu4_4', 'pool4', 'conv5_1', 'relu5_1',
31
- 'conv5_2', 'relu5_2', 'conv5_3', 'relu5_3', 'conv5_4', 'relu5_4', 'pool5'
32
- ]
33
- }
34
-
35
-
36
- def insert_bn(names):
37
- """Insert bn layer after each conv.
38
-
39
- Args:
40
- names (list): The list of layer names.
41
-
42
- Returns:
43
- list: The list of layer names with bn layers.
44
- """
45
- names_bn = []
46
- for name in names:
47
- names_bn.append(name)
48
- if 'conv' in name:
49
- position = name.replace('conv', '')
50
- names_bn.append('bn' + position)
51
- return names_bn
52
-
53
-
54
- @ARCH_REGISTRY.register()
55
- class VGGFeatureExtractor(nn.Module):
56
- """VGG network for feature extraction.
57
-
58
- In this implementation, we allow users to choose whether to use normalization
59
- in the input feature and the type of vgg network. Note that the pretrained
60
- path must match the vgg type.
61
-
62
- Args:
63
- layer_name_list (list[str]): Forward function returns the corresponding
64
- features according to the layer_name_list.
65
- Example: {'relu1_1', 'relu2_1', 'relu3_1'}.
66
- vgg_type (str): Set the type of vgg network. Default: 'vgg19'.
67
- use_input_norm (bool): If True, normalize the input image. Importantly,
68
- the input feature must be in the range [0, 1]. Default: True.
69
- range_norm (bool): If True, norm images with range [-1, 1] to [0, 1].
70
- Default: False.
71
- requires_grad (bool): If true, the parameters of VGG network will be
72
- optimized. Default: False.
73
- remove_pooling (bool): If true, the max pooling operations in VGG net
74
- will be removed. Default: False.
75
- pooling_stride (int): The stride of max pooling operation. Default: 2.
76
- """
77
-
78
- def __init__(self,
79
- layer_name_list,
80
- vgg_type='vgg19',
81
- use_input_norm=True,
82
- range_norm=False,
83
- requires_grad=False,
84
- remove_pooling=False,
85
- pooling_stride=2):
86
- super(VGGFeatureExtractor, self).__init__()
87
-
88
- self.layer_name_list = layer_name_list
89
- self.use_input_norm = use_input_norm
90
- self.range_norm = range_norm
91
-
92
- self.names = NAMES[vgg_type.replace('_bn', '')]
93
- if 'bn' in vgg_type:
94
- self.names = insert_bn(self.names)
95
-
96
- # only borrow layers that will be used to avoid unused params
97
- max_idx = 0
98
- for v in layer_name_list:
99
- idx = self.names.index(v)
100
- if idx > max_idx:
101
- max_idx = idx
102
-
103
- if os.path.exists(VGG_PRETRAIN_PATH):
104
- vgg_net = getattr(vgg, vgg_type)(pretrained=False)
105
- state_dict = torch.load(VGG_PRETRAIN_PATH, map_location=lambda storage, loc: storage)
106
- vgg_net.load_state_dict(state_dict)
107
- else:
108
- vgg_net = getattr(vgg, vgg_type)(pretrained=True)
109
-
110
- features = vgg_net.features[:max_idx + 1]
111
-
112
- modified_net = OrderedDict()
113
- for k, v in zip(self.names, features):
114
- if 'pool' in k:
115
- # if remove_pooling is true, pooling operation will be removed
116
- if remove_pooling:
117
- continue
118
- else:
119
- # in some cases, we may want to change the default stride
120
- modified_net[k] = nn.MaxPool2d(kernel_size=2, stride=pooling_stride)
121
- else:
122
- modified_net[k] = v
123
-
124
- self.vgg_net = nn.Sequential(modified_net)
125
-
126
- if not requires_grad:
127
- self.vgg_net.eval()
128
- for param in self.parameters():
129
- param.requires_grad = False
130
- else:
131
- self.vgg_net.train()
132
- for param in self.parameters():
133
- param.requires_grad = True
134
-
135
- if self.use_input_norm:
136
- # the mean is for image with range [0, 1]
137
- self.register_buffer('mean', torch.Tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1))
138
- # the std is for image with range [0, 1]
139
- self.register_buffer('std', torch.Tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1))
140
-
141
- def forward(self, x):
142
- """Forward function.
143
-
144
- Args:
145
- x (Tensor): Input tensor with shape (n, c, h, w).
146
-
147
- Returns:
148
- Tensor: Forward results.
149
- """
150
- if self.range_norm:
151
- x = (x + 1) / 2
152
- if self.use_input_norm:
153
- x = (x - self.mean) / self.std
154
-
155
- output = {}
156
- for key, layer in self.vgg_net._modules.items():
157
- x = layer(x)
158
- if key in self.layer_name_list:
159
- output[key] = x.clone()
160
-
161
- return output
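For context, the extractor above was typically used as a perceptual-loss backbone. A small usage sketch (it imports the module removed by this commit, so it only runs against a checkout that still contains it); inputs are expected in [0, 1] when `use_input_norm=True`, and the forward pass returns a dict keyed by the requested layer names:

```python
import torch
import torch.nn.functional as F

from basicsr.archs.vgg_arch import VGGFeatureExtractor  # module removed by this commit

extractor = VGGFeatureExtractor(
    layer_name_list=['relu1_1', 'relu2_1', 'relu3_1'],
    vgg_type='vgg19',
    use_input_norm=True)

img_a = torch.rand(1, 3, 224, 224)  # range [0, 1]
img_b = torch.rand(1, 3, 224, 224)

feats_a = extractor(img_a)
feats_b = extractor(img_b)
# a typical perceptual loss: L1 distance between the requested VGG features
loss = sum(F.l1_loss(feats_a[k], feats_b[k]) for k in feats_a)
```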
 
basicsr/data/__init__.py DELETED
@@ -1,101 +0,0 @@
1
- import importlib
2
- import numpy as np
3
- import random
4
- import torch
5
- import torch.utils.data
6
- from copy import deepcopy
7
- from functools import partial
8
- from os import path as osp
9
-
10
- from basicsr.data.prefetch_dataloader import PrefetchDataLoader
11
- from basicsr.utils import get_root_logger, scandir
12
- from basicsr.utils.dist_util import get_dist_info
13
- from basicsr.utils.registry import DATASET_REGISTRY
14
-
15
- __all__ = ['build_dataset', 'build_dataloader']
16
-
17
- # automatically scan and import dataset modules for registry
18
- # scan all the files under the data folder with '_dataset' in file names
19
- data_folder = osp.dirname(osp.abspath(__file__))
20
- dataset_filenames = [osp.splitext(osp.basename(v))[0] for v in scandir(data_folder) if v.endswith('_dataset.py')]
21
- # import all the dataset modules
22
- _dataset_modules = [importlib.import_module(f'basicsr.data.{file_name}') for file_name in dataset_filenames]
23
-
24
-
25
- def build_dataset(dataset_opt):
26
- """Build dataset from options.
27
-
28
- Args:
29
- dataset_opt (dict): Configuration for dataset. It must contain:
30
- name (str): Dataset name.
31
- type (str): Dataset type.
32
- """
33
- dataset_opt = deepcopy(dataset_opt)
34
- dataset = DATASET_REGISTRY.get(dataset_opt['type'])(dataset_opt)
35
- logger = get_root_logger()
36
- logger.info(f'Dataset [{dataset.__class__.__name__}] - {dataset_opt["name"]} is built.')
37
- return dataset
38
-
39
-
40
- def build_dataloader(dataset, dataset_opt, num_gpu=1, dist=False, sampler=None, seed=None):
41
- """Build dataloader.
42
-
43
- Args:
44
- dataset (torch.utils.data.Dataset): Dataset.
45
- dataset_opt (dict): Dataset options. It contains the following keys:
46
- phase (str): 'train' or 'val'.
47
- num_worker_per_gpu (int): Number of workers for each GPU.
48
- batch_size_per_gpu (int): Training batch size for each GPU.
49
- num_gpu (int): Number of GPUs. Used only in the train phase.
50
- Default: 1.
51
- dist (bool): Whether in distributed training. Used only in the train
52
- phase. Default: False.
53
- sampler (torch.utils.data.sampler): Data sampler. Default: None.
54
- seed (int | None): Seed. Default: None
55
- """
56
- phase = dataset_opt['phase']
57
- rank, _ = get_dist_info()
58
- if phase == 'train':
59
- if dist: # distributed training
60
- batch_size = dataset_opt['batch_size_per_gpu']
61
- num_workers = dataset_opt['num_worker_per_gpu']
62
- else: # non-distributed training
63
- multiplier = 1 if num_gpu == 0 else num_gpu
64
- batch_size = dataset_opt['batch_size_per_gpu'] * multiplier
65
- num_workers = dataset_opt['num_worker_per_gpu'] * multiplier
66
- dataloader_args = dict(
67
- dataset=dataset,
68
- batch_size=batch_size,
69
- shuffle=False,
70
- num_workers=num_workers,
71
- sampler=sampler,
72
- drop_last=True)
73
- if sampler is None:
74
- dataloader_args['shuffle'] = True
75
- dataloader_args['worker_init_fn'] = partial(
76
- worker_init_fn, num_workers=num_workers, rank=rank, seed=seed) if seed is not None else None
77
- elif phase in ['val', 'test']: # validation
78
- dataloader_args = dict(dataset=dataset, batch_size=1, shuffle=False, num_workers=0)
79
- else:
80
- raise ValueError(f"Wrong dataset phase: {phase}. Supported ones are 'train', 'val' and 'test'.")
81
-
82
- dataloader_args['pin_memory'] = dataset_opt.get('pin_memory', False)
83
- dataloader_args['persistent_workers'] = dataset_opt.get('persistent_workers', False)
84
-
85
- prefetch_mode = dataset_opt.get('prefetch_mode')
86
- if prefetch_mode == 'cpu': # CPUPrefetcher
87
- num_prefetch_queue = dataset_opt.get('num_prefetch_queue', 1)
88
- logger = get_root_logger()
89
- logger.info(f'Use {prefetch_mode} prefetch dataloader: num_prefetch_queue = {num_prefetch_queue}')
90
- return PrefetchDataLoader(num_prefetch_queue=num_prefetch_queue, **dataloader_args)
91
- else:
92
- # prefetch_mode=None: Normal dataloader
93
- # prefetch_mode='cuda': dataloader for CUDAPrefetcher
94
- return torch.utils.data.DataLoader(**dataloader_args)
95
-
96
-
97
- def worker_init_fn(worker_id, num_workers, rank, seed):
98
- # Set the worker seed to num_workers * rank + worker_id + seed
99
- worker_seed = num_workers * rank + worker_id + seed
100
- np.random.seed(worker_seed)
101
- random.seed(worker_seed)
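A hedged sketch of how the two builders above fit together; the `type` value, folder paths, and size options below are placeholders and must match a dataset class registered in `DATASET_REGISTRY`:

```python
from basicsr.data import build_dataloader, build_dataset  # module removed by this commit

# placeholder options: 'type' must name a registered dataset, paths are illustrative
dataset_opt = {
    'name': 'example_train',
    'type': 'PairedImageDataset',
    'phase': 'train',
    'scale': 4,
    'dataroot_gt': 'datasets/example/gt',
    'dataroot_lq': 'datasets/example/lq',
    'io_backend': {'type': 'disk'},
    'gt_size': 128,
    'use_hflip': True,
    'use_rot': True,
    'batch_size_per_gpu': 4,
    'num_worker_per_gpu': 2,
}

train_set = build_dataset(dataset_opt)
train_loader = build_dataloader(train_set, dataset_opt, num_gpu=1, dist=False, sampler=None, seed=0)
```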
 
basicsr/data/data_sampler.py DELETED
@@ -1,48 +0,0 @@
1
- import math
2
- import torch
3
- from torch.utils.data.sampler import Sampler
4
-
5
-
6
- class EnlargedSampler(Sampler):
7
- """Sampler that restricts data loading to a subset of the dataset.
8
-
9
- Modified from torch.utils.data.distributed.DistributedSampler
10
- Supports enlarging the dataset for iteration-based training, saving
11
- time when restarting the dataloader after each epoch
12
-
13
- Args:
14
- dataset (torch.utils.data.Dataset): Dataset used for sampling.
15
- num_replicas (int | None): Number of processes participating in
16
- the training. It is usually the world_size.
17
- rank (int | None): Rank of the current process within num_replicas.
18
- ratio (int): Enlarging ratio. Default: 1.
19
- """
20
-
21
- def __init__(self, dataset, num_replicas, rank, ratio=1):
22
- self.dataset = dataset
23
- self.num_replicas = num_replicas
24
- self.rank = rank
25
- self.epoch = 0
26
- self.num_samples = math.ceil(len(self.dataset) * ratio / self.num_replicas)
27
- self.total_size = self.num_samples * self.num_replicas
28
-
29
- def __iter__(self):
30
- # deterministically shuffle based on epoch
31
- g = torch.Generator()
32
- g.manual_seed(self.epoch)
33
- indices = torch.randperm(self.total_size, generator=g).tolist()
34
-
35
- dataset_size = len(self.dataset)
36
- indices = [v % dataset_size for v in indices]
37
-
38
- # subsample
39
- indices = indices[self.rank:self.total_size:self.num_replicas]
40
- assert len(indices) == self.num_samples
41
-
42
- return iter(indices)
43
-
44
- def __len__(self):
45
- return self.num_samples
46
-
47
- def set_epoch(self, epoch):
48
- self.epoch = epoch
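A short sketch of how the sampler above plugs into a loader for iteration-based training; a single process is assumed (`num_replicas=1`, `rank=0`) and the dataset is a stand-in:

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

from basicsr.data.data_sampler import EnlargedSampler  # module removed by this commit

dataset = TensorDataset(torch.arange(10))  # stand-in dataset with 10 items
sampler = EnlargedSampler(dataset, num_replicas=1, rank=0, ratio=4)  # 4x enlarged epoch

loader = DataLoader(dataset, batch_size=2, sampler=sampler, shuffle=False)
for epoch in range(2):
    sampler.set_epoch(epoch)  # deterministic reshuffle per epoch
    for (batch,) in loader:
        pass  # training step would go here
```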
 
basicsr/data/data_util.py DELETED
@@ -1,315 +0,0 @@
1
- import cv2
2
- import numpy as np
3
- import torch
4
- from os import path as osp
5
- from torch.nn import functional as F
6
-
7
- from basicsr.data.transforms import mod_crop
8
- from basicsr.utils import img2tensor, scandir
9
-
10
-
11
- def read_img_seq(path, require_mod_crop=False, scale=1, return_imgname=False):
12
- """Read a sequence of images from a given folder path.
13
-
14
- Args:
15
- path (list[str] | str): List of image paths or image folder path.
16
- require_mod_crop (bool): Require mod crop for each image.
17
- Default: False.
18
- scale (int): Scale factor for mod_crop. Default: 1.
19
- return_imgname(bool): Whether return image names. Default False.
20
-
21
- Returns:
22
- Tensor: size (t, c, h, w), RGB, [0, 1].
23
- list[str]: Returned image name list.
24
- """
25
- if isinstance(path, list):
26
- img_paths = path
27
- else:
28
- img_paths = sorted(list(scandir(path, full_path=True)))
29
- imgs = [cv2.imread(v).astype(np.float32) / 255. for v in img_paths]
30
-
31
- if require_mod_crop:
32
- imgs = [mod_crop(img, scale) for img in imgs]
33
- imgs = img2tensor(imgs, bgr2rgb=True, float32=True)
34
- imgs = torch.stack(imgs, dim=0)
35
-
36
- if return_imgname:
37
- imgnames = [osp.splitext(osp.basename(path))[0] for path in img_paths]
38
- return imgs, imgnames
39
- else:
40
- return imgs
41
-
42
-
43
- def generate_frame_indices(crt_idx, max_frame_num, num_frames, padding='reflection'):
44
- """Generate an index list for reading `num_frames` frames from a sequence
45
- of images.
46
-
47
- Args:
48
- crt_idx (int): Current center index.
49
- max_frame_num (int): Max number of the sequence of images (from 1).
50
- num_frames (int): Reading num_frames frames.
51
- padding (str): Padding mode, one of
52
- 'replicate' | 'reflection' | 'reflection_circle' | 'circle'
53
- Examples: current_idx = 0, num_frames = 5
54
- The generated frame indices under different padding mode:
55
- replicate: [0, 0, 0, 1, 2]
56
- reflection: [2, 1, 0, 1, 2]
57
- reflection_circle: [4, 3, 0, 1, 2]
58
- circle: [3, 4, 0, 1, 2]
59
-
60
- Returns:
61
- list[int]: A list of indices.
62
- """
63
- assert num_frames % 2 == 1, 'num_frames should be an odd number.'
64
- assert padding in ('replicate', 'reflection', 'reflection_circle', 'circle'), f'Wrong padding mode: {padding}.'
65
-
66
- max_frame_num = max_frame_num - 1 # start from 0
67
- num_pad = num_frames // 2
68
-
69
- indices = []
70
- for i in range(crt_idx - num_pad, crt_idx + num_pad + 1):
71
- if i < 0:
72
- if padding == 'replicate':
73
- pad_idx = 0
74
- elif padding == 'reflection':
75
- pad_idx = -i
76
- elif padding == 'reflection_circle':
77
- pad_idx = crt_idx + num_pad - i
78
- else:
79
- pad_idx = num_frames + i
80
- elif i > max_frame_num:
81
- if padding == 'replicate':
82
- pad_idx = max_frame_num
83
- elif padding == 'reflection':
84
- pad_idx = max_frame_num * 2 - i
85
- elif padding == 'reflection_circle':
86
- pad_idx = (crt_idx - num_pad) - (i - max_frame_num)
87
- else:
88
- pad_idx = i - num_frames
89
- else:
90
- pad_idx = i
91
- indices.append(pad_idx)
92
- return indices
93
-
94
-
95
- def paired_paths_from_lmdb(folders, keys):
96
- """Generate paired paths from lmdb files.
97
-
98
- Contents of lmdb. Taking the `lq.lmdb` for example, the file structure is:
99
-
100
- ::
101
-
102
- lq.lmdb
103
- ├── data.mdb
104
- ├── lock.mdb
105
- ├── meta_info.txt
106
-
107
- The data.mdb and lock.mdb are standard lmdb files and you can refer to
108
- https://lmdb.readthedocs.io/en/release/ for more details.
109
-
110
- The meta_info.txt is a specified txt file to record the meta information
111
- of our datasets. It will be automatically created when preparing
112
- datasets by our provided dataset tools.
113
- Each line in the txt file records
114
- 1)image name (with extension),
115
- 2)image shape,
116
- 3)compression level, separated by a white space.
117
- Example: `baboon.png (120,125,3) 1`
118
-
119
- We use the image name without extension as the lmdb key.
120
- Note that we use the same key for the corresponding lq and gt images.
121
-
122
- Args:
123
- folders (list[str]): A list of folder path. The order of list should
124
- be [input_folder, gt_folder].
125
- keys (list[str]): A list of keys identifying folders. The order should
126
- be consistent with folders, e.g., ['lq', 'gt'].
127
- Note that this key is different from lmdb keys.
128
-
129
- Returns:
130
- list[str]: Returned path list.
131
- """
132
- assert len(folders) == 2, ('The len of folders should be 2 with [input_folder, gt_folder]. '
133
- f'But got {len(folders)}')
134
- assert len(keys) == 2, f'The len of keys should be 2 with [input_key, gt_key]. But got {len(keys)}'
135
- input_folder, gt_folder = folders
136
- input_key, gt_key = keys
137
-
138
- if not (input_folder.endswith('.lmdb') and gt_folder.endswith('.lmdb')):
139
- raise ValueError(f'{input_key} folder and {gt_key} folder should both in lmdb '
140
- f'formats. But received {input_key}: {input_folder}; '
141
- f'{gt_key}: {gt_folder}')
142
- # ensure that the two meta_info files are the same
143
- with open(osp.join(input_folder, 'meta_info.txt')) as fin:
144
- input_lmdb_keys = [line.split('.')[0] for line in fin]
145
- with open(osp.join(gt_folder, 'meta_info.txt')) as fin:
146
- gt_lmdb_keys = [line.split('.')[0] for line in fin]
147
- if set(input_lmdb_keys) != set(gt_lmdb_keys):
148
- raise ValueError(f'Keys in {input_key}_folder and {gt_key}_folder are different.')
149
- else:
150
- paths = []
151
- for lmdb_key in sorted(input_lmdb_keys):
152
- paths.append(dict([(f'{input_key}_path', lmdb_key), (f'{gt_key}_path', lmdb_key)]))
153
- return paths
154
-
155
-
156
- def paired_paths_from_meta_info_file(folders, keys, meta_info_file, filename_tmpl):
157
- """Generate paired paths from a meta information file.
158
-
159
- Each line in the meta information file contains the image names and
160
- image shape (usually for gt), separated by a white space.
161
-
162
- Example of a meta information file:
163
- ```
164
- 0001_s001.png (480,480,3)
165
- 0001_s002.png (480,480,3)
166
- ```
167
-
168
- Args:
169
- folders (list[str]): A list of folder path. The order of list should
170
- be [input_folder, gt_folder].
171
- keys (list[str]): A list of keys identifying folders. The order should
172
- be consistent with folders, e.g., ['lq', 'gt'].
173
- meta_info_file (str): Path to the meta information file.
174
- filename_tmpl (str): Template for each filename. Note that the
175
- template excludes the file extension. Usually the filename_tmpl is
176
- for files in the input folder.
177
-
178
- Returns:
179
- list[str]: Returned path list.
180
- """
181
- assert len(folders) == 2, ('The len of folders should be 2 with [input_folder, gt_folder]. '
182
- f'But got {len(folders)}')
183
- assert len(keys) == 2, f'The len of keys should be 2 with [input_key, gt_key]. But got {len(keys)}'
184
- input_folder, gt_folder = folders
185
- input_key, gt_key = keys
186
-
187
- with open(meta_info_file, 'r') as fin:
188
- gt_names = [line.strip().split(' ')[0] for line in fin]
189
-
190
- paths = []
191
- for gt_name in gt_names:
192
- basename, ext = osp.splitext(osp.basename(gt_name))
193
- input_name = f'{filename_tmpl.format(basename)}{ext}'
194
- input_path = osp.join(input_folder, input_name)
195
- gt_path = osp.join(gt_folder, gt_name)
196
- paths.append(dict([(f'{input_key}_path', input_path), (f'{gt_key}_path', gt_path)]))
197
- return paths
198
-
199
-
200
- def paired_paths_from_folder(folders, keys, filename_tmpl):
201
- """Generate paired paths from folders.
202
-
203
- Args:
204
- folders (list[str]): A list of folder path. The order of list should
205
- be [input_folder, gt_folder].
206
- keys (list[str]): A list of keys identifying folders. The order should
207
- be consistent with folders, e.g., ['lq', 'gt'].
208
- filename_tmpl (str): Template for each filename. Note that the
209
- template excludes the file extension. Usually the filename_tmpl is
210
- for files in the input folder.
211
-
212
- Returns:
213
- list[str]: Returned path list.
214
- """
215
- assert len(folders) == 2, ('The len of folders should be 2 with [input_folder, gt_folder]. '
216
- f'But got {len(folders)}')
217
- assert len(keys) == 2, f'The len of keys should be 2 with [input_key, gt_key]. But got {len(keys)}'
218
- input_folder, gt_folder = folders
219
- input_key, gt_key = keys
220
-
221
- input_paths = list(scandir(input_folder))
222
- gt_paths = list(scandir(gt_folder))
223
- assert len(input_paths) == len(gt_paths), (f'{input_key} and {gt_key} datasets have different number of images: '
224
- f'{len(input_paths)}, {len(gt_paths)}.')
225
- paths = []
226
- for gt_path in gt_paths:
227
- basename, ext = osp.splitext(osp.basename(gt_path))
228
- input_name = f'{filename_tmpl.format(basename)}{ext}'
229
- input_path = osp.join(input_folder, input_name)
230
- assert input_name in input_paths, f'{input_name} is not in {input_key}_paths.'
231
- gt_path = osp.join(gt_folder, gt_path)
232
- paths.append(dict([(f'{input_key}_path', input_path), (f'{gt_key}_path', gt_path)]))
233
- return paths
234
-
235
-
236
- def paths_from_folder(folder):
237
- """Generate paths from folder.
238
-
239
- Args:
240
- folder (str): Folder path.
241
-
242
- Returns:
243
- list[str]: Returned path list.
244
- """
245
-
246
- paths = list(scandir(folder))
247
- paths = [osp.join(folder, path) for path in paths]
248
- return paths
249
-
250
-
251
- def paths_from_lmdb(folder):
252
- """Generate paths from lmdb.
253
-
254
- Args:
255
- folder (str): Folder path.
256
-
257
- Returns:
258
- list[str]: Returned path list.
259
- """
260
- if not folder.endswith('.lmdb'):
261
- raise ValueError(f'Folder {folder} should be in lmdb format.')
262
- with open(osp.join(folder, 'meta_info.txt')) as fin:
263
- paths = [line.split('.')[0] for line in fin]
264
- return paths
265
-
266
-
267
- def generate_gaussian_kernel(kernel_size=13, sigma=1.6):
268
- """Generate Gaussian kernel used in `duf_downsample`.
269
-
270
- Args:
271
- kernel_size (int): Kernel size. Default: 13.
272
- sigma (float): Sigma of the Gaussian kernel. Default: 1.6.
273
-
274
- Returns:
275
- np.array: The Gaussian kernel.
276
- """
277
- from scipy.ndimage import filters as filters
278
- kernel = np.zeros((kernel_size, kernel_size))
279
- # set element at the middle to one, a dirac delta
280
- kernel[kernel_size // 2, kernel_size // 2] = 1
281
- # gaussian-smooth the dirac, resulting in a gaussian filter
282
- return filters.gaussian_filter(kernel, sigma)
283
-
284
-
285
- def duf_downsample(x, kernel_size=13, scale=4):
286
- """Downsampling with Gaussian kernel used in the DUF official code.
287
-
288
- Args:
289
- x (Tensor): Frames to be downsampled, with shape (b, t, c, h, w).
290
- kernel_size (int): Kernel size. Default: 13.
291
- scale (int): Downsampling factor. Supported scale: (2, 3, 4).
292
- Default: 4.
293
-
294
- Returns:
295
- Tensor: DUF downsampled frames.
296
- """
297
- assert scale in (2, 3, 4), f'Only support scale (2, 3, 4), but got {scale}.'
298
-
299
- squeeze_flag = False
300
- if x.ndim == 4:
301
- squeeze_flag = True
302
- x = x.unsqueeze(0)
303
- b, t, c, h, w = x.size()
304
- x = x.view(-1, 1, h, w)
305
- pad_w, pad_h = kernel_size // 2 + scale * 2, kernel_size // 2 + scale * 2
306
- x = F.pad(x, (pad_w, pad_w, pad_h, pad_h), 'reflect')
307
-
308
- gaussian_filter = generate_gaussian_kernel(kernel_size, 0.4 * scale)
309
- gaussian_filter = torch.from_numpy(gaussian_filter).type_as(x).unsqueeze(0).unsqueeze(0)
310
- x = F.conv2d(x, gaussian_filter, stride=scale)
311
- x = x[:, :, 2:-2, 2:-2]
312
- x = x.view(b, t, c, x.size(2), x.size(3))
313
- if squeeze_flag:
314
- x = x.squeeze(0)
315
- return x
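The padding behaviour of `generate_frame_indices` above is easiest to see on concrete indices; a standalone check that reproduces the examples in its docstring (the import only resolves on a checkout that still has the module):

```python
from basicsr.data.data_util import generate_frame_indices  # module removed by this commit

# centre index 0 of a 100-frame clip, reading 5 frames per sample
print(generate_frame_indices(0, max_frame_num=100, num_frames=5, padding='replicate'))
# [0, 0, 0, 1, 2]
print(generate_frame_indices(0, max_frame_num=100, num_frames=5, padding='reflection'))
# [2, 1, 0, 1, 2]
# at the far end of the clip, reflection folds the indices back
print(generate_frame_indices(99, max_frame_num=100, num_frames=5, padding='reflection'))
# [97, 98, 99, 98, 97]
```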
 
basicsr/data/degradations.py DELETED
@@ -1,764 +0,0 @@
1
- import cv2
2
- import math
3
- import numpy as np
4
- import random
5
- import torch
6
- from scipy import special
7
- from scipy.stats import multivariate_normal
8
- from torchvision.transforms.functional import rgb_to_grayscale
9
-
10
- # -------------------------------------------------------------------- #
11
- # --------------------------- blur kernels --------------------------- #
12
- # -------------------------------------------------------------------- #
13
-
14
-
15
- # --------------------------- util functions --------------------------- #
16
- def sigma_matrix2(sig_x, sig_y, theta):
17
- """Calculate the rotated sigma matrix (two dimensional matrix).
18
-
19
- Args:
20
- sig_x (float):
21
- sig_y (float):
22
- theta (float): Radian measurement.
23
-
24
- Returns:
25
- ndarray: Rotated sigma matrix.
26
- """
27
- d_matrix = np.array([[sig_x**2, 0], [0, sig_y**2]])
28
- u_matrix = np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]])
29
- return np.dot(u_matrix, np.dot(d_matrix, u_matrix.T))
30
-
31
-
32
- def mesh_grid(kernel_size):
33
- """Generate the mesh grid, centering at zero.
34
-
35
- Args:
36
- kernel_size (int):
37
-
38
- Returns:
39
- xy (ndarray): with the shape (kernel_size, kernel_size, 2)
40
- xx (ndarray): with the shape (kernel_size, kernel_size)
41
- yy (ndarray): with the shape (kernel_size, kernel_size)
42
- """
43
- ax = np.arange(-kernel_size // 2 + 1., kernel_size // 2 + 1.)
44
- xx, yy = np.meshgrid(ax, ax)
45
- xy = np.hstack((xx.reshape((kernel_size * kernel_size, 1)), yy.reshape(kernel_size * kernel_size,
46
- 1))).reshape(kernel_size, kernel_size, 2)
47
- return xy, xx, yy
48
-
49
-
50
- def pdf2(sigma_matrix, grid):
51
- """Calculate PDF of the bivariate Gaussian distribution.
52
-
53
- Args:
54
- sigma_matrix (ndarray): with the shape (2, 2)
55
- grid (ndarray): generated by :func:`mesh_grid`,
56
- with the shape (K, K, 2), K is the kernel size.
57
-
58
- Returns:
59
- kernel (ndarray): un-normalized kernel.
60
- """
61
- inverse_sigma = np.linalg.inv(sigma_matrix)
62
- kernel = np.exp(-0.5 * np.sum(np.dot(grid, inverse_sigma) * grid, 2))
63
- return kernel
64
-
65
-
66
- def cdf2(d_matrix, grid):
67
- """Calculate the CDF of the standard bivariate Gaussian distribution.
68
- Used in skewed Gaussian distribution.
69
-
70
- Args:
71
- d_matrix (ndarray): skew matrix.
72
- grid (ndarray): generated by :func:`mesh_grid`,
73
- with the shape (K, K, 2), K is the kernel size.
74
-
75
- Returns:
76
- cdf (ndarray): skewed cdf.
77
- """
78
- rv = multivariate_normal([0, 0], [[1, 0], [0, 1]])
79
- grid = np.dot(grid, d_matrix)
80
- cdf = rv.cdf(grid)
81
- return cdf
82
-
83
-
84
- def bivariate_Gaussian(kernel_size, sig_x, sig_y, theta, grid=None, isotropic=True):
85
- """Generate a bivariate isotropic or anisotropic Gaussian kernel.
86
-
87
- In the isotropic mode, only `sig_x` is used. `sig_y` and `theta` are ignored.
88
-
89
- Args:
90
- kernel_size (int):
91
- sig_x (float):
92
- sig_y (float):
93
- theta (float): Radian measurement.
94
- grid (ndarray, optional): generated by :func:`mesh_grid`,
95
- with the shape (K, K, 2), K is the kernel size. Default: None
96
- isotropic (bool):
97
-
98
- Returns:
99
- kernel (ndarray): normalized kernel.
100
- """
101
- if grid is None:
102
- grid, _, _ = mesh_grid(kernel_size)
103
- if isotropic:
104
- sigma_matrix = np.array([[sig_x**2, 0], [0, sig_x**2]])
105
- else:
106
- sigma_matrix = sigma_matrix2(sig_x, sig_y, theta)
107
- kernel = pdf2(sigma_matrix, grid)
108
- kernel = kernel / np.sum(kernel)
109
- return kernel
110
-
111
-
112
- def bivariate_generalized_Gaussian(kernel_size, sig_x, sig_y, theta, beta, grid=None, isotropic=True):
113
- """Generate a bivariate generalized Gaussian kernel.
114
-
115
- ``Paper: Parameter Estimation For Multivariate Generalized Gaussian Distributions``
116
-
117
- In the isotropic mode, only `sig_x` is used. `sig_y` and `theta` are ignored.
118
-
119
- Args:
120
- kernel_size (int):
121
- sig_x (float):
122
- sig_y (float):
123
- theta (float): Radian measurement.
124
- beta (float): shape parameter, beta = 1 is the normal distribution.
125
- grid (ndarray, optional): generated by :func:`mesh_grid`,
126
- with the shape (K, K, 2), K is the kernel size. Default: None
127
-
128
- Returns:
129
- kernel (ndarray): normalized kernel.
130
- """
131
- if grid is None:
132
- grid, _, _ = mesh_grid(kernel_size)
133
- if isotropic:
134
- sigma_matrix = np.array([[sig_x**2, 0], [0, sig_x**2]])
135
- else:
136
- sigma_matrix = sigma_matrix2(sig_x, sig_y, theta)
137
- inverse_sigma = np.linalg.inv(sigma_matrix)
138
- kernel = np.exp(-0.5 * np.power(np.sum(np.dot(grid, inverse_sigma) * grid, 2), beta))
139
- kernel = kernel / np.sum(kernel)
140
- return kernel
141
-
142
-
143
- def bivariate_plateau(kernel_size, sig_x, sig_y, theta, beta, grid=None, isotropic=True):
144
- """Generate a plateau-like anisotropic kernel.
145
-
146
- 1 / (1+x^(beta))
147
-
148
- Reference: https://stats.stackexchange.com/questions/203629/is-there-a-plateau-shaped-distribution
149
-
150
- In the isotropic mode, only `sig_x` is used. `sig_y` and `theta` are ignored.
151
-
152
- Args:
153
- kernel_size (int):
154
- sig_x (float):
155
- sig_y (float):
156
- theta (float): Radian measurement.
157
- beta (float): shape parameter, beta = 1 is the normal distribution.
158
- grid (ndarray, optional): generated by :func:`mesh_grid`,
159
- with the shape (K, K, 2), K is the kernel size. Default: None
160
-
161
- Returns:
162
- kernel (ndarray): normalized kernel.
163
- """
164
- if grid is None:
165
- grid, _, _ = mesh_grid(kernel_size)
166
- if isotropic:
167
- sigma_matrix = np.array([[sig_x**2, 0], [0, sig_x**2]])
168
- else:
169
- sigma_matrix = sigma_matrix2(sig_x, sig_y, theta)
170
- inverse_sigma = np.linalg.inv(sigma_matrix)
171
- kernel = np.reciprocal(np.power(np.sum(np.dot(grid, inverse_sigma) * grid, 2), beta) + 1)
172
- kernel = kernel / np.sum(kernel)
173
- return kernel
174
-
175
-
176
- def random_bivariate_Gaussian(kernel_size,
177
- sigma_x_range,
178
- sigma_y_range,
179
- rotation_range,
180
- noise_range=None,
181
- isotropic=True):
182
- """Randomly generate bivariate isotropic or anisotropic Gaussian kernels.
183
-
184
- In the isotropic mode, only `sigma_x_range` is used. `sigma_y_range` and `rotation_range` are ignored.
185
-
186
- Args:
187
- kernel_size (int):
188
- sigma_x_range (tuple): [0.6, 5]
189
- sigma_y_range (tuple): [0.6, 5]
190
- rotation range (tuple): [-math.pi, math.pi]
191
- noise_range(tuple, optional): multiplicative kernel noise,
192
- [0.75, 1.25]. Default: None
193
-
194
- Returns:
195
- kernel (ndarray):
196
- """
197
- assert kernel_size % 2 == 1, 'Kernel size must be an odd number.'
198
- assert sigma_x_range[0] < sigma_x_range[1], 'Wrong sigma_x_range.'
199
- sigma_x = np.random.uniform(sigma_x_range[0], sigma_x_range[1])
200
- if isotropic is False:
201
- assert sigma_y_range[0] < sigma_y_range[1], 'Wrong sigma_y_range.'
202
- assert rotation_range[0] < rotation_range[1], 'Wrong rotation_range.'
203
- sigma_y = np.random.uniform(sigma_y_range[0], sigma_y_range[1])
204
- rotation = np.random.uniform(rotation_range[0], rotation_range[1])
205
- else:
206
- sigma_y = sigma_x
207
- rotation = 0
208
-
209
- kernel = bivariate_Gaussian(kernel_size, sigma_x, sigma_y, rotation, isotropic=isotropic)
210
-
211
- # add multiplicative noise
212
- if noise_range is not None:
213
- assert noise_range[0] < noise_range[1], 'Wrong noise range.'
214
- noise = np.random.uniform(noise_range[0], noise_range[1], size=kernel.shape)
215
- kernel = kernel * noise
216
- kernel = kernel / np.sum(kernel)
217
- return kernel
218
-
219
-
220
- def random_bivariate_generalized_Gaussian(kernel_size,
221
- sigma_x_range,
222
- sigma_y_range,
223
- rotation_range,
224
- beta_range,
225
- noise_range=None,
226
- isotropic=True):
227
- """Randomly generate bivariate generalized Gaussian kernels.
228
-
229
- In the isotropic mode, only `sigma_x_range` is used. `sigma_y_range` and `rotation_range` are ignored.
230
-
231
- Args:
232
- kernel_size (int):
233
- sigma_x_range (tuple): [0.6, 5]
234
- sigma_y_range (tuple): [0.6, 5]
235
- rotation range (tuple): [-math.pi, math.pi]
236
- beta_range (tuple): [0.5, 8]
237
- noise_range(tuple, optional): multiplicative kernel noise,
238
- [0.75, 1.25]. Default: None
239
-
240
- Returns:
241
- kernel (ndarray):
242
- """
243
- assert kernel_size % 2 == 1, 'Kernel size must be an odd number.'
244
- assert sigma_x_range[0] < sigma_x_range[1], 'Wrong sigma_x_range.'
245
- sigma_x = np.random.uniform(sigma_x_range[0], sigma_x_range[1])
246
- if isotropic is False:
247
- assert sigma_y_range[0] < sigma_y_range[1], 'Wrong sigma_y_range.'
248
- assert rotation_range[0] < rotation_range[1], 'Wrong rotation_range.'
249
- sigma_y = np.random.uniform(sigma_y_range[0], sigma_y_range[1])
250
- rotation = np.random.uniform(rotation_range[0], rotation_range[1])
251
- else:
252
- sigma_y = sigma_x
253
- rotation = 0
254
-
255
- # assume beta_range[0] < 1 < beta_range[1]
256
- if np.random.uniform() < 0.5:
257
- beta = np.random.uniform(beta_range[0], 1)
258
- else:
259
- beta = np.random.uniform(1, beta_range[1])
260
-
261
- kernel = bivariate_generalized_Gaussian(kernel_size, sigma_x, sigma_y, rotation, beta, isotropic=isotropic)
262
-
263
- # add multiplicative noise
264
- if noise_range is not None:
265
- assert noise_range[0] < noise_range[1], 'Wrong noise range.'
266
- noise = np.random.uniform(noise_range[0], noise_range[1], size=kernel.shape)
267
- kernel = kernel * noise
268
- kernel = kernel / np.sum(kernel)
269
- return kernel
270
-
271
-
272
- def random_bivariate_plateau(kernel_size,
273
- sigma_x_range,
274
- sigma_y_range,
275
- rotation_range,
276
- beta_range,
277
- noise_range=None,
278
- isotropic=True):
279
- """Randomly generate bivariate plateau kernels.
280
-
281
- In the isotropic mode, only `sigma_x_range` is used. `sigma_y_range` and `rotation_range` are ignored.
282
-
283
- Args:
284
- kernel_size (int):
285
- sigma_x_range (tuple): [0.6, 5]
286
- sigma_y_range (tuple): [0.6, 5]
287
- rotation range (tuple): [-math.pi/2, math.pi/2]
288
- beta_range (tuple): [1, 4]
289
- noise_range(tuple, optional): multiplicative kernel noise,
290
- [0.75, 1.25]. Default: None
291
-
292
- Returns:
293
- kernel (ndarray):
294
- """
295
- assert kernel_size % 2 == 1, 'Kernel size must be an odd number.'
296
- assert sigma_x_range[0] < sigma_x_range[1], 'Wrong sigma_x_range.'
297
- sigma_x = np.random.uniform(sigma_x_range[0], sigma_x_range[1])
298
- if isotropic is False:
299
- assert sigma_y_range[0] < sigma_y_range[1], 'Wrong sigma_y_range.'
300
- assert rotation_range[0] < rotation_range[1], 'Wrong rotation_range.'
301
- sigma_y = np.random.uniform(sigma_y_range[0], sigma_y_range[1])
302
- rotation = np.random.uniform(rotation_range[0], rotation_range[1])
303
- else:
304
- sigma_y = sigma_x
305
- rotation = 0
306
-
307
- # TODO: this may be not proper
308
- if np.random.uniform() < 0.5:
309
- beta = np.random.uniform(beta_range[0], 1)
310
- else:
311
- beta = np.random.uniform(1, beta_range[1])
312
-
313
- kernel = bivariate_plateau(kernel_size, sigma_x, sigma_y, rotation, beta, isotropic=isotropic)
314
- # add multiplicative noise
315
- if noise_range is not None:
316
- assert noise_range[0] < noise_range[1], 'Wrong noise range.'
317
- noise = np.random.uniform(noise_range[0], noise_range[1], size=kernel.shape)
318
- kernel = kernel * noise
319
- kernel = kernel / np.sum(kernel)
320
-
321
- return kernel
322
-
323
-
324
- def random_mixed_kernels(kernel_list,
325
- kernel_prob,
326
- kernel_size=21,
327
- sigma_x_range=(0.6, 5),
328
- sigma_y_range=(0.6, 5),
329
- rotation_range=(-math.pi, math.pi),
330
- betag_range=(0.5, 8),
331
- betap_range=(0.5, 8),
332
- noise_range=None):
333
- """Randomly generate mixed kernels.
334
-
335
- Args:
336
- kernel_list (tuple): a list name of kernel types,
337
- support ['iso', 'aniso', 'skew', 'generalized', 'plateau_iso',
338
- 'plateau_aniso']
339
- kernel_prob (tuple): corresponding kernel probability for each
340
- kernel type
341
- kernel_size (int):
342
- sigma_x_range (tuple): [0.6, 5]
343
- sigma_y_range (tuple): [0.6, 5]
344
- rotation range (tuple): [-math.pi, math.pi]
345
- beta_range (tuple): [0.5, 8]
346
- noise_range(tuple, optional): multiplicative kernel noise,
347
- [0.75, 1.25]. Default: None
348
-
349
- Returns:
350
- kernel (ndarray):
351
- """
352
- kernel_type = random.choices(kernel_list, kernel_prob)[0]
353
- if kernel_type == 'iso':
354
- kernel = random_bivariate_Gaussian(
355
- kernel_size, sigma_x_range, sigma_y_range, rotation_range, noise_range=noise_range, isotropic=True)
356
- elif kernel_type == 'aniso':
357
- kernel = random_bivariate_Gaussian(
358
- kernel_size, sigma_x_range, sigma_y_range, rotation_range, noise_range=noise_range, isotropic=False)
359
- elif kernel_type == 'generalized_iso':
360
- kernel = random_bivariate_generalized_Gaussian(
361
- kernel_size,
362
- sigma_x_range,
363
- sigma_y_range,
364
- rotation_range,
365
- betag_range,
366
- noise_range=noise_range,
367
- isotropic=True)
368
- elif kernel_type == 'generalized_aniso':
369
- kernel = random_bivariate_generalized_Gaussian(
370
- kernel_size,
371
- sigma_x_range,
372
- sigma_y_range,
373
- rotation_range,
374
- betag_range,
375
- noise_range=noise_range,
376
- isotropic=False)
377
- elif kernel_type == 'plateau_iso':
378
- kernel = random_bivariate_plateau(
379
- kernel_size, sigma_x_range, sigma_y_range, rotation_range, betap_range, noise_range=None, isotropic=True)
380
- elif kernel_type == 'plateau_aniso':
381
- kernel = random_bivariate_plateau(
382
- kernel_size, sigma_x_range, sigma_y_range, rotation_range, betap_range, noise_range=None, isotropic=False)
383
- return kernel
384
-
385
-
386
- np.seterr(divide='ignore', invalid='ignore')
387
-
388
-
389
- def circular_lowpass_kernel(cutoff, kernel_size, pad_to=0):
390
- """2D sinc filter
391
-
392
- Reference: https://dsp.stackexchange.com/questions/58301/2-d-circularly-symmetric-low-pass-filter
393
-
394
- Args:
395
- cutoff (float): cutoff frequency in radians (pi is max)
396
- kernel_size (int): horizontal and vertical size, must be odd.
397
- pad_to (int): pad kernel size to desired size, must be odd or zero.
398
- """
399
- assert kernel_size % 2 == 1, 'Kernel size must be an odd number.'
400
- kernel = np.fromfunction(
401
- lambda x, y: cutoff * special.j1(cutoff * np.sqrt(
402
- (x - (kernel_size - 1) / 2)**2 + (y - (kernel_size - 1) / 2)**2)) / (2 * np.pi * np.sqrt(
403
- (x - (kernel_size - 1) / 2)**2 + (y - (kernel_size - 1) / 2)**2)), [kernel_size, kernel_size])
404
- kernel[(kernel_size - 1) // 2, (kernel_size - 1) // 2] = cutoff**2 / (4 * np.pi)
405
- kernel = kernel / np.sum(kernel)
406
- if pad_to > kernel_size:
407
- pad_size = (pad_to - kernel_size) // 2
408
- kernel = np.pad(kernel, ((pad_size, pad_size), (pad_size, pad_size)))
409
- return kernel
410
-
411
-
412
- # ------------------------------------------------------------- #
413
- # --------------------------- noise --------------------------- #
414
- # ------------------------------------------------------------- #
415
-
416
- # ----------------------- Gaussian Noise ----------------------- #
417
-
418
-
419
- def generate_gaussian_noise(img, sigma=10, gray_noise=False):
420
- """Generate Gaussian noise.
421
-
422
- Args:
423
- img (Numpy array): Input image, shape (h, w, c), range [0, 1], float32.
424
- sigma (float): Noise scale (measured in range 255). Default: 10.
425
-
426
- Returns:
427
- (Numpy array): Returned noisy image, shape (h, w, c), range[0, 1],
428
- float32.
429
- """
430
- if gray_noise:
431
- noise = np.float32(np.random.randn(*(img.shape[0:2]))) * sigma / 255.
432
- noise = np.expand_dims(noise, axis=2).repeat(3, axis=2)
433
- else:
434
- noise = np.float32(np.random.randn(*(img.shape))) * sigma / 255.
435
- return noise
436
-
437
-
438
- def add_gaussian_noise(img, sigma=10, clip=True, rounds=False, gray_noise=False):
439
- """Add Gaussian noise.
440
-
441
- Args:
442
- img (Numpy array): Input image, shape (h, w, c), range [0, 1], float32.
443
- sigma (float): Noise scale (measured in range 255). Default: 10.
444
-
445
- Returns:
446
- (Numpy array): Returned noisy image, shape (h, w, c), range[0, 1],
447
- float32.
448
- """
449
- noise = generate_gaussian_noise(img, sigma, gray_noise)
450
- out = img + noise
451
- if clip and rounds:
452
- out = np.clip((out * 255.0).round(), 0, 255) / 255.
453
- elif clip:
454
- out = np.clip(out, 0, 1)
455
- elif rounds:
456
- out = (out * 255.0).round() / 255.
457
- return out
458
-
459
-
460
- def generate_gaussian_noise_pt(img, sigma=10, gray_noise=0):
461
- """Generate Gaussian noise (PyTorch version).
462
-
463
- Args:
464
- img (Tensor): Shape (b, c, h, w), range[0, 1], float32.
465
- sigma (float | Tensor): Noise scale (measured in range 255). Default: 10.
466
-
467
- Returns:
468
- (Tensor): Returned noisy image, shape (b, c, h, w), range[0, 1],
469
- float32.
470
- """
471
- b, _, h, w = img.size()
472
- if not isinstance(sigma, (float, int)):
473
- sigma = sigma.view(img.size(0), 1, 1, 1)
474
- if isinstance(gray_noise, (float, int)):
475
- cal_gray_noise = gray_noise > 0
476
- else:
477
- gray_noise = gray_noise.view(b, 1, 1, 1)
478
- cal_gray_noise = torch.sum(gray_noise) > 0
479
-
480
- if cal_gray_noise:
481
- noise_gray = torch.randn(*img.size()[2:4], dtype=img.dtype, device=img.device) * sigma / 255.
482
- noise_gray = noise_gray.view(b, 1, h, w)
483
-
484
- # always calculate color noise
485
- noise = torch.randn(*img.size(), dtype=img.dtype, device=img.device) * sigma / 255.
486
-
487
- if cal_gray_noise:
488
- noise = noise * (1 - gray_noise) + noise_gray * gray_noise
489
- return noise
490
-
491
-
492
- def add_gaussian_noise_pt(img, sigma=10, gray_noise=0, clip=True, rounds=False):
493
- """Add Gaussian noise (PyTorch version).
494
-
495
- Args:
496
- img (Tensor): Shape (b, c, h, w), range[0, 1], float32.
497
- sigma (float | Tensor): Noise scale (measured in range 255). Default: 10.
498
-
499
- Returns:
500
- (Tensor): Returned noisy image, shape (b, c, h, w), range[0, 1],
501
- float32.
502
- """
503
- noise = generate_gaussian_noise_pt(img, sigma, gray_noise)
504
- out = img + noise
505
- if clip and rounds:
506
- out = torch.clamp((out * 255.0).round(), 0, 255) / 255.
507
- elif clip:
508
- out = torch.clamp(out, 0, 1)
509
- elif rounds:
510
- out = (out * 255.0).round() / 255.
511
- return out
512
-
513
-
514
- # ----------------------- Random Gaussian Noise ----------------------- #
515
- def random_generate_gaussian_noise(img, sigma_range=(0, 10), gray_prob=0):
516
- sigma = np.random.uniform(sigma_range[0], sigma_range[1])
517
- if np.random.uniform() < gray_prob:
518
- gray_noise = True
519
- else:
520
- gray_noise = False
521
- return generate_gaussian_noise(img, sigma, gray_noise)
522
-
523
-
524
- def random_add_gaussian_noise(img, sigma_range=(0, 1.0), gray_prob=0, clip=True, rounds=False):
525
- noise = random_generate_gaussian_noise(img, sigma_range, gray_prob)
526
- out = img + noise
527
- if clip and rounds:
528
- out = np.clip((out * 255.0).round(), 0, 255) / 255.
529
- elif clip:
530
- out = np.clip(out, 0, 1)
531
- elif rounds:
532
- out = (out * 255.0).round() / 255.
533
- return out
534
-
535
-
536
- def random_generate_gaussian_noise_pt(img, sigma_range=(0, 10), gray_prob=0):
537
- sigma = torch.rand(
538
- img.size(0), dtype=img.dtype, device=img.device) * (sigma_range[1] - sigma_range[0]) + sigma_range[0]
539
- gray_noise = torch.rand(img.size(0), dtype=img.dtype, device=img.device)
540
- gray_noise = (gray_noise < gray_prob).float()
541
- return generate_gaussian_noise_pt(img, sigma, gray_noise)
542
-
543
-
544
- def random_add_gaussian_noise_pt(img, sigma_range=(0, 1.0), gray_prob=0, clip=True, rounds=False):
545
- noise = random_generate_gaussian_noise_pt(img, sigma_range, gray_prob)
546
- out = img + noise
547
- if clip and rounds:
548
- out = torch.clamp((out * 255.0).round(), 0, 255) / 255.
549
- elif clip:
550
- out = torch.clamp(out, 0, 1)
551
- elif rounds:
552
- out = (out * 255.0).round() / 255.
553
- return out
554
-
555
-
556
- # ----------------------- Poisson (Shot) Noise ----------------------- #
557
-
558
-
559
- def generate_poisson_noise(img, scale=1.0, gray_noise=False):
560
- """Generate poisson noise.
561
-
562
- Reference: https://github.com/scikit-image/scikit-image/blob/main/skimage/util/noise.py#L37-L219
563
-
564
- Args:
565
- img (Numpy array): Input image, shape (h, w, c), range [0, 1], float32.
566
- scale (float): Noise scale. Default: 1.0.
567
- gray_noise (bool): Whether generate gray noise. Default: False.
568
-
569
- Returns:
570
- (Numpy array): Returned noisy image, shape (h, w, c), range[0, 1],
571
- float32.
572
- """
573
- if gray_noise:
574
- img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
575
- # round and clip image for counting vals correctly
576
- img = np.clip((img * 255.0).round(), 0, 255) / 255.
577
- vals = len(np.unique(img))
578
- vals = 2**np.ceil(np.log2(vals))
579
- out = np.float32(np.random.poisson(img * vals) / float(vals))
580
- noise = out - img
581
- if gray_noise:
582
- noise = np.repeat(noise[:, :, np.newaxis], 3, axis=2)
583
- return noise * scale
584
-
585
-
586
- def add_poisson_noise(img, scale=1.0, clip=True, rounds=False, gray_noise=False):
587
- """Add poisson noise.
588
-
589
- Args:
590
- img (Numpy array): Input image, shape (h, w, c), range [0, 1], float32.
591
- scale (float): Noise scale. Default: 1.0.
592
- gray_noise (bool): Whether generate gray noise. Default: False.
593
-
594
- Returns:
595
- (Numpy array): Returned noisy image, shape (h, w, c), range[0, 1],
596
- float32.
597
- """
598
- noise = generate_poisson_noise(img, scale, gray_noise)
599
- out = img + noise
600
- if clip and rounds:
601
- out = np.clip((out * 255.0).round(), 0, 255) / 255.
602
- elif clip:
603
- out = np.clip(out, 0, 1)
604
- elif rounds:
605
- out = (out * 255.0).round() / 255.
606
- return out
607
-
608
-
609
- def generate_poisson_noise_pt(img, scale=1.0, gray_noise=0):
610
- """Generate a batch of poisson noise (PyTorch version)
611
-
612
- Args:
613
- img (Tensor): Input image, shape (b, c, h, w), range [0, 1], float32.
614
- scale (float | Tensor): Noise scale. Number or Tensor with shape (b).
615
- Default: 1.0.
616
- gray_noise (float | Tensor): 0-1 number or Tensor with shape (b).
617
- 0 for False, 1 for True. Default: 0.
618
-
619
- Returns:
620
- (Tensor): Returned noisy image, shape (b, c, h, w), range[0, 1],
621
- float32.
622
- """
623
- b, _, h, w = img.size()
624
- if isinstance(gray_noise, (float, int)):
625
- cal_gray_noise = gray_noise > 0
626
- else:
627
- gray_noise = gray_noise.view(b, 1, 1, 1)
628
- cal_gray_noise = torch.sum(gray_noise) > 0
629
- if cal_gray_noise:
630
- img_gray = rgb_to_grayscale(img, num_output_channels=1)
631
- # round and clip image for counting vals correctly
632
- img_gray = torch.clamp((img_gray * 255.0).round(), 0, 255) / 255.
633
- # use for-loop to get the unique values for each sample
634
- vals_list = [len(torch.unique(img_gray[i, :, :, :])) for i in range(b)]
635
- vals_list = [2**np.ceil(np.log2(vals)) for vals in vals_list]
636
- vals = img_gray.new_tensor(vals_list).view(b, 1, 1, 1)
637
- out = torch.poisson(img_gray * vals) / vals
638
- noise_gray = out - img_gray
639
- noise_gray = noise_gray.expand(b, 3, h, w)
640
-
641
- # always calculate color noise
642
- # round and clip image for counting vals correctly
643
- img = torch.clamp((img * 255.0).round(), 0, 255) / 255.
644
- # use for-loop to get the unique values for each sample
645
- vals_list = [len(torch.unique(img[i, :, :, :])) for i in range(b)]
646
- vals_list = [2**np.ceil(np.log2(vals)) for vals in vals_list]
647
- vals = img.new_tensor(vals_list).view(b, 1, 1, 1)
648
- out = torch.poisson(img * vals) / vals
649
- noise = out - img
650
- if cal_gray_noise:
651
- noise = noise * (1 - gray_noise) + noise_gray * gray_noise
652
- if not isinstance(scale, (float, int)):
653
- scale = scale.view(b, 1, 1, 1)
654
- return noise * scale
655
-
656
-
657
- def add_poisson_noise_pt(img, scale=1.0, clip=True, rounds=False, gray_noise=0):
658
- """Add poisson noise to a batch of images (PyTorch version).
659
-
660
- Args:
661
- img (Tensor): Input image, shape (b, c, h, w), range [0, 1], float32.
662
- scale (float | Tensor): Noise scale. Number or Tensor with shape (b).
663
- Default: 1.0.
664
- gray_noise (float | Tensor): 0-1 number or Tensor with shape (b).
665
- 0 for False, 1 for True. Default: 0.
666
-
667
- Returns:
668
- (Tensor): Returned noisy image, shape (b, c, h, w), range[0, 1],
669
- float32.
670
- """
671
- noise = generate_poisson_noise_pt(img, scale, gray_noise)
672
- out = img + noise
673
- if clip and rounds:
674
- out = torch.clamp((out * 255.0).round(), 0, 255) / 255.
675
- elif clip:
676
- out = torch.clamp(out, 0, 1)
677
- elif rounds:
678
- out = (out * 255.0).round() / 255.
679
- return out
680
-
681
-
682
- # ----------------------- Random Poisson (Shot) Noise ----------------------- #
683
-
684
-
685
- def random_generate_poisson_noise(img, scale_range=(0, 1.0), gray_prob=0):
686
- scale = np.random.uniform(scale_range[0], scale_range[1])
687
- if np.random.uniform() < gray_prob:
688
- gray_noise = True
689
- else:
690
- gray_noise = False
691
- return generate_poisson_noise(img, scale, gray_noise)
692
-
693
-
694
- def random_add_poisson_noise(img, scale_range=(0, 1.0), gray_prob=0, clip=True, rounds=False):
695
- noise = random_generate_poisson_noise(img, scale_range, gray_prob)
696
- out = img + noise
697
- if clip and rounds:
698
- out = np.clip((out * 255.0).round(), 0, 255) / 255.
699
- elif clip:
700
- out = np.clip(out, 0, 1)
701
- elif rounds:
702
- out = (out * 255.0).round() / 255.
703
- return out
704
-
705
-
706
- def random_generate_poisson_noise_pt(img, scale_range=(0, 1.0), gray_prob=0):
707
- scale = torch.rand(
708
- img.size(0), dtype=img.dtype, device=img.device) * (scale_range[1] - scale_range[0]) + scale_range[0]
709
- gray_noise = torch.rand(img.size(0), dtype=img.dtype, device=img.device)
710
- gray_noise = (gray_noise < gray_prob).float()
711
- return generate_poisson_noise_pt(img, scale, gray_noise)
712
-
713
-
714
- def random_add_poisson_noise_pt(img, scale_range=(0, 1.0), gray_prob=0, clip=True, rounds=False):
715
- noise = random_generate_poisson_noise_pt(img, scale_range, gray_prob)
716
- out = img + noise
717
- if clip and rounds:
718
- out = torch.clamp((out * 255.0).round(), 0, 255) / 255.
719
- elif clip:
720
- out = torch.clamp(out, 0, 1)
721
- elif rounds:
722
- out = (out * 255.0).round() / 255.
723
- return out
724
-
725
-
726
- # ------------------------------------------------------------------------ #
727
- # --------------------------- JPEG compression --------------------------- #
728
- # ------------------------------------------------------------------------ #
729
-
730
-
731
- def add_jpg_compression(img, quality=90):
732
- """Add JPG compression artifacts.
733
-
734
- Args:
735
- img (Numpy array): Input image, shape (h, w, c), range [0, 1], float32.
736
- quality (float): JPG compression quality. 0 for lowest quality, 100 for
737
- best quality. Default: 90.
738
-
739
- Returns:
740
- (Numpy array): Returned image after JPG, shape (h, w, c), range[0, 1],
741
- float32.
742
- """
743
- img = np.clip(img, 0, 1)
744
- encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), quality]
745
- _, encimg = cv2.imencode('.jpg', img * 255., encode_param)
746
- img = np.float32(cv2.imdecode(encimg, 1)) / 255.
747
- return img
748
-
749
-
750
- def random_add_jpg_compression(img, quality_range=(90, 100)):
751
- """Randomly add JPG compression artifacts.
752
-
753
- Args:
754
- img (Numpy array): Input image, shape (h, w, c), range [0, 1], float32.
755
- quality_range (tuple[float] | list[float]): JPG compression quality
756
- range. 0 for lowest quality, 100 for best quality.
757
- Default: (90, 100).
758
-
759
- Returns:
760
- (Numpy array): Returned image after JPG, shape (h, w, c), range[0, 1],
761
- float32.
762
- """
763
- quality = np.random.uniform(quality_range[0], quality_range[1])
764
- return add_jpg_compression(img, quality)
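The blur, noise and JPEG helpers above compose into a simple first-order degradation pipeline; a hedged sketch on a random placeholder image (numpy path, values in [0, 1], import resolves only on a checkout that still has the module):

```python
import math

import cv2
import numpy as np

from basicsr.data.degradations import (  # module removed by this commit
    add_jpg_compression, random_add_gaussian_noise, random_mixed_kernels)

img = np.random.rand(128, 128, 3).astype(np.float32)  # placeholder clean image in [0, 1]

# 1) blur with a randomly sampled isotropic/anisotropic Gaussian kernel
kernel = random_mixed_kernels(
    kernel_list=['iso', 'aniso'],
    kernel_prob=[0.7, 0.3],
    kernel_size=21,
    sigma_x_range=(0.2, 3),
    sigma_y_range=(0.2, 3),
    rotation_range=(-math.pi, math.pi))
blurred = cv2.filter2D(img, -1, kernel)

# 2) Gaussian noise (sigma measured on the 0-255 scale), occasionally grayscale
noisy = random_add_gaussian_noise(blurred, sigma_range=(1, 30), gray_prob=0.4)

# 3) JPEG compression artifacts
degraded = add_jpg_compression(noisy, quality=50)
```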
 
basicsr/data/ffhq_dataset.py DELETED
@@ -1,80 +0,0 @@
1
- import random
2
- import time
3
- from os import path as osp
4
- from torch.utils import data as data
5
- from torchvision.transforms.functional import normalize
6
-
7
- from basicsr.data.transforms import augment
8
- from basicsr.utils import FileClient, get_root_logger, imfrombytes, img2tensor
9
- from basicsr.utils.registry import DATASET_REGISTRY
10
-
11
-
12
- @DATASET_REGISTRY.register()
13
- class FFHQDataset(data.Dataset):
14
- """FFHQ dataset for StyleGAN.
15
-
16
- Args:
17
- opt (dict): Config for train datasets. It contains the following keys:
18
- dataroot_gt (str): Data root path for gt.
19
- io_backend (dict): IO backend type and other kwarg.
20
- mean (list | tuple): Image mean.
21
- std (list | tuple): Image std.
22
- use_hflip (bool): Whether to horizontally flip.
23
-
24
- """
25
-
26
- def __init__(self, opt):
27
- super(FFHQDataset, self).__init__()
28
- self.opt = opt
29
- # file client (io backend)
30
- self.file_client = None
31
- self.io_backend_opt = opt['io_backend']
32
-
33
- self.gt_folder = opt['dataroot_gt']
34
- self.mean = opt['mean']
35
- self.std = opt['std']
36
-
37
- if self.io_backend_opt['type'] == 'lmdb':
38
- self.io_backend_opt['db_paths'] = self.gt_folder
39
- if not self.gt_folder.endswith('.lmdb'):
40
- raise ValueError("'dataroot_gt' should end with '.lmdb', but received {self.gt_folder}")
41
- with open(osp.join(self.gt_folder, 'meta_info.txt')) as fin:
42
- self.paths = [line.split('.')[0] for line in fin]
43
- else:
44
- # FFHQ has 70000 images in total
45
- self.paths = [osp.join(self.gt_folder, f'{v:08d}.png') for v in range(70000)]
46
-
47
- def __getitem__(self, index):
48
- if self.file_client is None:
49
- self.file_client = FileClient(self.io_backend_opt.pop('type'), **self.io_backend_opt)
50
-
51
- # load gt image
52
- gt_path = self.paths[index]
53
- # avoid errors caused by high latency in reading files
54
- retry = 3
55
- while retry > 0:
56
- try:
57
- img_bytes = self.file_client.get(gt_path)
58
- except Exception as e:
59
- logger = get_root_logger()
60
- logger.warning(f'File client error: {e}, remaining retry times: {retry - 1}')
61
- # change another file to read
62
- index = random.randint(0, self.__len__())
63
- gt_path = self.paths[index]
64
- time.sleep(1) # sleep 1s for occasional server congestion
65
- else:
66
- break
67
- finally:
68
- retry -= 1
69
- img_gt = imfrombytes(img_bytes, float32=True)
70
-
71
- # random horizontal flip
72
- img_gt = augment(img_gt, hflip=self.opt['use_hflip'], rotation=False)
73
- # BGR to RGB, HWC to CHW, numpy to tensor
74
- img_gt = img2tensor(img_gt, bgr2rgb=True, float32=True)
75
- # normalize
76
- normalize(img_gt, self.mean, self.std, inplace=True)
77
- return {'gt': img_gt, 'gt_path': gt_path}
78
-
79
- def __len__(self):
80
- return len(self.paths)
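The dataset above is driven entirely by the `opt` dict documented in its docstring; a minimal disk-backend sketch with placeholder paths and the usual normalisation of [0, 1] images to [-1, 1]:

```python
from basicsr.data.ffhq_dataset import FFHQDataset  # module removed by this commit

opt = {
    'dataroot_gt': 'datasets/ffhq/ffhq_512',  # placeholder: folder holding 00000000.png ... 00069999.png
    'io_backend': {'type': 'disk'},
    'mean': [0.5, 0.5, 0.5],  # together with std, maps [0, 1] images to [-1, 1]
    'std': [0.5, 0.5, 0.5],
    'use_hflip': True,
}

dataset = FFHQDataset(opt)
sample = dataset[0]  # requires the placeholder folder to actually contain the images
print(sample['gt'].shape, sample['gt_path'])
```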
 
basicsr/data/meta_info/meta_info_DIV2K800sub_GT.txt DELETED
The diff for this file is too large to render. See raw diff