:sparkles: clean up code

Browse files

Files changed (7) hide show

.style.yapf +6 -0
scripts/README.md → README.md +6 -2
scripts/attention.py +40 -101
scripts/convert_mvdream_to_diffusers.py +37 -68
scripts/models.py +123 -228
scripts/pipeline_mvdream.py +97 -291
vae/diffusion_pytorch_model.bin +1 -1

.style.yapf ADDED Viewed

	@@ -0,0 +1,6 @@

+[style]
+based_on_style = google
+spaces_before_comment = 1
+indent_width: 4
+split_before_logical_operator = true
+column_limit = 1024

scripts/README.md → README.md RENAMED Viewed

@@ -1,4 +1,8 @@
-# Convert original weights to diffusers
 Download original MVDream checkpoint through one of the following sources:
@@ -14,5 +18,5 @@ wget https://raw.githubusercontent.com/bytedance/MVDream/main/mvdream/configs/sd
 Hugging Face diffusers weights are converted by script:
 ```bash
-python ./scripts/convert_mvdream_to_diffusers.py --checkpoint_path ./sd-v1.5-4view.pt --dump_path . --original_config_file ./sd-v1.yaml
 ```

+# MVDream-HF
+A Hugging Face implementation of MVDream, used for quick one-line download. See the [Hugging Face repo](https://huggingface.co/KokeCacao/mvdream-hf/tree/main), which hosts the sd-v1.5 version.
+## Convert Original Weights to Diffusers
 Download original MVDream checkpoint through one of the following sources:
 Hugging Face diffusers weights are converted by script:
 ```bash
+python ./scripts/convert_mvdream_to_diffusers.py --checkpoint_path ./sd-v1.5-4view.pt --dump_path . --original_config_file ./sd-v1.yaml --test
 ```

scripts/attention.py CHANGED Viewed

@@ -11,7 +11,6 @@ from einops import rearrange, repeat
 from typing import Optional, Any
 from util import checkpoint
 try:
     import xformers
     import xformers.ops
@@ -21,11 +20,12 @@ except:
 # CrossAttn precision handling
 import os
 _ATTN_PRECISION = os.environ.get("ATTN_PRECISION", "fp32")
 def uniq(arr):
-    return{el: True for el in arr}.keys()
 def default(val, d):
@@ -47,6 +47,7 @@ def init_(tensor):
 # feedforward
 class GEGLU(nn.Module):
     def __init__(self, dim_in, dim_out):
         super().__init__()
         self.proj = nn.Linear(dim_in, dim_out * 2)
@@ -57,20 +58,14 @@ class GEGLU(nn.Module):
 class FeedForward(nn.Module):
     def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.):
         super().__init__()
         inner_dim = int(dim * mult)
         dim_out = default(dim_out, dim)
-        project_in = nn.Sequential(
-            nn.Linear(dim, inner_dim),
-            nn.GELU()
-        ) if not glu else GEGLU(dim, inner_dim)
-        self.net = nn.Sequential(
-            project_in,
-            nn.Dropout(dropout),
-            nn.Linear(inner_dim, dim_out)
-        )
     def forward(self, x):
         return self.net(x)
@@ -90,31 +85,16 @@ def Normalize(in_channels):
 class SpatialSelfAttention(nn.Module):
     def __init__(self, in_channels):
         super().__init__()
         self.in_channels = in_channels
         self.norm = Normalize(in_channels)
-        self.q = torch.nn.Conv2d(in_channels,
-                                 in_channels,
-                                 kernel_size=1,
-                                 stride=1,
-                                 padding=0)
-        self.k = torch.nn.Conv2d(in_channels,
-                                 in_channels,
-                                 kernel_size=1,
-                                 stride=1,
-                                 padding=0)
-        self.v = torch.nn.Conv2d(in_channels,
-                                 in_channels,
-                                 kernel_size=1,
-                                 stride=1,
-                                 padding=0)
-        self.proj_out = torch.nn.Conv2d(in_channels,
-                                        in_channels,
-                                        kernel_size=1,
-                                        stride=1,
-                                        padding=0)
     def forward(self, x):
         h_ = x
@@ -124,7 +104,7 @@ class SpatialSelfAttention(nn.Module):
         v = self.v(h_)
         # compute attention
-        b,c,h,w = q.shape
         q = rearrange(q, 'b c h w -> b (h w) c')
         k = rearrange(k, 'b c h w -> b c (h w)')
         w_ = torch.einsum('bij,bjk->bik', q, k)
@@ -139,26 +119,24 @@ class SpatialSelfAttention(nn.Module):
         h_ = rearrange(h_, 'b c (h w) -> b c h w', h=h)
         h_ = self.proj_out(h_)
-        return x+h_
 class CrossAttention(nn.Module):
     def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.):
         super().__init__()
         inner_dim = dim_head * heads
         context_dim = default(context_dim, query_dim)
-        self.scale = dim_head ** -0.5
         self.heads = heads
         self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
         self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
         self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
-        self.to_out = nn.Sequential(
-            nn.Linear(inner_dim, query_dim),
-            nn.Dropout(dropout)
-        )
     def forward(self, x, context=None, mask=None):
         h = self.heads
@@ -171,15 +149,15 @@ class CrossAttention(nn.Module):
         q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
         # force cast to fp32 to avoid overflowing
-        if _ATTN_PRECISION =="fp32":
-            with autocast(enabled=False, device_type = 'cuda'):
                 q, k = q.float(), k.float()
                 sim = einsum('b i d, b j d -> b i j', q, k) * self.scale
         else:
             sim = einsum('b i d, b j d -> b i j', q, k) * self.scale
         del q, k
         if mask is not None:
             mask = rearrange(mask, 'b ... -> b (...)')
             max_neg_value = -torch.finfo(sim.dtype).max
@@ -221,11 +199,7 @@ class MemoryEfficientCrossAttention(nn.Module):
         b, _, _ = q.shape
         q, k, v = map(
-            lambda t: t.unsqueeze(3)
-            .reshape(b, t.shape[1], self.heads, self.dim_head)
-            .permute(0, 2, 1, 3)
-            .reshape(b * self.heads, t.shape[1], self.dim_head)
-            .contiguous(),
             (q, k, v),
         )
@@ -234,32 +208,25 @@ class MemoryEfficientCrossAttention(nn.Module):
         if mask is not None:
             raise NotImplementedError
-        out = (
-            out.unsqueeze(0)
-            .reshape(b, self.heads, out.shape[1], self.dim_head)
-            .permute(0, 2, 1, 3)
-            .reshape(b, out.shape[1], self.heads * self.dim_head)
-        )
         return self.to_out(out)
 class BasicTransformerBlock(nn.Module):
     ATTENTION_MODES = {
-        "softmax": CrossAttention,  # vanilla attention
         "softmax-xformers": MemoryEfficientCrossAttention
     }
-    def __init__(self, dim, n_heads, d_head, dropout=0., context_dim=None, gated_ff=True, checkpoint=True,
-                 disable_self_attn=False):
         super().__init__()
         attn_mode = "softmax-xformers" if XFORMERS_IS_AVAILBLE else "softmax"
         assert attn_mode in self.ATTENTION_MODES
         attn_cls = self.ATTENTION_MODES[attn_mode]
         self.disable_self_attn = disable_self_attn
-        self.attn1 = attn_cls(query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout,
-                              context_dim=context_dim if self.disable_self_attn else None)  # is a self-attention if not self.disable_self_attn
         self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
-        self.attn2 = attn_cls(query_dim=dim, context_dim=context_dim,
-                              heads=n_heads, dim_head=d_head, dropout=dropout)  # is self-attn if context is none
         self.norm1 = nn.LayerNorm(dim)
         self.norm2 = nn.LayerNorm(dim)
         self.norm3 = nn.LayerNorm(dim)
@@ -284,10 +251,8 @@ class SpatialTransformer(nn.Module):
     Finally, reshape to image
     NEW: use_linear for more efficiency instead of the 1x1 convs
     """
-    def __init__(self, in_channels, n_heads, d_head,
-                 depth=1, dropout=0., context_dim=None,
-                 disable_self_attn=False, use_linear=False,
-                 use_checkpoint=True):
         super().__init__()
         assert context_dim is not None
         if not isinstance(context_dim, list):
@@ -296,25 +261,13 @@ class SpatialTransformer(nn.Module):
         inner_dim = n_heads * d_head
         self.norm = Normalize(in_channels)
         if not use_linear:
-            self.proj_in = nn.Conv2d(in_channels,
-                                     inner_dim,
-                                     kernel_size=1,
-                                     stride=1,
-                                     padding=0)
         else:
             self.proj_in = nn.Linear(in_channels, inner_dim)
-        self.transformer_blocks = nn.ModuleList(
-            [BasicTransformerBlock(inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim[d],
-                                   disable_self_attn=disable_self_attn, checkpoint=use_checkpoint)
-                for d in range(depth)]
-        )
         if not use_linear:
-            self.proj_out = zero_module(nn.Conv2d(inner_dim,
-                                                  in_channels,
-                                                  kernel_size=1,
-                                                  stride=1,
-                                                  padding=0))
         else:
             self.proj_out = zero_module(nn.Linear(in_channels, inner_dim))
         self.use_linear = use_linear
@@ -356,11 +309,9 @@ class BasicTransformerBlock3D(BasicTransformerBlock):
 class SpatialTransformer3D(nn.Module):
-    ''' 3D self-attention '''
-    def __init__(self, in_channels, n_heads, d_head,
-                 depth=1, dropout=0., context_dim=None,
-                 disable_self_attn=False, use_linear=False,
-                 use_checkpoint=True):
         super().__init__()
         assert context_dim is not None
         if not isinstance(context_dim, list):
@@ -369,25 +320,13 @@ class SpatialTransformer3D(nn.Module):
         inner_dim = n_heads * d_head
         self.norm = Normalize(in_channels)
         if not use_linear:
-            self.proj_in = nn.Conv2d(in_channels,
-                                     inner_dim,
-                                     kernel_size=1,
-                                     stride=1,
-                                     padding=0)
         else:
             self.proj_in = nn.Linear(in_channels, inner_dim)
-        self.transformer_blocks = nn.ModuleList(
-            [BasicTransformerBlock3D(inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim[d],
-                                   disable_self_attn=disable_self_attn, checkpoint=use_checkpoint)
-                for d in range(depth)]
-        )
         if not use_linear:
-            self.proj_out = zero_module(nn.Conv2d(inner_dim,
-                                                  in_channels,
-                                                  kernel_size=1,
-                                                  stride=1,
-                                                  padding=0))
         else:
             self.proj_out = zero_module(nn.Linear(in_channels, inner_dim))
         self.use_linear = use_linear
@@ -411,4 +350,4 @@ class SpatialTransformer3D(nn.Module):
         x = rearrange(x, 'b (h w) c -> b c h w', h=h, w=w).contiguous()
         if not self.use_linear:
             x = self.proj_out(x)
-        return x + x_in

 from typing import Optional, Any
 from util import checkpoint
 try:
     import xformers
     import xformers.ops
 # CrossAttn precision handling
 import os
 _ATTN_PRECISION = os.environ.get("ATTN_PRECISION", "fp32")
 def uniq(arr):
+    return {el: True for el in arr}.keys()
 def default(val, d):
 # feedforward
 class GEGLU(nn.Module):
     def __init__(self, dim_in, dim_out):
         super().__init__()
         self.proj = nn.Linear(dim_in, dim_out * 2)
 class FeedForward(nn.Module):
     def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.):
         super().__init__()
         inner_dim = int(dim * mult)
         dim_out = default(dim_out, dim)
+        project_in = nn.Sequential(nn.Linear(dim, inner_dim), nn.GELU()) if not glu else GEGLU(dim, inner_dim)
+        self.net = nn.Sequential(project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out))
     def forward(self, x):
         return self.net(x)
 class SpatialSelfAttention(nn.Module):
     def __init__(self, in_channels):
         super().__init__()
         self.in_channels = in_channels
         self.norm = Normalize(in_channels)
+        self.q = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.k = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.v = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.proj_out = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
     def forward(self, x):
         h_ = x
         v = self.v(h_)
         # compute attention
+        b, c, h, w = q.shape
         q = rearrange(q, 'b c h w -> b (h w) c')
         k = rearrange(k, 'b c h w -> b c (h w)')
         w_ = torch.einsum('bij,bjk->bik', q, k)
         h_ = rearrange(h_, 'b c (h w) -> b c h w', h=h)
         h_ = self.proj_out(h_)
+        return x + h_
 class CrossAttention(nn.Module):
     def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.):
         super().__init__()
         inner_dim = dim_head * heads
         context_dim = default(context_dim, query_dim)
+        self.scale = dim_head**-0.5
         self.heads = heads
         self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
         self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
         self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
+        self.to_out = nn.Sequential(nn.Linear(inner_dim, query_dim), nn.Dropout(dropout))
     def forward(self, x, context=None, mask=None):
         h = self.heads
         q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
         # force cast to fp32 to avoid overflowing
+        if _ATTN_PRECISION == "fp32":
+            with autocast(enabled=False, device_type='cuda'):
                 q, k = q.float(), k.float()
                 sim = einsum('b i d, b j d -> b i j', q, k) * self.scale
         else:
             sim = einsum('b i d, b j d -> b i j', q, k) * self.scale
         del q, k
         if mask is not None:
             mask = rearrange(mask, 'b ... -> b (...)')
             max_neg_value = -torch.finfo(sim.dtype).max
         b, _, _ = q.shape
         q, k, v = map(
+            lambda t: t.unsqueeze(3).reshape(b, t.shape[1], self.heads, self.dim_head).permute(0, 2, 1, 3).reshape(b * self.heads, t.shape[1], self.dim_head).contiguous(),
             (q, k, v),
         )
         if mask is not None:
             raise NotImplementedError
+        out = (out.unsqueeze(0).reshape(b, self.heads, out.shape[1], self.dim_head).permute(0, 2, 1, 3).reshape(b, out.shape[1], self.heads * self.dim_head))
         return self.to_out(out)
 class BasicTransformerBlock(nn.Module):
     ATTENTION_MODES = {
+        "softmax": CrossAttention, # vanilla attention
         "softmax-xformers": MemoryEfficientCrossAttention
     }
+    def __init__(self, dim, n_heads, d_head, dropout=0., context_dim=None, gated_ff=True, checkpoint=True, disable_self_attn=False):
         super().__init__()
         attn_mode = "softmax-xformers" if XFORMERS_IS_AVAILBLE else "softmax"
         assert attn_mode in self.ATTENTION_MODES
         attn_cls = self.ATTENTION_MODES[attn_mode]
         self.disable_self_attn = disable_self_attn
+        self.attn1 = attn_cls(query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout, context_dim=context_dim if self.disable_self_attn else None) # is a self-attention if not self.disable_self_attn
         self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
+        self.attn2 = attn_cls(query_dim=dim, context_dim=context_dim, heads=n_heads, dim_head=d_head, dropout=dropout) # is self-attn if context is none
         self.norm1 = nn.LayerNorm(dim)
         self.norm2 = nn.LayerNorm(dim)
         self.norm3 = nn.LayerNorm(dim)
     Finally, reshape to image
     NEW: use_linear for more efficiency instead of the 1x1 convs
     """
+    def __init__(self, in_channels, n_heads, d_head, depth=1, dropout=0., context_dim=None, disable_self_attn=False, use_linear=False, use_checkpoint=True):
         super().__init__()
         assert context_dim is not None
         if not isinstance(context_dim, list):
         inner_dim = n_heads * d_head
         self.norm = Normalize(in_channels)
         if not use_linear:
+            self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
         else:
             self.proj_in = nn.Linear(in_channels, inner_dim)
+        self.transformer_blocks = nn.ModuleList([BasicTransformerBlock(inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim[d], disable_self_attn=disable_self_attn, checkpoint=use_checkpoint) for d in range(depth)])
         if not use_linear:
+            self.proj_out = zero_module(nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0))
         else:
             self.proj_out = zero_module(nn.Linear(in_channels, inner_dim))
         self.use_linear = use_linear
 class SpatialTransformer3D(nn.Module):
+    ''' 3D self-attention '''
+    def __init__(self, in_channels, n_heads, d_head, depth=1, dropout=0., context_dim=None, disable_self_attn=False, use_linear=False, use_checkpoint=True):
         super().__init__()
         assert context_dim is not None
         if not isinstance(context_dim, list):
         inner_dim = n_heads * d_head
         self.norm = Normalize(in_channels)
         if not use_linear:
+            self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
         else:
             self.proj_in = nn.Linear(in_channels, inner_dim)
+        self.transformer_blocks = nn.ModuleList([BasicTransformerBlock3D(inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim[d], disable_self_attn=disable_self_attn, checkpoint=use_checkpoint) for d in range(depth)])
         if not use_linear:
+            self.proj_out = zero_module(nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0))
         else:
             self.proj_out = zero_module(nn.Linear(in_channels, inner_dim))
         self.use_linear = use_linear
         x = rearrange(x, 'b (h w) c -> b c h w', h=h, w=w).contiguous()
         if not self.use_linear:
             x = self.proj_out(x)
+        return x + x_in

scripts/convert_mvdream_to_diffusers.py CHANGED Viewed

@@ -3,6 +3,7 @@
 import argparse
 import torch
 import sys
 sys.path.insert(0, '../')
 from transformers import (
@@ -126,9 +127,7 @@ logger = logging.get_logger(__name__)
 #     return config
-def assign_to_checkpoint(
-    paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None
-):
     """
     This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits
     attention layers, and takes into account additional replacements that may arise.
@@ -144,6 +143,7 @@ def assign_to_checkpoint(
             target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)
             num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3
             old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
@@ -211,6 +211,7 @@ def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
     return mapping
 def renew_attention_paths(old_list, n_shave_prefix_segments=0):
     """
     Updates paths inside attentions to the new naming scheme (local renaming)
@@ -231,6 +232,7 @@ def renew_attention_paths(old_list, n_shave_prefix_segments=0):
     return mapping
 # def convert_ldm_unet_checkpoint(
 #     checkpoint, config, path=None, extract_ema=False, controlnet=False, skip_extract_state_dict=False
 # ):
@@ -496,6 +498,7 @@ def create_vae_diffusers_config(original_config, image_size: int):
     }
     return config
 def convert_ldm_vae_checkpoint(checkpoint, config):
     # extract state dict for VAE
     vae_state_dict = {}
@@ -528,26 +531,18 @@ def convert_ldm_vae_checkpoint(checkpoint, config):
     # Retrieves the keys for the encoder down blocks only
     num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
-    down_blocks = {
-        layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
-    }
     # Retrieves the keys for the decoder up blocks only
     num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
-    up_blocks = {
-        layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
-    }
     for i in range(num_down_blocks):
         resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
         if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
-            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
-                f"encoder.down.{i}.downsample.conv.weight"
-            )
-            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
-                f"encoder.down.{i}.downsample.conv.bias"
-            )
         paths = renew_vae_resnet_paths(resnets)
         meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
@@ -570,17 +565,11 @@ def convert_ldm_vae_checkpoint(checkpoint, config):
     for i in range(num_up_blocks):
         block_id = num_up_blocks - 1 - i
-        resnets = [
-            key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
-        ]
         if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
-            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
-                f"decoder.up.{block_id}.upsample.conv.weight"
-            ]
-            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
-                f"decoder.up.{block_id}.upsample.conv.bias"
-            ]
         paths = renew_vae_resnet_paths(resnets)
         meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
@@ -618,6 +607,7 @@ def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
     return mapping
 def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
     """
     Updates paths inside attentions to the new naming scheme (local renaming)
@@ -659,12 +649,8 @@ def conv_attn_to_linear(checkpoint):
             if checkpoint[key].ndim > 2:
                 checkpoint[key] = checkpoint[key][:, :, 0]
-def convert_from_original_mvdream_ckpt(
-    checkpoint_path,
-    original_config_file,
-    extract_ema,
-    device
-):
     checkpoint = torch.load(checkpoint_path, map_location=device)
     # print(f"Checkpoint: {checkpoint.keys()}")
     torch.cuda.empty_cache()
@@ -702,9 +688,7 @@ def convert_from_original_mvdream_ckpt(
     # print(f"Unet Config: {original_config.model.params.unet_config.params}")
     unet: MultiViewUNetWrapperModel = MultiViewUNetWrapperModel(**original_config.model.params.unet_config.params)
     # print(f"Unet State Dict: {unet.state_dict().keys()}")
-    unet.load_state_dict({
-        key.replace("model.diffusion_model.", "unet."): value for key, value in checkpoint.items() if key.replace("model.diffusion_model.", "unet.") in unet.state_dict()
-    })
     for param_name, param in unet.state_dict().items():
         set_module_tensor_to_device(unet, param_name, "cuda:0", value=param)
@@ -712,25 +696,21 @@ def convert_from_original_mvdream_ckpt(
     vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
     converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
-    if (
-        "model" in original_config
-        and "params" in original_config.model
-        and "scale_factor" in original_config.model.params
-    ):
         vae_scaling_factor = original_config.model.params.scale_factor
     else:
-        vae_scaling_factor = 0.18215  # default SD scaling factor
     vae_config["scaling_factor"] = vae_scaling_factor
     with init_empty_weights():
         vae = AutoencoderKL(**vae_config)
     tokenizer: CLIPTokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
     text_encoder: CLIPTextModel = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14").to(device=torch.device("cuda:0")) # type: ignore
     for param_name, param in converted_vae_checkpoint.items():
-        set_module_tensor_to_device(vae, param_name, "cuda:0", value=param)
     pipe = MVDreamStableDiffusionPipeline(
         vae=vae,
@@ -746,30 +726,20 @@ def convert_from_original_mvdream_ckpt(
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert."
-    )
     parser.add_argument(
         "--original_config_file",
         default=None,
         type=str,
         help="The YAML config file corresponding to the original architecture.",
     )
-    parser.add_argument(
-        "--extract_ema",
-        action="store_true",
-        help=(
-            "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights"
-            " or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield"
-            " higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning."
-        ),
-    )
     parser.add_argument(
         "--to_safetensors",
         action="store_true",
         help="Whether to store pipeline in safetensors format or not.",
     )
-    parser.add_argument("--half", action="store_true", help="Save weights in half precision.")
     parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
     parser.add_argument("--device", type=str, help="Device to use (e.g. cpu, cuda:0, cuda:1, etc.)")
     args = parser.parse_args()
@@ -777,22 +747,21 @@ if __name__ == "__main__":
     pipe = convert_from_original_mvdream_ckpt(
         checkpoint_path=args.checkpoint_path,
         original_config_file=args.original_config_file,
-        extract_ema=args.extract_ema,
         device=args.device,
     )
     if args.half:
         pipe.to(torch_dtype=torch.float16)
-    images = pipe(
-        prompt="Head of Hatsune Miku",
-        negative_prompt="painting, bad quality, flat",
-        output_type="pil",
-        return_dict=False,
-        guidance_scale=7.5,
-        num_inference_steps=50,
-    )
-    for i, image in enumerate(images):
-        image.save(f"image_{i}.png")
-    pipe.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors)

 import argparse
 import torch
 import sys
 sys.path.insert(0, '../')
 from transformers import (
 #     return config
+def assign_to_checkpoint(paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None):
     """
     This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits
     attention layers, and takes into account additional replacements that may arise.
             target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)
+            assert config is not None
             num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3
             old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
     return mapping
 def renew_attention_paths(old_list, n_shave_prefix_segments=0):
     """
     Updates paths inside attentions to the new naming scheme (local renaming)
     return mapping
 # def convert_ldm_unet_checkpoint(
 #     checkpoint, config, path=None, extract_ema=False, controlnet=False, skip_extract_state_dict=False
 # ):
     }
     return config
 def convert_ldm_vae_checkpoint(checkpoint, config):
     # extract state dict for VAE
     vae_state_dict = {}
     # Retrieves the keys for the encoder down blocks only
     num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
+    down_blocks = {layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)}
     # Retrieves the keys for the decoder up blocks only
     num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
+    up_blocks = {layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)}
     for i in range(num_down_blocks):
         resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
         if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(f"encoder.down.{i}.downsample.conv.weight")
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(f"encoder.down.{i}.downsample.conv.bias")
         paths = renew_vae_resnet_paths(resnets)
         meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
     for i in range(num_up_blocks):
         block_id = num_up_blocks - 1 - i
+        resnets = [key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key]
         if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[f"decoder.up.{block_id}.upsample.conv.weight"]
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[f"decoder.up.{block_id}.upsample.conv.bias"]
         paths = renew_vae_resnet_paths(resnets)
         meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
     return mapping
 def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
     """
     Updates paths inside attentions to the new naming scheme (local renaming)
             if checkpoint[key].ndim > 2:
                 checkpoint[key] = checkpoint[key][:, :, 0]
+def convert_from_original_mvdream_ckpt(checkpoint_path, original_config_file, device):
     checkpoint = torch.load(checkpoint_path, map_location=device)
     # print(f"Checkpoint: {checkpoint.keys()}")
     torch.cuda.empty_cache()
     # print(f"Unet Config: {original_config.model.params.unet_config.params}")
     unet: MultiViewUNetWrapperModel = MultiViewUNetWrapperModel(**original_config.model.params.unet_config.params)
     # print(f"Unet State Dict: {unet.state_dict().keys()}")
+    unet.load_state_dict({key.replace("model.diffusion_model.", "unet."): value for key, value in checkpoint.items() if key.replace("model.diffusion_model.", "unet.") in unet.state_dict()})
     for param_name, param in unet.state_dict().items():
         set_module_tensor_to_device(unet, param_name, "cuda:0", value=param)
     vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
     converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
+    if ("model" in original_config and "params" in original_config.model and "scale_factor" in original_config.model.params):
         vae_scaling_factor = original_config.model.params.scale_factor
     else:
+        vae_scaling_factor = 0.18215 # default SD scaling factor
     vae_config["scaling_factor"] = vae_scaling_factor
     with init_empty_weights():
         vae = AutoencoderKL(**vae_config)
     tokenizer: CLIPTokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
     text_encoder: CLIPTextModel = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14").to(device=torch.device("cuda:0")) # type: ignore
     for param_name, param in converted_vae_checkpoint.items():
+        set_module_tensor_to_device(vae, param_name, "cuda:0", value=param)
     pipe = MVDreamStableDiffusionPipeline(
         vae=vae,
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
+    parser.add_argument("--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert.")
     parser.add_argument(
         "--original_config_file",
         default=None,
         type=str,
         help="The YAML config file corresponding to the original architecture.",
     )
     parser.add_argument(
         "--to_safetensors",
         action="store_true",
         help="Whether to store pipeline in safetensors format or not.",
     )
+    parser.add_argument("--half", action="store_true", help="Save weights in half precision.")
+    parser.add_argument("--test", help="Whether to test inference after convertion.")
     parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
     parser.add_argument("--device", type=str, help="Device to use (e.g. cpu, cuda:0, cuda:1, etc.)")
     args = parser.parse_args()
     pipe = convert_from_original_mvdream_ckpt(
         checkpoint_path=args.checkpoint_path,
         original_config_file=args.original_config_file,
         device=args.device,
     )
     if args.half:
         pipe.to(torch_dtype=torch.float16)
+    if args.test:
+        images = pipe(
+            prompt="Head of Hatsune Miku",
+            negative_prompt="painting, bad quality, flat",
+            output_type="pil",
+            guidance_scale=7.5,
+            num_inference_steps=50,
+        )
+        for i, image in enumerate(images):
+            image.save(f"image_{i}.png") # type: ignore
+    pipe.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors)

scripts/models.py CHANGED Viewed

@@ -82,29 +82,19 @@ class Upsample(nn.Module):
                  upsampling occurs in the inner-two dimensions.
     """
-    def __init__(self,
-                 channels,
-                 use_conv,
-                 dims=2,
-                 out_channels=None,
-                 padding=1):
         super().__init__()
         self.channels = channels
         self.out_channels = out_channels or channels
         self.use_conv = use_conv
         self.dims = dims
         if use_conv:
-            self.conv = conv_nd(dims,
-                                self.channels,
-                                self.out_channels,
-                                3,
-                                padding=padding)
     def forward(self, x):
         assert x.shape[1] == self.channels
         if self.dims == 3:
-            x = F.interpolate(x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2),
-                              mode="nearest")
         else:
             x = F.interpolate(x, scale_factor=2, mode="nearest")
         if self.use_conv:
@@ -121,12 +111,7 @@ class Downsample(nn.Module):
                  downsampling occurs in the inner-two dimensions.
     """
-    def __init__(self,
-                 channels,
-                 use_conv,
-                 dims=2,
-                 out_channels=None,
-                 padding=1):
         super().__init__()
         self.channels = channels
         self.out_channels = out_channels or channels
@@ -134,12 +119,7 @@ class Downsample(nn.Module):
         self.dims = dims
         stride = 2 if dims != 3 else (1, 2, 2)
         if use_conv:
-            self.op = conv_nd(dims,
-                              self.channels,
-                              self.out_channels,
-                              3,
-                              stride=stride,
-                              padding=padding)
         else:
             assert self.channels == self.out_channels
             self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride)
@@ -208,33 +188,22 @@ class ResBlock(TimestepBlock):
             nn.SiLU(),
             linear(
                 emb_channels,
-                2 * self.out_channels
-                if use_scale_shift_norm else self.out_channels,
             ),
         )
         self.out_layers = nn.Sequential(
             normalization(self.out_channels),
             nn.SiLU(),
             nn.Dropout(p=dropout),
-            zero_module(
-                conv_nd(dims,
-                        self.out_channels,
-                        self.out_channels,
-                        3,
-                        padding=1)),
         )
         if self.out_channels == channels:
             self.skip_connection = nn.Identity()
         elif use_conv:
-            self.skip_connection = conv_nd(dims,
-                                           channels,
-                                           self.out_channels,
-                                           3,
-                                           padding=1)
         else:
-            self.skip_connection = conv_nd(dims, channels, self.out_channels,
-                                           1)
     def forward(self, x, emb):
         """
@@ -243,8 +212,7 @@ class ResBlock(TimestepBlock):
         :param emb: an [N x emb_channels] Tensor of timestep embeddings.
         :return: an [N x C x ...] Tensor of outputs.
         """
-        return checkpoint(self._forward, (x, emb), self.parameters(),
-                          self.use_checkpoint)
     def _forward(self, x, emb):
         if self.updown:
@@ -289,9 +257,7 @@ class AttentionBlock(nn.Module):
         if num_head_channels == -1:
             self.num_heads = num_heads
         else:
-            assert (
-                channels % num_head_channels == 0
-            ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
             self.num_heads = channels // num_head_channels
         self.use_checkpoint = use_checkpoint
         self.norm = normalization(channels)
@@ -306,9 +272,7 @@ class AttentionBlock(nn.Module):
         self.proj_out = zero_module(conv_nd(1, channels, channels, 1))
     def forward(self, x):
-        return checkpoint(
-            self._forward, (x, ), self.parameters(), True
-        )  # TODO: check checkpoint usage, is True # TODO: fix the .half call!!!
         #return pt_checkpoint(self._forward, x)  # pytorch
     def _forward(self, x):
@@ -358,12 +322,9 @@ class QKVAttentionLegacy(nn.Module):
         bs, width, length = qkv.shape
         assert width % (3 * self.n_heads) == 0
         ch = width // (3 * self.n_heads)
-        q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch,
-                                                                       dim=1)
         scale = 1 / math.sqrt(math.sqrt(ch))
-        weight = th.einsum(
-            "bct,bcs->bts", q * scale,
-            k * scale)  # More stable with f16 than dividing afterwards
         weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
         a = th.einsum("bts,bcs->bct", weight, v)
         return a.reshape(bs, -1, length)
@@ -397,10 +358,9 @@ class QKVAttention(nn.Module):
             "bct,bcs->bts",
             (q * scale).view(bs * self.n_heads, ch, length),
             (k * scale).view(bs * self.n_heads, ch, length),
-        )  # More stable with f16 than dividing afterwards
         weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
-        a = th.einsum("bts,bcs->bct", weight,
-                      v.reshape(bs * self.n_heads, ch, length))
         return a.reshape(bs, -1, length)
     @staticmethod
@@ -450,41 +410,40 @@ class MultiViewUNetModel(nn.Module):
     """
     def __init__(
-        self,
-        image_size,
-        in_channels,
-        model_channels,
-        out_channels,
-        num_res_blocks,
-        attention_resolutions,
-        dropout=0,
-        channel_mult=(1, 2, 4, 8),
-        conv_resample=True,
-        dims=2,
-        num_classes=None,
-        use_checkpoint=False,
-        use_fp16=False,
-        use_bf16=False,
-        num_heads=-1,
-        num_head_channels=-1,
-        num_heads_upsample=-1,
-        use_scale_shift_norm=False,
-        resblock_updown=False,
-        use_new_attention_order=False,
-        use_spatial_transformer=False,  # custom transformer support
-        transformer_depth=1,  # custom transformer support
-        context_dim=None,  # custom transformer support
-        n_embed=None,  # custom support for prediction of discrete ids into codebook of first stage vq model
-        legacy=True,
-        disable_self_attentions=None,
-        num_attention_blocks=None,
-        disable_middle_self_attn=False,
-        use_linear_in_transformer=False,
-        adm_in_channels=None,
-        camera_dim=None,
     ):
         super().__init__()
-        assert num_classes is not None
         if use_spatial_transformer:
             assert context_dim is not None, 'Fool!! You forgot to include the dimension of your cross-attention conditioning...'
@@ -511,26 +470,19 @@ class MultiViewUNetModel(nn.Module):
             self.num_res_blocks = len(channel_mult) * [num_res_blocks]
         else:
             if len(num_res_blocks) != len(channel_mult):
-                raise ValueError(
-                    "provide num_res_blocks either as an int (globally constant) or "
-                    "as a list/tuple (per-level) with the same length as channel_mult"
-                )
             self.num_res_blocks = num_res_blocks
         if disable_self_attentions is not None:
             # should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not
             assert len(disable_self_attentions) == len(channel_mult)
         if num_attention_blocks is not None:
             assert len(num_attention_blocks) == len(self.num_res_blocks)
-            assert all(
-                map(
-                    lambda i: self.num_res_blocks[i] >= num_attention_blocks[i
-                                                                             ],
-                    range(len(num_attention_blocks))))
-            print(
-                f"Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. "
-                f"This option has LESS priority than attention_resolutions {attention_resolutions}, "
-                f"i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, "
-                f"attention will still not be set.")
         self.attention_resolutions = attention_resolutions
         self.dropout = dropout
@@ -562,42 +514,36 @@ class MultiViewUNetModel(nn.Module):
         if self.num_classes is not None:
             if isinstance(self.num_classes, int):
-                self.label_emb = nn.Embedding(num_classes, time_embed_dim)
             elif self.num_classes == "continuous":
                 print("setting up linear c_adm embedding layer")
                 self.label_emb = nn.Linear(1, time_embed_dim)
             elif self.num_classes == "sequential":
                 assert adm_in_channels is not None
-                self.label_emb = nn.Sequential(
-                    nn.Sequential(
-                        linear(adm_in_channels, time_embed_dim),
-                        nn.SiLU(),
-                        linear(time_embed_dim, time_embed_dim),
-                    ))
             else:
                 raise ValueError()
-        self.input_blocks = nn.ModuleList([
-            TimestepEmbedSequential(
-                conv_nd(dims, in_channels, model_channels, 3, padding=1))
-        ])
         self._feature_size = model_channels
         input_block_chans = [model_channels]
         ch = model_channels
         ds = 1
         for level, mult in enumerate(channel_mult):
             for nr in range(self.num_res_blocks[level]):
-                layers: List[Any] = [
-                    ResBlock(
-                        ch,
-                        time_embed_dim,
-                        dropout,
-                        out_channels=mult * model_channels,
-                        dims=dims,
-                        use_checkpoint=use_checkpoint,
-                        use_scale_shift_norm=use_scale_shift_norm,
-                    )
-                ]
                 ch = mult * model_channels
                 if ds in attention_resolutions:
                     if num_head_channels == -1:
@@ -613,44 +559,29 @@ class MultiViewUNetModel(nn.Module):
                     else:
                         disabled_sa = False
-                    if num_attention_blocks is None or nr < num_attention_blocks[
-                            level]:
-                        layers.append(
-                            AttentionBlock(
-                                ch,
-                                use_checkpoint=use_checkpoint,
-                                num_heads=num_heads,
-                                num_head_channels=dim_head,
-                                use_new_attention_order=use_new_attention_order,
-                            ) if not use_spatial_transformer else
-                            SpatialTransformer3D(
-                                ch,
-                                num_heads,
-                                dim_head,
-                                depth=transformer_depth,
-                                context_dim=context_dim,
-                                disable_self_attn=disabled_sa,
-                                use_linear=use_linear_in_transformer,
-                                use_checkpoint=use_checkpoint))
                 self.input_blocks.append(TimestepEmbedSequential(*layers))
                 self._feature_size += ch
                 input_block_chans.append(ch)
             if level != len(channel_mult) - 1:
                 out_ch = ch
-                self.input_blocks.append(
-                    TimestepEmbedSequential(
-                        ResBlock(
-                            ch,
-                            time_embed_dim,
-                            dropout,
-                            out_channels=out_ch,
-                            dims=dims,
-                            use_checkpoint=use_checkpoint,
-                            use_scale_shift_norm=use_scale_shift_norm,
-                            down=True,
-                        ) if resblock_updown else Downsample(
-                            ch, conv_resample, dims=dims, out_channels=out_ch))
-                )
                 ch = out_ch
                 input_block_chans.append(ch)
                 ds *= 2
@@ -679,16 +610,8 @@ class MultiViewUNetModel(nn.Module):
                 num_heads=num_heads,
                 num_head_channels=dim_head,
                 use_new_attention_order=use_new_attention_order,
-            ) if not use_spatial_transformer else
-            SpatialTransformer3D(  # always uses a self-attn
-                ch,
-                num_heads,
-                dim_head,
-                depth=transformer_depth,
-                context_dim=context_dim,
-                disable_self_attn=disable_middle_self_attn,
-                use_linear=use_linear_in_transformer,
-                use_checkpoint=use_checkpoint),
             ResBlock(
                 ch,
                 time_embed_dim,
@@ -704,17 +627,15 @@ class MultiViewUNetModel(nn.Module):
         for level, mult in list(enumerate(channel_mult))[::-1]:
             for i in range(self.num_res_blocks[level] + 1):
                 ich = input_block_chans.pop()
-                layers = [
-                    ResBlock(
-                        ch + ich,
-                        time_embed_dim,
-                        dropout,
-                        out_channels=model_channels * mult,
-                        dims=dims,
-                        use_checkpoint=use_checkpoint,
-                        use_scale_shift_norm=use_scale_shift_norm,
-                    )
-                ]
                 ch = model_channels * mult
                 if ds in attention_resolutions:
                     if num_head_channels == -1:
@@ -730,39 +651,26 @@ class MultiViewUNetModel(nn.Module):
                     else:
                         disabled_sa = False
-                    if num_attention_blocks is None or i < num_attention_blocks[
-                            level]:
-                        layers.append(
-                            AttentionBlock(
-                                ch,
-                                use_checkpoint=use_checkpoint,
-                                num_heads=num_heads_upsample,
-                                num_head_channels=dim_head,
-                                use_new_attention_order=use_new_attention_order,
-                            ) if not use_spatial_transformer else
-                            SpatialTransformer3D(
-                                ch,
-                                num_heads,
-                                dim_head,
-                                depth=transformer_depth,
-                                context_dim=context_dim,
-                                disable_self_attn=disabled_sa,
-                                use_linear=use_linear_in_transformer,
-                                use_checkpoint=use_checkpoint))
-                if level and i == self.num_res_blocks[level]:
-                    out_ch = ch
-                    layers.append(
-                        ResBlock(
                             ch,
-                            time_embed_dim,
-                            dropout,
-                            out_channels=out_ch,
-                            dims=dims,
                             use_checkpoint=use_checkpoint,
-                            use_scale_shift_norm=use_scale_shift_norm,
-                            up=True,
-                        ) if resblock_updown else Upsample(
-                            ch, conv_resample, dims=dims, out_channels=out_ch))
                     ds //= 2
                 self.output_blocks.append(TimestepEmbedSequential(*layers))
                 self._feature_size += ch
@@ -770,8 +678,7 @@ class MultiViewUNetModel(nn.Module):
         self.out = nn.Sequential(
             normalization(ch),
             nn.SiLU(),
-            zero_module(
-                conv_nd(dims, model_channels, out_channels, 3, padding=1)),
         )
         if self.predict_codebook_ids:
             self.id_predictor = nn.Sequential(
@@ -796,14 +703,7 @@ class MultiViewUNetModel(nn.Module):
         self.middle_block.apply(convert_module_to_f32)
         self.output_blocks.apply(convert_module_to_f32)
-    def forward(self,
-                x,
-                timesteps=None,
-                context=None,
-                y: Optional[Tensor] = None,
-                camera=None,
-                num_frames=1,
-                **kwargs):
         """
         Apply the model to an input batch.
         :param x: an [(N x F) x C x ...] Tensor of inputs. F is the number of frames (views).
@@ -813,15 +713,10 @@ class MultiViewUNetModel(nn.Module):
         :param num_frames: a integer indicating number of frames for tensor reshaping.
         :return: an [(N x F) x C x ...] Tensor of outputs. F is the number of frames (views).
         """
-        assert x.shape[
-            0] % num_frames == 0, "[UNet] input batch size must be dividable by num_frames!"
-        assert (y is not None) == (
-            self.num_classes is not None
-        ), "must specify y if and only if the model is class-conditional"
         hs = []
-        t_emb = timestep_embedding(timesteps,
-                                   self.model_channels,
-                                   repeat_only=False)
         emb = self.time_embed(t_emb)
         if self.num_classes is not None:

                  upsampling occurs in the inner-two dimensions.
     """
+    def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1):
         super().__init__()
         self.channels = channels
         self.out_channels = out_channels or channels
         self.use_conv = use_conv
         self.dims = dims
         if use_conv:
+            self.conv = conv_nd(dims, self.channels, self.out_channels, 3, padding=padding)
     def forward(self, x):
         assert x.shape[1] == self.channels
         if self.dims == 3:
+            x = F.interpolate(x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode="nearest")
         else:
             x = F.interpolate(x, scale_factor=2, mode="nearest")
         if self.use_conv:
                  downsampling occurs in the inner-two dimensions.
     """
+    def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1):
         super().__init__()
         self.channels = channels
         self.out_channels = out_channels or channels
         self.dims = dims
         stride = 2 if dims != 3 else (1, 2, 2)
         if use_conv:
+            self.op = conv_nd(dims, self.channels, self.out_channels, 3, stride=stride, padding=padding)
         else:
             assert self.channels == self.out_channels
             self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride)
             nn.SiLU(),
             linear(
                 emb_channels,
+                2 * self.out_channels if use_scale_shift_norm else self.out_channels,
             ),
         )
         self.out_layers = nn.Sequential(
             normalization(self.out_channels),
             nn.SiLU(),
             nn.Dropout(p=dropout),
+            zero_module(conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1)),
         )
         if self.out_channels == channels:
             self.skip_connection = nn.Identity()
         elif use_conv:
+            self.skip_connection = conv_nd(dims, channels, self.out_channels, 3, padding=1)
         else:
+            self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)
     def forward(self, x, emb):
         """
         :param emb: an [N x emb_channels] Tensor of timestep embeddings.
         :return: an [N x C x ...] Tensor of outputs.
         """
+        return checkpoint(self._forward, (x, emb), self.parameters(), self.use_checkpoint)
     def _forward(self, x, emb):
         if self.updown:
         if num_head_channels == -1:
             self.num_heads = num_heads
         else:
+            assert (channels % num_head_channels == 0), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
             self.num_heads = channels // num_head_channels
         self.use_checkpoint = use_checkpoint
         self.norm = normalization(channels)
         self.proj_out = zero_module(conv_nd(1, channels, channels, 1))
     def forward(self, x):
+        return checkpoint(self._forward, (x,), self.parameters(), True) # TODO: check checkpoint usage, is True # TODO: fix the .half call!!!
         #return pt_checkpoint(self._forward, x)  # pytorch
     def _forward(self, x):
         bs, width, length = qkv.shape
         assert width % (3 * self.n_heads) == 0
         ch = width // (3 * self.n_heads)
+        q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1)
         scale = 1 / math.sqrt(math.sqrt(ch))
+        weight = th.einsum("bct,bcs->bts", q * scale, k * scale) # More stable with f16 than dividing afterwards
         weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
         a = th.einsum("bts,bcs->bct", weight, v)
         return a.reshape(bs, -1, length)
             "bct,bcs->bts",
             (q * scale).view(bs * self.n_heads, ch, length),
             (k * scale).view(bs * self.n_heads, ch, length),
+        ) # More stable with f16 than dividing afterwards
         weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
+        a = th.einsum("bts,bcs->bct", weight, v.reshape(bs * self.n_heads, ch, length))
         return a.reshape(bs, -1, length)
     @staticmethod
     """
     def __init__(
+            self,
+            image_size,
+            in_channels,
+            model_channels,
+            out_channels,
+            num_res_blocks,
+            attention_resolutions,
+            dropout=0,
+            channel_mult=(1, 2, 4, 8),
+            conv_resample=True,
+            dims=2,
+            num_classes=None,
+            use_checkpoint=False,
+            use_fp16=False,
+            use_bf16=False,
+            num_heads=-1,
+            num_head_channels=-1,
+            num_heads_upsample=-1,
+            use_scale_shift_norm=False,
+            resblock_updown=False,
+            use_new_attention_order=False,
+            use_spatial_transformer=False, # custom transformer support
+            transformer_depth=1, # custom transformer support
+            context_dim=None, # custom transformer support
+            n_embed=None, # custom support for prediction of discrete ids into codebook of first stage vq model
+            legacy=True,
+            disable_self_attentions=None,
+            num_attention_blocks=None,
+            disable_middle_self_attn=False,
+            use_linear_in_transformer=False,
+            adm_in_channels=None,
+            camera_dim=None,
     ):
         super().__init__()
         if use_spatial_transformer:
             assert context_dim is not None, 'Fool!! You forgot to include the dimension of your cross-attention conditioning...'
             self.num_res_blocks = len(channel_mult) * [num_res_blocks]
         else:
             if len(num_res_blocks) != len(channel_mult):
+                raise ValueError("provide num_res_blocks either as an int (globally constant) or "
+                                 "as a list/tuple (per-level) with the same length as channel_mult")
             self.num_res_blocks = num_res_blocks
         if disable_self_attentions is not None:
             # should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not
             assert len(disable_self_attentions) == len(channel_mult)
         if num_attention_blocks is not None:
             assert len(num_attention_blocks) == len(self.num_res_blocks)
+            assert all(map(lambda i: self.num_res_blocks[i] >= num_attention_blocks[i], range(len(num_attention_blocks))))
+            print(f"Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. "
+                  f"This option has LESS priority than attention_resolutions {attention_resolutions}, "
+                  f"i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, "
+                  f"attention will still not be set.")
         self.attention_resolutions = attention_resolutions
         self.dropout = dropout
         if self.num_classes is not None:
             if isinstance(self.num_classes, int):
+                self.label_emb = nn.Embedding(self.num_classes, time_embed_dim)
             elif self.num_classes == "continuous":
                 print("setting up linear c_adm embedding layer")
                 self.label_emb = nn.Linear(1, time_embed_dim)
             elif self.num_classes == "sequential":
                 assert adm_in_channels is not None
+                self.label_emb = nn.Sequential(nn.Sequential(
+                    linear(adm_in_channels, time_embed_dim),
+                    nn.SiLU(),
+                    linear(time_embed_dim, time_embed_dim),
+                ))
             else:
                 raise ValueError()
+        self.input_blocks = nn.ModuleList([TimestepEmbedSequential(conv_nd(dims, in_channels, model_channels, 3, padding=1))])
         self._feature_size = model_channels
         input_block_chans = [model_channels]
         ch = model_channels
         ds = 1
         for level, mult in enumerate(channel_mult):
             for nr in range(self.num_res_blocks[level]):
+                layers: List[Any] = [ResBlock(
+                    ch,
+                    time_embed_dim,
+                    dropout,
+                    out_channels=mult * model_channels,
+                    dims=dims,
+                    use_checkpoint=use_checkpoint,
+                    use_scale_shift_norm=use_scale_shift_norm,
+                )]
                 ch = mult * model_channels
                 if ds in attention_resolutions:
                     if num_head_channels == -1:
                     else:
                         disabled_sa = False
+                    if num_attention_blocks is None or nr < num_attention_blocks[level]:
+                        layers.append(AttentionBlock(
+                            ch,
+                            use_checkpoint=use_checkpoint,
+                            num_heads=num_heads,
+                            num_head_channels=dim_head,
+                            use_new_attention_order=use_new_attention_order,
+                        ) if not use_spatial_transformer else SpatialTransformer3D(ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim, disable_self_attn=disabled_sa, use_linear=use_linear_in_transformer, use_checkpoint=use_checkpoint))
                 self.input_blocks.append(TimestepEmbedSequential(*layers))
                 self._feature_size += ch
                 input_block_chans.append(ch)
             if level != len(channel_mult) - 1:
                 out_ch = ch
+                self.input_blocks.append(TimestepEmbedSequential(ResBlock(
+                    ch,
+                    time_embed_dim,
+                    dropout,
+                    out_channels=out_ch,
+                    dims=dims,
+                    use_checkpoint=use_checkpoint,
+                    use_scale_shift_norm=use_scale_shift_norm,
+                    down=True,
+                ) if resblock_updown else Downsample(ch, conv_resample, dims=dims, out_channels=out_ch)))
                 ch = out_ch
                 input_block_chans.append(ch)
                 ds *= 2
                 num_heads=num_heads,
                 num_head_channels=dim_head,
                 use_new_attention_order=use_new_attention_order,
+            ) if not use_spatial_transformer else SpatialTransformer3D( # always uses a self-attn
+                ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim, disable_self_attn=disable_middle_self_attn, use_linear=use_linear_in_transformer, use_checkpoint=use_checkpoint),
             ResBlock(
                 ch,
                 time_embed_dim,
         for level, mult in list(enumerate(channel_mult))[::-1]:
             for i in range(self.num_res_blocks[level] + 1):
                 ich = input_block_chans.pop()
+                layers = [ResBlock(
+                    ch + ich,
+                    time_embed_dim,
+                    dropout,
+                    out_channels=model_channels * mult,
+                    dims=dims,
+                    use_checkpoint=use_checkpoint,
+                    use_scale_shift_norm=use_scale_shift_norm,
+                )]
                 ch = model_channels * mult
                 if ds in attention_resolutions:
                     if num_head_channels == -1:
                     else:
                         disabled_sa = False
+                    if num_attention_blocks is None or i < num_attention_blocks[level]:
+                        layers.append(AttentionBlock(
                             ch,
                             use_checkpoint=use_checkpoint,
+                            num_heads=num_heads_upsample,
+                            num_head_channels=dim_head,
+                            use_new_attention_order=use_new_attention_order,
+                        ) if not use_spatial_transformer else SpatialTransformer3D(ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim, disable_self_attn=disabled_sa, use_linear=use_linear_in_transformer, use_checkpoint=use_checkpoint))
+                if level and i == self.num_res_blocks[level]:
+                    out_ch = ch
+                    layers.append(ResBlock(
+                        ch,
+                        time_embed_dim,
+                        dropout,
+                        out_channels=out_ch,
+                        dims=dims,
+                        use_checkpoint=use_checkpoint,
+                        use_scale_shift_norm=use_scale_shift_norm,
+                        up=True,
+                    ) if resblock_updown else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch))
                     ds //= 2
                 self.output_blocks.append(TimestepEmbedSequential(*layers))
                 self._feature_size += ch
         self.out = nn.Sequential(
             normalization(ch),
             nn.SiLU(),
+            zero_module(conv_nd(dims, model_channels, out_channels, 3, padding=1)),
         )
         if self.predict_codebook_ids:
             self.id_predictor = nn.Sequential(
         self.middle_block.apply(convert_module_to_f32)
         self.output_blocks.apply(convert_module_to_f32)
+    def forward(self, x, timesteps=None, context=None, y: Optional[Tensor] = None, camera=None, num_frames=1, **kwargs):
         """
         Apply the model to an input batch.
         :param x: an [(N x F) x C x ...] Tensor of inputs. F is the number of frames (views).
         :param num_frames: a integer indicating number of frames for tensor reshaping.
         :return: an [(N x F) x C x ...] Tensor of outputs. F is the number of frames (views).
         """
+        assert x.shape[0] % num_frames == 0, "[UNet] input batch size must be dividable by num_frames!"
+        assert (y is not None) == (self.num_classes is not None), "must specify y if and only if the model is class-conditional"
         hs = []
+        t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
         emb = self.time_embed(t_emb)
         if self.num_classes is not None:

scripts/pipeline_mvdream.py CHANGED Viewed

@@ -1,9 +1,8 @@
 import inspect
 from typing import Any, Callable, Dict, List, Optional, Union
-import torch
 from transformers import CLIPTextModel, CLIPTokenizer
 from diffusers import AutoencoderKL, DiffusionPipeline
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
 from diffusers.utils import (
@@ -13,22 +12,16 @@ from diffusers.utils import (
     logging,
     replace_example_docstring,
 )
-try:
-    from diffusers import randn_tensor  # old import
-except ImportError:
-    from diffusers.utils.torch_utils import randn_tensor  # new import
 from diffusers.configuration_utils import FrozenDict
-import numpy as np
 from diffusers.schedulers import DDIMScheduler
-from models import MultiViewUNetModel, MultiViewUNetWrapperModel
-EXAMPLE_DOC_STRING = ""
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-import numpy as np
 def create_camera_to_world_matrix(elevation, azimuth):
     elevation = np.radians(elevation)
@@ -59,29 +52,21 @@ def create_camera_to_world_matrix(elevation, azimuth):
 def convert_opengl_to_blender(camera_matrix):
     if isinstance(camera_matrix, np.ndarray):
         # Construct transformation matrix to convert from OpenGL space to Blender space
-        flip_yz = np.array([[1, 0, 0, 0], [0, 0, -1, 0], [0, 1, 0, 0],
-                            [0, 0, 0, 1]])
         camera_matrix_blender = np.dot(flip_yz, camera_matrix)
     else:
         # Construct transformation matrix to convert from OpenGL space to Blender space
-        flip_yz = torch.tensor([[1, 0, 0, 0], [0, 0, -1, 0], [0, 1, 0, 0],
-                                [0, 0, 0, 1]])
         if camera_matrix.ndim == 3:
             flip_yz = flip_yz.unsqueeze(0)
-        camera_matrix_blender = torch.matmul(flip_yz.to(camera_matrix),
-                                             camera_matrix)
     return camera_matrix_blender
-def get_camera(num_frames,
-               elevation=15,
-               azimuth_start=0,
-               azimuth_span=360,
-               blender_coord=True):
     angle_gap = azimuth_span / num_frames
     cameras = []
-    for azimuth in np.arange(azimuth_start, azimuth_span + azimuth_start,
-                             angle_gap):
         camera_matrix = create_camera_to_world_matrix(elevation, azimuth)
         if blender_coord:
             camera_matrix = convert_opengl_to_blender(camera_matrix)
@@ -101,36 +86,25 @@ class MVDreamStableDiffusionPipeline(DiffusionPipeline):
     ):
         super().__init__()
-        if hasattr(scheduler.config,
-                   "steps_offset") and scheduler.config.steps_offset != 1:
-            deprecation_message = (
-                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
-                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
-                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
-                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
-                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
-                " file")
-            deprecate("steps_offset!=1",
-                      "1.0.0",
-                      deprecation_message,
-                      standard_warn=False)
             new_config = dict(scheduler.config)
             new_config["steps_offset"] = 1
             scheduler._internal_dict = FrozenDict(new_config)
-        if hasattr(scheduler.config,
-                   "clip_sample") and scheduler.config.clip_sample is True:
-            deprecation_message = (
-                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
-                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
-                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
-                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
-                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
-            )
-            deprecate("clip_sample not set",
-                      "1.0.0",
-                      deprecation_message,
-                      standard_warn=False)
             new_config = dict(scheduler.config)
             new_config["clip_sample"] = False
             scheduler._internal_dict = FrozenDict(new_config)
@@ -142,8 +116,7 @@ class MVDreamStableDiffusionPipeline(DiffusionPipeline):
             tokenizer=tokenizer,
             text_encoder=text_encoder,
         )
-        self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) -
-                                    1)
         self.register_to_config(requires_safety_checker=False)
     def enable_vae_slicing(self):
@@ -189,16 +162,13 @@ class MVDreamStableDiffusionPipeline(DiffusionPipeline):
         if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"):
             from accelerate import cpu_offload
         else:
-            raise ImportError(
-                "`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher"
-            )
         device = torch.device(f"cuda:{gpu_id}")
         if self.device.type != "cpu":
             self.to("cpu", silence_dtype_warnings=True)
-            torch.cuda.empty_cache(
-            )  # otherwise we don't see the memory savings (but they probably exist)
         for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
             cpu_offload(cpu_offloaded_model, device)
@@ -210,26 +180,20 @@ class MVDreamStableDiffusionPipeline(DiffusionPipeline):
         method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
         `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
         """
-        if is_accelerate_available() and is_accelerate_version(
-                ">=", "0.17.0.dev0"):
             from accelerate import cpu_offload_with_hook
         else:
-            raise ImportError(
-                "`enable_model_offload` requires `accelerate v0.17.0` or higher."
-            )
         device = torch.device(f"cuda:{gpu_id}")
         if self.device.type != "cpu":
             self.to("cpu", silence_dtype_warnings=True)
-            torch.cuda.empty_cache(
-            )  # otherwise we don't see the memory savings (but they probably exist)
         hook = None
         for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
-            _, hook = cpu_offload_with_hook(cpu_offloaded_model,
-                                            device,
-                                            prev_module_hook=hook)
         # We'll offload the last model manually.
         self.final_offload_hook = hook
@@ -244,9 +208,7 @@ class MVDreamStableDiffusionPipeline(DiffusionPipeline):
         if not hasattr(self.unet, "_hf_hook"):
             return self.device
         for module in self.unet.modules():
-            if (hasattr(module, "_hf_hook")
-                    and hasattr(module._hf_hook, "execution_device")
-                    and module._hf_hook.execution_device is not None):
                 return torch.device(module._hf_hook.execution_device)
         return self.device
@@ -257,8 +219,6 @@ class MVDreamStableDiffusionPipeline(DiffusionPipeline):
         num_images_per_prompt,
         do_classifier_free_guidance: bool,
         negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
     ):
         r"""
         Encodes the prompt into text encoder hidden states.
@@ -289,67 +249,55 @@ class MVDreamStableDiffusionPipeline(DiffusionPipeline):
         elif prompt is not None and isinstance(prompt, list):
             batch_size = len(prompt)
         else:
-            batch_size = prompt_embeds.shape[0]
-        if prompt_embeds is None:
-            text_inputs = self.tokenizer(
-                prompt,
-                padding="max_length",
-                max_length=self.tokenizer.model_max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-            text_input_ids = text_inputs.input_ids
-            untruncated_ids = self.tokenizer(prompt,
-                                             padding="longest",
-                                             return_tensors="pt").input_ids
-            if untruncated_ids.shape[-1] >= text_input_ids.shape[
-                    -1] and not torch.equal(text_input_ids, untruncated_ids):
-                removed_text = self.tokenizer.batch_decode(
-                    untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
-                logger.warning(
-                    "The following part of your input was truncated because CLIP can only handle sequences up to"
-                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-                )
-            if hasattr(self.text_encoder.config, "use_attention_mask"
-                       ) and self.text_encoder.config.use_attention_mask:
-                attention_mask = text_inputs.attention_mask.to(device)
-            else:
-                attention_mask = None
-            prompt_embeds = self.text_encoder(
-                text_input_ids.to(device),
-                attention_mask=attention_mask,
-            )
-            prompt_embeds = prompt_embeds[0]
-        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype,
-                                         device=device)
         bs_embed, seq_len, _ = prompt_embeds.shape
         # duplicate text embeddings for each generation per prompt, using mps friendly method
         prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
-        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt,
-                                           seq_len, -1)
         # get unconditional embeddings for classifier free guidance
-        if do_classifier_free_guidance and negative_prompt_embeds is None:
             uncond_tokens: List[str]
             if negative_prompt is None:
                 uncond_tokens = [""] * batch_size
             elif type(prompt) is not type(negative_prompt):
-                raise TypeError(
-                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
-                    f" {type(prompt)}.")
             elif isinstance(negative_prompt, str):
                 uncond_tokens = [negative_prompt]
             elif batch_size != len(negative_prompt):
-                raise ValueError(
-                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
-                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
-                    " the batch size of `prompt`.")
             else:
                 uncond_tokens = negative_prompt
@@ -362,8 +310,7 @@ class MVDreamStableDiffusionPipeline(DiffusionPipeline):
                 return_tensors="pt",
             )
-            if hasattr(self.text_encoder.config, "use_attention_mask"
-                       ) and self.text_encoder.config.use_attention_mask:
                 attention_mask = uncond_input.attention_mask.to(device)
             else:
                 attention_mask = None
@@ -374,17 +321,13 @@ class MVDreamStableDiffusionPipeline(DiffusionPipeline):
             )
             negative_prompt_embeds = negative_prompt_embeds[0]
-        if do_classifier_free_guidance:
             # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
             seq_len = negative_prompt_embeds.shape[1]
-            negative_prompt_embeds = negative_prompt_embeds.to(
-                dtype=self.text_encoder.dtype, device=device)
-            negative_prompt_embeds = negative_prompt_embeds.repeat(
-                1, num_images_per_prompt, 1)
-            negative_prompt_embeds = negative_prompt_embeds.view(
-                batch_size * num_images_per_prompt, seq_len, -1)
             # For classifier free guidance, we need to do two forward passes.
             # Here we concatenate the unconditional and text embeddings into a single batch
@@ -407,42 +350,25 @@ class MVDreamStableDiffusionPipeline(DiffusionPipeline):
         # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
         # and should be between [0, 1]
-        accepts_eta = "eta" in set(
-            inspect.signature(self.scheduler.step).parameters.keys())
         extra_step_kwargs = {}
         if accepts_eta:
             extra_step_kwargs["eta"] = eta
         # check if the scheduler accepts generator
-        accepts_generator = "generator" in set(
-            inspect.signature(self.scheduler.step).parameters.keys())
         if accepts_generator:
             extra_step_kwargs["generator"] = generator
         return extra_step_kwargs
-    def prepare_latents(self,
-                        batch_size,
-                        num_channels_latents,
-                        height,
-                        width,
-                        dtype,
-                        device,
-                        generator,
-                        latents=None):
-        shape = (batch_size, num_channels_latents,
-                 height // self.vae_scale_factor,
-                 width // self.vae_scale_factor)
         if isinstance(generator, list) and len(generator) != batch_size:
-            raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-            )
         if latents is None:
-            latents = randn_tensor(shape,
-                                   generator=generator,
-                                   device=device,
-                                   dtype=dtype)
         else:
             latents = latents.to(device)
@@ -451,7 +377,6 @@ class MVDreamStableDiffusionPipeline(DiffusionPipeline):
         return latents
     @torch.no_grad()
-    @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(
         self,
         height: int = 256,
@@ -462,87 +387,11 @@ class MVDreamStableDiffusionPipeline(DiffusionPipeline):
         negative_prompt: str = "bad quality",
         num_images_per_prompt: int = 1,
         eta: float = 0.0,
-        generator: Optional[Union[torch.Generator,
-                                  List[torch.Generator]]] = None,
         output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor],
-                                    None]] = None,
         callback_steps: int = 1,
-        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
-        controlnet_conditioning_scale: float = 1.0,
     ):
-        r"""
-        Function invoked when calling the pipeline for generation.
-        Args:
-            input_imgs (`PIL` or `List[PIL]`, *optional*):
-                The single input image for each 3D object
-            prompt_imgs (`PIL` or `List[PIL]`, *optional*):
-                Same as input_imgs, but will be used later as an image prompt condition, encoded by CLIP feature
-            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
-                The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
-                The width in pixels of the generated image.
-            num_inference_steps (`int`, *optional*, defaults to 50):
-                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-                expense of slower inference.
-            guidance_scale (`float`, *optional*, defaults to 7.5):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality.
-            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
-                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
-            num_images_per_prompt (`int`, *optional*, defaults to 1):
-                The number of images to generate per prompt.
-            eta (`float`, *optional*, defaults to 0.0):
-                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
-                [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
-                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
-                to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
-                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
-                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                tensor will ge generated by sampling using the supplied random `generator`.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generate image. Choose between
-                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
-                plain tuple.
-            callback (`Callable`, *optional*):
-                A function that will be called every `callback_steps` steps during inference. The function will be
-                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
-            callback_steps (`int`, *optional*, defaults to 1):
-                The frequency at which the `callback` function will be called. If not specified, the callback will be
-                called at every step.
-            cross_attention_kwargs (`dict`, *optional*):
-                A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under
-                `self.processor` in
-                [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
-        Examples:
-        Returns:
-            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
-            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
-            When returning a tuple, the first element is a list with the generated images, and the second element is a
-            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
-            (nsfw) content, according to the `safety_checker`.
-        """
-        # 0. Default height and width to unet
         batch_size = 4
         device = torch.device("cuda:0")
@@ -553,7 +402,7 @@ class MVDreamStableDiffusionPipeline(DiffusionPipeline):
         # corresponds to doing no classifier free guidance.
         do_classifier_free_guidance = guidance_scale > 1.0
-        # 4. Prepare timesteps
         self.scheduler.set_timesteps(num_inference_steps, device=device)
         timesteps = self.scheduler.timesteps
@@ -563,26 +412,10 @@ class MVDreamStableDiffusionPipeline(DiffusionPipeline):
             num_images_per_prompt=num_images_per_prompt,
             do_classifier_free_guidance=do_classifier_free_guidance,
             negative_prompt=negative_prompt,
-        )  # type: ignore
         prompt_embeds_neg, prompt_embeds_pos = _prompt_embeds.chunk(2)
-        _, prompt_embeds_pos_2 = self._encode_prompt(
-            prompt="watermellon",
-            device=device,
-            num_images_per_prompt=num_images_per_prompt,
-            do_classifier_free_guidance=do_classifier_free_guidance,
-            negative_prompt=negative_prompt,
-        ).chunk(2) # type: ignore
-        _, prompt_embeds_pos_4 = self._encode_prompt(
-            prompt="long hair",
-            device=device,
-            num_images_per_prompt=num_images_per_prompt,
-            do_classifier_free_guidance=do_classifier_free_guidance,
-            negative_prompt=negative_prompt,
-        ).chunk(2) # type: ignore
-        # 5. Prepare latent variables
         latents: torch.Tensor = self.prepare_latents(
             batch_size * num_images_per_prompt,
             4,
@@ -594,33 +427,23 @@ class MVDreamStableDiffusionPipeline(DiffusionPipeline):
             None,
         )
-        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
-        # 7. Denoising loop
-        num_warmup_steps = len(
-            timesteps) - num_inference_steps * self.scheduler.order
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
                 # expand the latents if we are doing classifier free guidance
                 multiplier = 2 if do_classifier_free_guidance else 1
                 latent_model_input = torch.cat([latents] * multiplier)
-                latent_model_input = self.scheduler.scale_model_input(
-                    latent_model_input, t)
                 # predict the noise residual
-                # print(
-                #     f"shape of latent_model_input: {latent_model_input.shape}"
-                # )  # [2*4, 4, 32, 32]
-                # print(f"shape of prompt_embeds: {prompt_embeds.shape}"
-                #       )  # [2*4, 77, 768]
-                # print(f"shape of camera: {camera.shape}")  # [4, 16]
                 noise_pred = self.unet.forward(
                     x=latent_model_input,
-                    timesteps=torch.tensor([t] * 4 * multiplier,
-                                           device=device),
-                    context=torch.cat([prompt_embeds_neg] * 4 +
-                                      [prompt_embeds_pos, prompt_embeds_pos_2, prompt_embeds_pos, prompt_embeds_pos_4]),
                     num_frames=4,
                     camera=torch.cat([camera] * multiplier),
                 )
@@ -628,46 +451,29 @@ class MVDreamStableDiffusionPipeline(DiffusionPipeline):
                 # perform guidance
                 if do_classifier_free_guidance:
                     noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                    noise_pred = noise_pred_uncond + guidance_scale * (
-                        noise_pred_text - noise_pred_uncond)
                 # compute the previous noisy sample x_t -> x_t-1
                 # latents = self.scheduler.step(noise_pred.to(dtype=torch.float32), t, latents.to(dtype=torch.float32)).prev_sample.to(prompt_embeds.dtype)
-                latents: torch.Tensor = self.scheduler.step(
-                    noise_pred,
-                    t,
-                    latents,
-                    **extra_step_kwargs,
-                    return_dict=False)[0]
                 # call the callback, if provided
-                if i == len(timesteps) - 1 or (
-                    (i + 1) > num_warmup_steps and
-                    (i + 1) % self.scheduler.order == 0):
                     progress_bar.update()
                     if callback is not None and i % callback_steps == 0:
-                        callback(i, t, latents)  # type: ignore
-        # 8. Post-processing
         if output_type == "latent":
             image = latents
         elif output_type == "pil":
-            # 8. Post-processing
             image = self.decode_latents(latents)
-            # 10. Convert to PIL
             image = self.numpy_to_pil(image)
         else:
-            # 8. Post-processing
             image = self.decode_latents(latents)
         # Offload last model to CPU
-        if hasattr(
-                self,
-                "final_offload_hook") and self.final_offload_hook is not None:
             self.final_offload_hook.offload()
-        if not return_dict:
-            return image
-        return StableDiffusionPipelineOutput(images=image,
-                                             nsfw_content_detected=None)

+import torch
+import numpy as np
 import inspect
 from typing import Any, Callable, Dict, List, Optional, Union
 from transformers import CLIPTextModel, CLIPTokenizer
 from diffusers import AutoencoderKL, DiffusionPipeline
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
 from diffusers.utils import (
     logging,
     replace_example_docstring,
 )
 from diffusers.configuration_utils import FrozenDict
 from diffusers.schedulers import DDIMScheduler
+try:
+    from diffusers import randn_tensor # old import # type: ignore
+except ImportError:
+    from diffusers.utils.torch_utils import randn_tensor # new import # type: ignore
+from models import MultiViewUNetWrapperModel
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
 def create_camera_to_world_matrix(elevation, azimuth):
     elevation = np.radians(elevation)
 def convert_opengl_to_blender(camera_matrix):
     """Convert camera matrices from the OpenGL to the Blender axis convention.

     The input is left-multiplied by a fixed 4x4 matrix that keeps the first
     and fourth rows, replaces the second row with the negated third row and
     replaces the third row with the original second row.

     :param camera_matrix: a ``np.ndarray`` or ``torch.Tensor`` whose last two
         dimensions are 4 x 4; leading batch dimensions are allowed.
     :return: the converted matrix, same container type and shape as the input.
     """
     # Construct transformation matrix to convert from OpenGL space to Blender space
     flip_rows = [[1, 0, 0, 0], [0, 0, -1, 0], [0, 1, 0, 0], [0, 0, 0, 1]]
     if isinstance(camera_matrix, np.ndarray):
         # np.matmul broadcasts over leading batch dimensions, whereas np.dot
         # would return a (4, B, 4) array for a (B, 4, 4) input. For a single
         # 4x4 matrix both give the same result.
         return np.matmul(np.array(flip_rows), camera_matrix)
     # `.to(camera_matrix)` moves the constant to the input's dtype and device;
     # torch.matmul then broadcasts it over any leading batch dimensions.
     flip_yz = torch.tensor(flip_rows).to(camera_matrix)
     return torch.matmul(flip_yz, camera_matrix)
+def get_camera(num_frames, elevation=15, azimuth_start=0, azimuth_span=360, blender_coord=True):
     angle_gap = azimuth_span / num_frames
     cameras = []
+    for azimuth in np.arange(azimuth_start, azimuth_span + azimuth_start, angle_gap):
         camera_matrix = create_camera_to_world_matrix(elevation, azimuth)
         if blender_coord:
             camera_matrix = convert_opengl_to_blender(camera_matrix)
     ):
         super().__init__()
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: # type: ignore
+            deprecation_message = (f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                                   f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " # type: ignore
+                                   "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                                   " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                                   " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                                   " file")
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
             new_config = dict(scheduler.config)
             new_config["steps_offset"] = 1
             scheduler._internal_dict = FrozenDict(new_config)
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: # type: ignore
+            deprecation_message = (f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                                   " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                                   " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                                   " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                                   " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file")
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
             new_config = dict(scheduler.config)
             new_config["clip_sample"] = False
             scheduler._internal_dict = FrozenDict(new_config)
             tokenizer=tokenizer,
             text_encoder=text_encoder,
         )
+        self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1)
         self.register_to_config(requires_safety_checker=False)
     def enable_vae_slicing(self):
         if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"):
             from accelerate import cpu_offload
         else:
+            raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher")
         device = torch.device(f"cuda:{gpu_id}")
         if self.device.type != "cpu":
             self.to("cpu", silence_dtype_warnings=True)
+            torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
         for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
             cpu_offload(cpu_offloaded_model, device)
         method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
         `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
         """
+        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
             from accelerate import cpu_offload_with_hook
         else:
+            raise ImportError("`enable_model_offload` requires `accelerate v0.17.0` or higher.")
         device = torch.device(f"cuda:{gpu_id}")
         if self.device.type != "cpu":
             self.to("cpu", silence_dtype_warnings=True)
+            torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
         hook = None
         for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
+            _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
         # We'll offload the last model manually.
         self.final_offload_hook = hook
         if not hasattr(self.unet, "_hf_hook"):
             return self.device
         for module in self.unet.modules():
+            if (hasattr(module, "_hf_hook") and hasattr(module._hf_hook, "execution_device") and module._hf_hook.execution_device is not None):
                 return torch.device(module._hf_hook.execution_device)
         return self.device
         num_images_per_prompt,
         do_classifier_free_guidance: bool,
         negative_prompt=None,
     ):
         r"""
         Encodes the prompt into text encoder hidden states.
         elif prompt is not None and isinstance(prompt, list):
             batch_size = len(prompt)
         else:
+            raise ValueError(f"`prompt` should be either a string or a list of strings, but got {type(prompt)}.")
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1:-1])
+            logger.warning("The following part of your input was truncated because CLIP can only handle sequences up to"
+                            f" {self.tokenizer.model_max_length} tokens: {removed_text}")
+        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+            attention_mask = text_inputs.attention_mask.to(device)
+        else:
+            attention_mask = None
+        prompt_embeds = self.text_encoder(
+            text_input_ids.to(device),
+            attention_mask=attention_mask,
+        )
+        prompt_embeds = prompt_embeds[0]
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
         bs_embed, seq_len, _ = prompt_embeds.shape
         # duplicate text embeddings for each generation per prompt, using mps friendly method
         prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
         # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
             uncond_tokens: List[str]
             if negative_prompt is None:
                 uncond_tokens = [""] * batch_size
             elif type(prompt) is not type(negative_prompt):
+                raise TypeError(f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                                f" {type(prompt)}.")
             elif isinstance(negative_prompt, str):
                 uncond_tokens = [negative_prompt]
             elif batch_size != len(negative_prompt):
+                raise ValueError(f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                                 f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                                 " the batch size of `prompt`.")
             else:
                 uncond_tokens = negative_prompt
                 return_tensors="pt",
             )
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
                 attention_mask = uncond_input.attention_mask.to(device)
             else:
                 attention_mask = None
             )
             negative_prompt_embeds = negative_prompt_embeds[0]
             # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
             seq_len = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
             # For classifier free guidance, we need to do two forward passes.
             # Here we concatenate the unconditional and text embeddings into a single batch
         # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
         # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
         extra_step_kwargs = {}
         if accepts_eta:
             extra_step_kwargs["eta"] = eta
         # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
         if accepts_generator:
             extra_step_kwargs["generator"] = generator
         return extra_step_kwargs
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
         if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                             f" size of {batch_size}. Make sure the batch size matches the length of the generators.")
         if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
         else:
             latents = latents.to(device)
         return latents
     @torch.no_grad()
     def __call__(
         self,
         height: int = 256,
         negative_prompt: str = "bad quality",
         num_images_per_prompt: int = 1,
         eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
         output_type: Optional[str] = "pil",
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
         callback_steps: int = 1,
     ):
         batch_size = 4
         device = torch.device("cuda:0")
         # corresponds to doing no classifier free guidance.
         do_classifier_free_guidance = guidance_scale > 1.0
+        # Prepare timesteps
         self.scheduler.set_timesteps(num_inference_steps, device=device)
         timesteps = self.scheduler.timesteps
             num_images_per_prompt=num_images_per_prompt,
             do_classifier_free_guidance=do_classifier_free_guidance,
             negative_prompt=negative_prompt,
+        ) # type: ignore
         prompt_embeds_neg, prompt_embeds_pos = _prompt_embeds.chunk(2)
+        # Prepare latent variables
         latents: torch.Tensor = self.prepare_latents(
             batch_size * num_images_per_prompt,
             4,
             None,
         )
+        # Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+        # Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
                 # expand the latents if we are doing classifier free guidance
                 multiplier = 2 if do_classifier_free_guidance else 1
                 latent_model_input = torch.cat([latents] * multiplier)
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
                 # predict the noise residual
                 noise_pred = self.unet.forward(
                     x=latent_model_input,
+                    timesteps=torch.tensor([t] * 4 * multiplier, device=device),
+                    context=torch.cat([prompt_embeds_neg] * 4 + [prompt_embeds_pos] * 4),
                     num_frames=4,
                     camera=torch.cat([camera] * multiplier),
                 )
                 # perform guidance
                 if do_classifier_free_guidance:
                     noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
                 # compute the previous noisy sample x_t -> x_t-1
                 # latents = self.scheduler.step(noise_pred.to(dtype=torch.float32), t, latents.to(dtype=torch.float32)).prev_sample.to(prompt_embeds.dtype)
+                latents: torch.Tensor = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
                 # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                     progress_bar.update()
                     if callback is not None and i % callback_steps == 0:
+                        callback(i, t, latents) # type: ignore
+        # Post-processing
         if output_type == "latent":
             image = latents
         elif output_type == "pil":
             image = self.decode_latents(latents)
             image = self.numpy_to_pil(image)
         else:
             image = self.decode_latents(latents)
         # Offload last model to CPU
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
             self.final_offload_hook.offload()
+        return image

vae/diffusion_pytorch_model.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3f1b909aa85cc520a2986d6fc379478e0c46c41f853f9a7c73c0150b2c9c9b8b
 size 334716034

 version https://git-lfs.github.com/spec/v1
+oid sha256:660d2d3c357697e87aded9b7d821dd726977291c049be64489132cd442ce6477
 size 334716034