fffiloni committed
Commit
fcb4edd
1 Parent(s): f3566a8

Upload 33 files

.gitattributes CHANGED
@@ -33,3 +33,16 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ eval/val/0023.png filter=lfs diff=lfs merge=lfs -text
+ eval/val/turtle.png filter=lfs diff=lfs merge=lfs -text
+ examples/example_001.gif filter=lfs diff=lfs merge=lfs -text
+ examples/example_001/frame1.png filter=lfs diff=lfs merge=lfs -text
+ examples/example_001/frame2.png filter=lfs diff=lfs merge=lfs -text
+ examples/example_002.gif filter=lfs diff=lfs merge=lfs -text
+ examples/example_002/frame1.png filter=lfs diff=lfs merge=lfs -text
+ examples/example_002/frame2.png filter=lfs diff=lfs merge=lfs -text
+ examples/example_003.gif filter=lfs diff=lfs merge=lfs -text
+ examples/example_003/frame2.png filter=lfs diff=lfs merge=lfs -text
+ examples/example_004.gif filter=lfs diff=lfs merge=lfs -text
+ examples/example_004/frame1.png filter=lfs diff=lfs merge=lfs -text
+ examples/example_004/frame2.png filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
attn_ctrl/attention_control.py ADDED
@@ -0,0 +1,285 @@
1
+ import abc
2
+ import torch
3
+ from typing import Tuple, List
4
+ from einops import rearrange
5
+
6
+ class AttentionControl(abc.ABC):
7
+
8
+ def step_callback(self, x_t):
9
+ return x_t
10
+
11
+ def between_steps(self):
12
+ return
13
+
14
+ @property
15
+ def num_uncond_att_layers(self):
16
+ return 0
17
+
18
+ @abc.abstractmethod
19
+ def forward(self, attn, is_cross: bool, place_in_unet: str):
20
+ raise NotImplementedError
21
+
22
+ def __call__(self, attn, is_cross: bool, place_in_unet: str):
23
+ if self.cur_att_layer >= self.num_uncond_att_layers:
24
+ self.forward(attn, is_cross, place_in_unet)
25
+ self.cur_att_layer += 1
26
+ if self.cur_att_layer == self.num_att_layers + self.num_uncond_att_layers:
27
+ self.cur_att_layer = 0
28
+ self.cur_step += 1
29
+ self.between_steps()
30
+
31
+ def reset(self):
32
+ self.cur_step = 0
33
+ self.cur_att_layer = 0
34
+
35
+ def __init__(self):
36
+ self.cur_step = 0
37
+ self.num_att_layers = -1
38
+ self.cur_att_layer = 0
39
+
40
+ class AttentionStore(AttentionControl):
41
+
42
+ @staticmethod
43
+ def get_empty_store():
44
+ return {"down_cross": [], "mid_cross": [], "up_cross": [],
45
+ "down_self": [], "mid_self": [], "up_self": []}
46
+
47
+ def forward(self, attn, is_cross: bool, place_in_unet: str):
48
+ key = f"{place_in_unet}_{'cross' if is_cross else 'self'}"
49
+ #if attn.shape[1] <= 32 ** 2: # avoid memory overhead
50
+ self.step_store[key].append(attn)
51
+ return attn
52
+
53
+ def between_steps(self):
54
+ self.attention_store = self.step_store
55
+ if self.save_global_store:
56
+ with torch.no_grad():
57
+ if len(self.global_store) == 0:
58
+ self.global_store = self.step_store
59
+ else:
60
+ for key in self.global_store:
61
+ for i in range(len(self.global_store[key])):
62
+ self.global_store[key][i] += self.step_store[key][i].detach()
63
+ self.step_store = self.get_empty_store()
64
+ self.step_store = self.get_empty_store()
65
+
66
+ def get_average_attention(self):
67
+ average_attention = self.attention_store
68
+ return average_attention
69
+
70
+ def get_average_global_attention(self):
71
+ average_attention = {key: [item / self.cur_step for item in self.global_store[key]] for key in
72
+ self.attention_store}
73
+ return average_attention
74
+
75
+ def reset(self):
76
+ super(AttentionStore, self).reset()
77
+ self.step_store = self.get_empty_store()
78
+ self.attention_store = {}
79
+ self.global_store = {}
80
+
81
+ def __init__(self, save_global_store=False):
82
+ '''
83
+ Initialize an empty AttentionStore
84
+ :param save_global_store: if True, attention maps are also accumulated across steps into a global store
85
+ '''
86
+ super(AttentionStore, self).__init__()
87
+ self.save_global_store = save_global_store
88
+ self.step_store = self.get_empty_store()
89
+ self.attention_store = {}
90
+ self.global_store = {}
91
+ self.curr_step_index = 0
92
+
93
+ class AttentionStoreProcessor:
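+ # Attention processor that computes standard attention and records each attention map into the given AttentionStore.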
94
+
95
+ def __init__(self, attnstore, place_in_unet):
96
+ super().__init__()
97
+ self.attnstore = attnstore
98
+ self.place_in_unet = place_in_unet
99
+
100
+ def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None, temb=None):
101
+ residual = hidden_states
102
+ if attn.spatial_norm is not None:
103
+ hidden_states = attn.spatial_norm(hidden_states, temb)
104
+
105
+ input_ndim = hidden_states.ndim
106
+
107
+ if input_ndim == 4:
108
+ batch_size, channel, height, width = hidden_states.shape
109
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
110
+
111
+ batch_size, sequence_length, _ = (
112
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
113
+ )
114
+
115
+ if attention_mask is not None:
116
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
117
+
118
+ if attn.group_norm is not None:
119
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
120
+
121
+ query = attn.to_q(hidden_states)
122
+
123
+ if encoder_hidden_states is None:
124
+ encoder_hidden_states = hidden_states
125
+ elif attn.norm_cross:
126
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
127
+
128
+ key = attn.to_k(encoder_hidden_states)
129
+ value = attn.to_v(encoder_hidden_states)
130
+
131
+
132
+ query = attn.head_to_batch_dim(query)
133
+ key = attn.head_to_batch_dim(key)
134
+ value = attn.head_to_batch_dim(value)
135
+
136
+ attention_probs = attn.get_attention_scores(query, key, attention_mask)
137
+ self.attnstore(rearrange(attention_probs, '(b h) i j -> b h i j', b=batch_size), False, self.place_in_unet)
138
+
139
+ hidden_states = torch.bmm(attention_probs, value)
140
+ hidden_states = attn.batch_to_head_dim(hidden_states)
141
+
142
+ # linear proj
143
+ hidden_states = attn.to_out[0](hidden_states)
144
+ # dropout
145
+ hidden_states = attn.to_out[1](hidden_states)
146
+
147
+ if input_ndim == 4:
148
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
149
+
150
+ if attn.residual_connection:
151
+ hidden_states = hidden_states + residual
152
+
153
+ hidden_states = hidden_states / attn.rescale_output_factor
154
+
155
+ return hidden_states
156
+
157
+
158
+ class AttentionFlipCtrlProcessor:
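+ # Attention processor that discards its own attention map and substitutes the temporally flipped map stored for the matching layer in a reference AttentionStore.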
159
+
160
+ def __init__(self, attnstore, attnstore_ref, place_in_unet):
161
+ super().__init__()
162
+ self.attnstore = attnstore
163
+ self.attnstore_ref = attnstore_ref
164
+ self.place_in_unet = place_in_unet
165
+
166
+ def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None, temb=None):
167
+ residual = hidden_states
168
+ if attn.spatial_norm is not None:
169
+ hidden_states = attn.spatial_norm(hidden_states, temb)
170
+
171
+ input_ndim = hidden_states.ndim
172
+
173
+ if input_ndim == 4:
174
+ batch_size, channel, height, width = hidden_states.shape
175
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
176
+
177
+ batch_size, sequence_length, _ = (
178
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
179
+ )
180
+
181
+ if attention_mask is not None:
182
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
183
+
184
+ if attn.group_norm is not None:
185
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
186
+
187
+ query = attn.to_q(hidden_states)
188
+
189
+ if encoder_hidden_states is None:
190
+ encoder_hidden_states = hidden_states
191
+ elif attn.norm_cross:
192
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
193
+
194
+ key = attn.to_k(encoder_hidden_states)
195
+ value = attn.to_v(encoder_hidden_states)
196
+
197
+ query = attn.head_to_batch_dim(query)
198
+ key = attn.head_to_batch_dim(key)
199
+ value = attn.head_to_batch_dim(value)
200
+
201
+ attention_probs = attn.get_attention_scores(query, key, attention_mask)
202
+
203
+ if self.place_in_unet == 'mid':
204
+ cur_att_layer = self.attnstore.cur_att_layer - len(self.attnstore_ref.attention_store["down_self"])
205
+ elif self.place_in_unet == 'up':
206
+ cur_att_layer = self.attnstore.cur_att_layer - (len(self.attnstore_ref.attention_store["down_self"]) + len(self.attnstore_ref.attention_store["mid_self"]))
207
+ else:
208
+ cur_att_layer = self.attnstore.cur_att_layer
209
+
210
+ attention_probs_ref = self.attnstore_ref.attention_store[f"{self.place_in_unet}_self"][cur_att_layer]
211
+ attention_probs_ref = rearrange(attention_probs_ref, 'b h i j -> (b h) i j')
212
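+ # replace the computed map with the reference map flipped along both token axes (the 0.0/1.0 weights hard-code a full replacement)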
+ attention_probs = 0.0 * attention_probs + 1.0 * torch.flip(attention_probs_ref, dims=(-2, -1))
213
+
214
+ self.attnstore(rearrange(attention_probs, '(b h) i j -> b h i j', b=batch_size), False, self.place_in_unet)
215
+ hidden_states = torch.bmm(attention_probs, value)
216
+ hidden_states = attn.batch_to_head_dim(hidden_states)
217
+
218
+ # linear proj
219
+ hidden_states = attn.to_out[0](hidden_states)
220
+ # dropout
221
+ hidden_states = attn.to_out[1](hidden_states)
222
+
223
+ if input_ndim == 4:
224
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
225
+
226
+ if attn.residual_connection:
227
+ hidden_states = hidden_states + residual
228
+
229
+ hidden_states = hidden_states / attn.rescale_output_factor
230
+
231
+ return hidden_states
232
+
233
+ def register_temporal_self_attention_control(unet, controller):
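+ # Swap an AttentionStoreProcessor into every first temporal self-attention block (temporal_transformer_blocks.0.attn1) so that `controller` records its maps; all other processors are kept unchanged.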
234
+
235
+ attn_procs = {}
236
+ temporal_self_att_count = 0
237
+ for name in unet.attn_processors.keys():
238
+ if name.endswith("temporal_transformer_blocks.0.attn1.processor"):
239
+ if name.startswith("mid_block"):
240
+ place_in_unet = "mid"
241
+ elif name.startswith("up_blocks"):
242
+ block_id = int(name[len("up_blocks.")])
243
+ place_in_unet = "up"
244
+ elif name.startswith("down_blocks"):
245
+ block_id = int(name[len("down_blocks.")])
246
+ place_in_unet = "down"
247
+ else:
248
+ continue
249
+
250
+ temporal_self_att_count += 1
251
+ attn_procs[name] = AttentionStoreProcessor(
252
+ attnstore=controller, place_in_unet=place_in_unet
253
+ )
254
+ else:
255
+ attn_procs[name] = unet.attn_processors[name]
256
+
257
+ unet.set_attn_processor(attn_procs)
258
+ controller.num_att_layers = temporal_self_att_count
259
+
260
+ def register_temporal_self_attention_flip_control(unet, controller, controller_ref):
261
+
262
+ attn_procs = {}
263
+ temporal_self_att_count = 0
264
+ for name in unet.attn_processors.keys():
265
+ if name.endswith("temporal_transformer_blocks.0.attn1.processor"):
266
+ if name.startswith("mid_block"):
267
+ place_in_unet = "mid"
268
+ elif name.startswith("up_blocks"):
269
+ block_id = int(name[len("up_blocks.")])
270
+ place_in_unet = "up"
271
+ elif name.startswith("down_blocks"):
272
+ block_id = int(name[len("down_blocks.")])
273
+ place_in_unet = "down"
274
+ else:
275
+ continue
276
+
277
+ temporal_self_att_count += 1
278
+ attn_procs[name] = AttentionFlipCtrlProcessor(
279
+ attnstore=controller, attnstore_ref=controller_ref, place_in_unet=place_in_unet
280
+ )
281
+ else:
282
+ attn_procs[name] = unet.attn_processors[name]
283
+
284
+ unet.set_attn_processor(attn_procs)
285
+ controller.num_att_layers = temporal_self_att_count
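
For orientation, here is a minimal sketch of how the helpers above are typically wired together: one `AttentionStore` records the temporal self-attention maps of a reference UNet via `register_temporal_self_attention_control`, and a second controller is registered with `register_temporal_self_attention_flip_control` on the UNet that should replay the flipped maps. The `ref_unet`/`unet` objects and the order of the two denoising passes are assumptions for illustration, not something this module enforces.

```py
from attn_ctrl.attention_control import (
    AttentionStore,
    register_temporal_self_attention_control,
    register_temporal_self_attention_flip_control,
)

# records the reference UNet's temporal self-attention maps (assumed reference pass)
controller_ref = AttentionStore()
register_temporal_self_attention_control(ref_unet, controller_ref)

# the second UNet replays the flipped reference maps through AttentionFlipCtrlProcessor
controller = AttentionStore()
register_temporal_self_attention_flip_control(unet, controller, controller_ref)

# a denoising pass through ref_unet fills controller_ref.attention_store;
# passes through unet then read (and flip) those maps layer by layer
```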
checkpoints/.DS_Store ADDED
Binary file (6.15 kB).
 
checkpoints/svd_reverse_motion_with_attnflip/unet/config.json ADDED
@@ -0,0 +1,38 @@
1
+ {
2
+ "_class_name": "UNetSpatioTemporalConditionModel",
3
+ "_diffusers_version": "0.27.0",
4
+ "_name_or_path": "/gscratch/realitylab/xiaojwan/projects/video_narratives/stabilityai/stable-video-diffusion-img2vid",
5
+ "addition_time_embed_dim": 256,
6
+ "block_out_channels": [
7
+ 320,
8
+ 640,
9
+ 1280,
10
+ 1280
11
+ ],
12
+ "cross_attention_dim": 1024,
13
+ "down_block_types": [
14
+ "CrossAttnDownBlockSpatioTemporal",
15
+ "CrossAttnDownBlockSpatioTemporal",
16
+ "CrossAttnDownBlockSpatioTemporal",
17
+ "DownBlockSpatioTemporal"
18
+ ],
19
+ "in_channels": 8,
20
+ "layers_per_block": 2,
21
+ "num_attention_heads": [
22
+ 5,
23
+ 10,
24
+ 20,
25
+ 20
26
+ ],
27
+ "num_frames": 14,
28
+ "out_channels": 4,
29
+ "projection_class_embeddings_input_dim": 768,
30
+ "sample_size": 96,
31
+ "transformer_layers_per_block": 1,
32
+ "up_block_types": [
33
+ "UpBlockSpatioTemporal",
34
+ "CrossAttnUpBlockSpatioTemporal",
35
+ "CrossAttnUpBlockSpatioTemporal",
36
+ "CrossAttnUpBlockSpatioTemporal"
37
+ ]
38
+ }
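
This config describes the `UNetSpatioTemporalConditionModel` checkpoint under `checkpoints/svd_reverse_motion_with_attnflip`. A minimal loading sketch, assuming the corresponding weight files were uploaded alongside this config in the same `unet` subfolder:

```py
import torch
from diffusers import UNetSpatioTemporalConditionModel

# assumes the weight files sit next to this config.json
unet = UNetSpatioTemporalConditionModel.from_pretrained(
    "checkpoints/svd_reverse_motion_with_attnflip",
    subfolder="unet",
    torch_dtype=torch.float16,
)
```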
custom_diffusers/pipelines/pipeline_frame_interpolation_with_noise_injection.py ADDED
@@ -0,0 +1,576 @@
1
+ # Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_stable_video_diffusion.py
2
+ import inspect
3
+ from dataclasses import dataclass
4
+ from typing import Callable, Dict, List, Optional, Union
5
+ import copy
6
+ import numpy as np
7
+ import PIL.Image
8
+ import torch
9
+ from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
10
+
11
+ from diffusers import AutoencoderKLTemporalDecoder, UNetSpatioTemporalConditionModel
12
+ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
13
+ from diffusers.utils import logging
14
+ from diffusers.utils.torch_utils import is_compiled_module, randn_tensor
15
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
16
+ from diffusers.pipelines.stable_video_diffusion.pipeline_stable_video_diffusion import (
17
+ _append_dims,
18
+ tensor2vid,
19
+ _resize_with_antialiasing,
20
+ StableVideoDiffusionPipelineOutput
21
+ )
22
+ from ..schedulers.scheduling_euler_discrete import EulerDiscreteScheduler
23
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
24
+
25
+ class FrameInterpolationWithNoiseInjectionPipeline(DiffusionPipeline):
26
+ r"""
27
+ Pipeline to generate a video that interpolates between two input key frames using Stable Video Diffusion.
28
+
29
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
30
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
31
+
32
+ Args:
33
+ vae ([`AutoencoderKLTemporalDecoder`]):
34
+ Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
35
+ image_encoder ([`~transformers.CLIPVisionModelWithProjection`]):
36
+ Frozen CLIP image-encoder ([laion/CLIP-ViT-H-14-laion2B-s32B-b79K](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K)).
37
+ unet ([`UNetSpatioTemporalConditionModel`]):
38
+ A `UNetSpatioTemporalConditionModel` to denoise the encoded image latents.
39
+ scheduler ([`EulerDiscreteScheduler`]):
40
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents.
41
+ feature_extractor ([`~transformers.CLIPImageProcessor`]):
42
+ A `CLIPImageProcessor` to extract features from generated images.
43
+ """
44
+
45
+ model_cpu_offload_seq = "image_encoder->unet->vae"
46
+ _callback_tensor_inputs = ["latents"]
47
+
48
+ def __init__(
49
+ self,
50
+ vae: AutoencoderKLTemporalDecoder,
51
+ image_encoder: CLIPVisionModelWithProjection,
52
+ unet: UNetSpatioTemporalConditionModel,
53
+ scheduler: EulerDiscreteScheduler,
54
+ feature_extractor: CLIPImageProcessor,
55
+ ):
56
+ super().__init__()
57
+
58
+ self.register_modules(
59
+ vae=vae,
60
+ image_encoder=image_encoder,
61
+ unet=unet,
62
+ scheduler=scheduler,
63
+ feature_extractor=feature_extractor,
64
+ )
65
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
66
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
67
+ self.ori_unet = copy.deepcopy(unet)
68
+
69
+ def _encode_image(
70
+ self,
71
+ image: PipelineImageInput,
72
+ device: Union[str, torch.device],
73
+ num_videos_per_prompt: int,
74
+ do_classifier_free_guidance: bool,
75
+ ) -> torch.FloatTensor:
76
+ dtype = next(self.image_encoder.parameters()).dtype
77
+
78
+ if not isinstance(image, torch.Tensor):
79
+ image = self.image_processor.pil_to_numpy(image)
80
+ image = self.image_processor.numpy_to_pt(image)
81
+
82
+ # We normalize the image before resizing to match the original implementation.
83
+ # Then we unnormalize it after resizing.
84
+ image = image * 2.0 - 1.0
85
+ image = _resize_with_antialiasing(image, (224, 224))
86
+ image = (image + 1.0) / 2.0
87
+
88
+ # Normalize the image for CLIP input
89
+ image = self.feature_extractor(
90
+ images=image,
91
+ do_normalize=True,
92
+ do_center_crop=False,
93
+ do_resize=False,
94
+ do_rescale=False,
95
+ return_tensors="pt",
96
+ ).pixel_values
97
+
98
+ image = image.to(device=device, dtype=dtype)
99
+ image_embeddings = self.image_encoder(image).image_embeds
100
+ image_embeddings = image_embeddings.unsqueeze(1)
101
+
102
+ # duplicate image embeddings for each generation per prompt, using mps friendly method
103
+ bs_embed, seq_len, _ = image_embeddings.shape
104
+ image_embeddings = image_embeddings.repeat(1, num_videos_per_prompt, 1)
105
+ image_embeddings = image_embeddings.view(bs_embed * num_videos_per_prompt, seq_len, -1)
106
+
107
+ if do_classifier_free_guidance:
108
+ negative_image_embeddings = torch.zeros_like(image_embeddings)
109
+
110
+ # For classifier free guidance, we need to do two forward passes.
111
+ # Here we concatenate the unconditional and text embeddings into a single batch
112
+ # to avoid doing two forward passes
113
+ image_embeddings = torch.cat([negative_image_embeddings, image_embeddings])
114
+
115
+ return image_embeddings
116
+
117
+ def _encode_vae_image(
118
+ self,
119
+ image: torch.Tensor,
120
+ device: Union[str, torch.device],
121
+ num_videos_per_prompt: int,
122
+ do_classifier_free_guidance: bool,
123
+ ):
124
+ image = image.to(device=device)
125
+ image_latents = self.vae.encode(image).latent_dist.mode()
126
+
127
+ if do_classifier_free_guidance:
128
+ negative_image_latents = torch.zeros_like(image_latents)
129
+
130
+ # For classifier free guidance, we need to do two forward passes.
131
+ # Here we concatenate the unconditional and text embeddings into a single batch
132
+ # to avoid doing two forward passes
133
+ image_latents = torch.cat([negative_image_latents, image_latents])
134
+
135
+ # duplicate image_latents for each generation per prompt, using mps friendly method
136
+ image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1)
137
+
138
+ return image_latents
139
+
140
+ def _get_add_time_ids(
141
+ self,
142
+ fps: int,
143
+ motion_bucket_id: int,
144
+ noise_aug_strength: float,
145
+ dtype: torch.dtype,
146
+ batch_size: int,
147
+ num_videos_per_prompt: int,
148
+ do_classifier_free_guidance: bool,
149
+ ):
150
+ add_time_ids = [fps, motion_bucket_id, noise_aug_strength]
151
+
152
+ passed_add_embed_dim = self.unet.config.addition_time_embed_dim * len(add_time_ids)
153
+ expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
154
+
155
+ if expected_add_embed_dim != passed_add_embed_dim:
156
+ raise ValueError(
157
+ f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
158
+ )
159
+
160
+ add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
161
+ add_time_ids = add_time_ids.repeat(batch_size * num_videos_per_prompt, 1)
162
+
163
+ if do_classifier_free_guidance:
164
+ add_time_ids = torch.cat([add_time_ids, add_time_ids])
165
+
166
+ return add_time_ids
167
+
168
+ def decode_latents(self, latents: torch.FloatTensor, num_frames: int, decode_chunk_size: int = 14):
169
+ # [batch, frames, channels, height, width] -> [batch*frames, channels, height, width]
170
+ latents = latents.flatten(0, 1)
171
+
172
+ latents = 1 / self.vae.config.scaling_factor * latents
173
+
174
+ forward_vae_fn = self.vae._orig_mod.forward if is_compiled_module(self.vae) else self.vae.forward
175
+ accepts_num_frames = "num_frames" in set(inspect.signature(forward_vae_fn).parameters.keys())
176
+
177
+ # decode decode_chunk_size frames at a time to avoid OOM
178
+ frames = []
179
+ for i in range(0, latents.shape[0], decode_chunk_size):
180
+ num_frames_in = latents[i : i + decode_chunk_size].shape[0]
181
+ decode_kwargs = {}
182
+ if accepts_num_frames:
183
+ # we only pass num_frames_in if it's expected
184
+ decode_kwargs["num_frames"] = num_frames_in
185
+
186
+ frame = self.vae.decode(latents[i : i + decode_chunk_size], **decode_kwargs).sample
187
+ frames.append(frame)
188
+ frames = torch.cat(frames, dim=0)
189
+
190
+ # [batch*frames, channels, height, width] -> [batch, channels, frames, height, width]
191
+ frames = frames.reshape(-1, num_frames, *frames.shape[1:]).permute(0, 2, 1, 3, 4)
192
+
193
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
194
+ frames = frames.float()
195
+ return frames
196
+
197
+ def check_inputs(self, image, height, width):
198
+ if (
199
+ not isinstance(image, torch.Tensor)
200
+ and not isinstance(image, PIL.Image.Image)
201
+ and not isinstance(image, list)
202
+ ):
203
+ raise ValueError(
204
+ "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
205
+ f" {type(image)}"
206
+ )
207
+
208
+ if height % 8 != 0 or width % 8 != 0:
209
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
210
+
211
+ def prepare_latents(
212
+ self,
213
+ batch_size: int,
214
+ num_frames: int,
215
+ num_channels_latents: int,
216
+ height: int,
217
+ width: int,
218
+ dtype: torch.dtype,
219
+ device: Union[str, torch.device],
220
+ generator: torch.Generator,
221
+ latents: Optional[torch.FloatTensor] = None,
222
+ ):
223
+ shape = (
224
+ batch_size,
225
+ num_frames,
226
+ num_channels_latents // 2,
227
+ height // self.vae_scale_factor,
228
+ width // self.vae_scale_factor,
229
+ )
230
+ if isinstance(generator, list) and len(generator) != batch_size:
231
+ raise ValueError(
232
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
233
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
234
+ )
235
+
236
+ if latents is None:
237
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
238
+ else:
239
+ latents = latents.to(device)
240
+
241
+ # scale the initial noise by the standard deviation required by the scheduler
242
+ latents = latents * self.scheduler.init_noise_sigma
243
+ return latents
244
+
245
+ @property
246
+ def guidance_scale(self):
247
+ return self._guidance_scale
248
+
249
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
250
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
251
+ # corresponds to doing no classifier free guidance.
252
+ @property
253
+ def do_classifier_free_guidance(self):
254
+ if isinstance(self.guidance_scale, (int, float)):
255
+ return self.guidance_scale > 1
256
+ return self.guidance_scale.max() > 1
257
+
258
+ @property
259
+ def num_timesteps(self):
260
+ return self._num_timesteps
261
+
262
+
263
+ @torch.no_grad()
264
+ def multidiffusion_step(self, latents, t,
265
+ image1_embeddings,
266
+ image2_embeddings,
267
+ image1_latents,
268
+ image2_latents,
269
+ added_time_ids,
270
+ avg_weight
271
+ ):
272
+ # expand the latents if we are doing classifier free guidance
273
+ latents1 = latents
274
+ latents2 = torch.flip(latents, (1,))
275
+ latent_model_input1 = torch.cat([latents1] * 2) if self.do_classifier_free_guidance else latents1
276
+ latent_model_input1 = self.scheduler.scale_model_input(latent_model_input1, t)
277
+
278
+ latent_model_input2 = torch.cat([latents2] * 2) if self.do_classifier_free_guidance else latents2
279
+ latent_model_input2 = self.scheduler.scale_model_input(latent_model_input2, t)
280
+
281
+
282
+ # Concatenate image_latents over the channels dimension
283
+ latent_model_input1 = torch.cat([latent_model_input1, image1_latents], dim=2)
284
+ latent_model_input2 = torch.cat([latent_model_input2, image2_latents], dim=2)
285
+
286
+ # predict the noise residual
287
+ noise_pred1 = self.ori_unet(
288
+ latent_model_input1,
289
+ t,
290
+ encoder_hidden_states=image1_embeddings,
291
+ added_time_ids=added_time_ids,
292
+ return_dict=False,
293
+ )[0]
294
+ noise_pred2 = self.unet(
295
+ latent_model_input2,
296
+ t,
297
+ encoder_hidden_states=image2_embeddings,
298
+ added_time_ids=added_time_ids,
299
+ return_dict=False,
300
+ )[0]
301
+ # perform guidance
302
+ if self.do_classifier_free_guidance:
303
+ noise_pred_uncond1, noise_pred_cond1 = noise_pred1.chunk(2)
304
+ noise_pred1 = noise_pred_uncond1 + self.guidance_scale * (noise_pred_cond1 - noise_pred_uncond1)
305
+
306
+ noise_pred_uncond2, noise_pred_cond2 = noise_pred2.chunk(2)
307
+ noise_pred2 = noise_pred_uncond2 + self.guidance_scale * (noise_pred_cond2 - noise_pred_uncond2)
308
+
309
+ noise_pred2 = torch.flip(noise_pred2, (1,))
310
+ noise_pred = avg_weight * noise_pred1 + (1 - avg_weight) * noise_pred2
311
+ return noise_pred
312
+
313
+
314
+ @torch.no_grad()
315
+ def __call__(
316
+ self,
317
+ image1: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor],
318
+ image2: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor],
319
+ height: int = 576,
320
+ width: int = 1024,
321
+ num_frames: Optional[int] = None,
322
+ num_inference_steps: int = 25,
323
+ min_guidance_scale: float = 1.0,
324
+ max_guidance_scale: float = 3.0,
325
+ fps: int = 7,
326
+ motion_bucket_id: int = 127,
327
+ noise_aug_strength: float = 0.02,
328
+ decode_chunk_size: Optional[int] = None,
329
+ num_videos_per_prompt: Optional[int] = 1,
330
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
331
+ latents: Optional[torch.FloatTensor] = None,
332
+ output_type: Optional[str] = "pil",
333
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
334
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
335
+ weighted_average: bool = False,
336
+ noise_injection_steps: int = 0,
337
+ noise_injection_ratio: float=0.0,
338
+ return_dict: bool = True,
339
+ ):
340
+ r"""
341
+ The call function to the pipeline for generation.
342
+
343
+ Args:
344
+ image1, image2 (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
345
+ The first and last frame images between which the video is interpolated. If you provide tensors, they need to be compatible with
346
+ [`CLIPImageProcessor`](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json).
347
+ height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
348
+ The height in pixels of the generated image.
349
+ width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
350
+ The width in pixels of the generated image.
351
+ num_frames (`int`, *optional*):
352
+ The number of video frames to generate. Defaults to 14 for `stable-video-diffusion-img2vid` and to 25 for `stable-video-diffusion-img2vid-xt`
353
+ num_inference_steps (`int`, *optional*, defaults to 25):
354
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
355
+ expense of slower inference. This parameter is modulated by `strength`.
356
+ min_guidance_scale (`float`, *optional*, defaults to 1.0):
357
+ The minimum guidance scale. Used for the classifier free guidance with first frame.
358
+ max_guidance_scale (`float`, *optional*, defaults to 3.0):
359
+ The maximum guidance scale. Used for the classifier free guidance with last frame.
360
+ fps (`int`, *optional*, defaults to 7):
361
+ Frames per second. The rate at which the generated images shall be exported to a video after generation.
362
+ Note that Stable Video Diffusion's UNet was micro-conditioned on fps-1 during training.
363
+ motion_bucket_id (`int`, *optional*, defaults to 127):
364
+ The motion bucket ID. Used as conditioning for the generation. The higher the number the more motion will be in the video.
365
+ noise_aug_strength (`float`, *optional*, defaults to 0.02):
366
+ The amount of noise added to the init image, the higher it is the less the video will look like the init image. Increase it for more motion.
367
+ decode_chunk_size (`int`, *optional*):
368
+ The number of frames to decode at a time. The higher the chunk size, the higher the temporal consistency
369
+ between frames, but also the higher the memory consumption. By default, the decoder will decode all frames at once
370
+ for maximal quality. Reduce `decode_chunk_size` to reduce memory usage.
371
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
372
+ The number of images to generate per prompt.
373
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
374
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
375
+ generation deterministic.
376
+ latents (`torch.FloatTensor`, *optional*):
377
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
378
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
379
+ tensor is generated by sampling using the supplied random `generator`.
380
+ output_type (`str`, *optional*, defaults to `"pil"`):
381
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
382
+ callback_on_step_end (`Callable`, *optional*):
383
+ A function that calls at the end of each denoising steps during the inference. The function is called
384
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
385
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
386
+ `callback_on_step_end_tensor_inputs`.
387
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
388
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
389
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
390
+ `._callback_tensor_inputs` attribute of your pipeline class.
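+ weighted_average (`bool`, *optional*, defaults to `False`):
+ Whether to fuse the two directional noise predictions with a per-frame linear weight (from 1 at the first frame to 0 at the last) instead of a constant 0.5 average.
+ noise_injection_steps (`int`, *optional*, defaults to 0):
+ Number of additional re-noise/denoise rounds performed at each timestep while noise injection is active.
+ noise_injection_ratio (`float`, *optional*, defaults to 0.0):
+ Fraction of the inference steps, counted from the start, during which noise injection is applied.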
391
+ return_dict (`bool`, *optional*, defaults to `True`):
392
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
393
+ plain tuple.
394
+
395
+ Returns:
396
+ [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] or `tuple`:
397
+ If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is returned,
398
+ otherwise a `tuple` is returned where the first element is a list of list with the generated frames.
399
+
400
+ Examples:
401
+
402
+ ```py
403
+ import torch
+ from diffusers import StableVideoDiffusionPipeline
404
+ from diffusers.utils import load_image, export_to_video
405
+
406
+ pipe = StableVideoDiffusionPipeline.from_pretrained("stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16")
407
+ pipe.to("cuda")
408
+
409
+ image = load_image("https://lh3.googleusercontent.com/y-iFOHfLTwkuQSUegpwDdgKmOjRSTvPxat63dQLB25xkTs4lhIbRUFeNBWZzYf370g=s1200")
410
+ image = image.resize((1024, 576))
411
+
412
+ frames = pipe(image, num_frames=25, decode_chunk_size=8).frames[0]
413
+ export_to_video(frames, "generated.mp4", fps=7)
414
+ ```
415
+ """
416
+ # 0. Default height and width to unet
417
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
418
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
419
+
420
+ num_frames = num_frames if num_frames is not None else self.unet.config.num_frames
421
+ decode_chunk_size = decode_chunk_size if decode_chunk_size is not None else num_frames
422
+
423
+ # 1. Check inputs. Raise error if not correct
424
+ self.check_inputs(image1, height, width)
425
+ self.check_inputs(image2, height, width)
426
+
427
+ # 2. Define call parameters
428
+ if isinstance(image1, PIL.Image.Image):
429
+ batch_size = 1
430
+ elif isinstance(image1, list):
431
+ batch_size = len(image1)
432
+ else:
433
+ batch_size = image1.shape[0]
434
+ device = self._execution_device
435
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
436
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
437
+ # corresponds to doing no classifier free guidance.
438
+ self._guidance_scale = max_guidance_scale
439
+
440
+ # 3. Encode input image
441
+ image1_embeddings = self._encode_image(image1, device, num_videos_per_prompt, self.do_classifier_free_guidance)
442
+ image2_embeddings = self._encode_image(image2, device, num_videos_per_prompt, self.do_classifier_free_guidance)
443
+
444
+ # NOTE: Stable Video Diffusion was conditioned on fps - 1, which
445
+ # is why it is reduced here.
446
+ # See: https://github.com/Stability-AI/generative-models/blob/ed0997173f98eaf8f4edf7ba5fe8f15c6b877fd3/scripts/sampling/simple_video_sample.py#L188
447
+ fps = fps - 1
448
+
449
+ # 4. Encode input image using VAE
450
+ image1 = self.image_processor.preprocess(image1, height=height, width=width).to(device)
451
+ image2 = self.image_processor.preprocess(image2, height=height, width=width).to(device)
452
+ noise = randn_tensor(image1.shape, generator=generator, device=image1.device, dtype=image1.dtype)
453
+ image1 = image1 + noise_aug_strength * noise
454
+ image2 = image2 + noise_aug_strength * noise
455
+
456
+ needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
457
+ if needs_upcasting:
458
+ self.vae.to(dtype=torch.float32)
459
+
460
+
461
+ # Repeat the image latents for each frame so we can concatenate them with the noise
462
+ # image_latents [batch, channels, height, width] ->[batch, num_frames, channels, height, width]
463
+ image1_latent = self._encode_vae_image(image1, device, num_videos_per_prompt, self.do_classifier_free_guidance)
464
+ image1_latent = image1_latent.to(image1_embeddings.dtype)
465
+ image1_latents = image1_latent.unsqueeze(1).repeat(1, num_frames, 1, 1, 1)
466
+
467
+ image2_latent = self._encode_vae_image(image2, device, num_videos_per_prompt, self.do_classifier_free_guidance)
468
+ image2_latent = image2_latent.to(image2_embeddings.dtype)
469
+ image2_latents = image2_latent.unsqueeze(1).repeat(1, num_frames, 1, 1, 1)
470
+
471
+ # cast back to fp16 if needed
472
+ if needs_upcasting:
473
+ self.vae.to(dtype=torch.float16)
474
+
475
+ # 5. Get Added Time IDs
476
+ added_time_ids = self._get_add_time_ids(
477
+ fps,
478
+ motion_bucket_id,
479
+ noise_aug_strength,
480
+ image1_embeddings.dtype,
481
+ batch_size,
482
+ num_videos_per_prompt,
483
+ self.do_classifier_free_guidance,
484
+ )
485
+ added_time_ids = added_time_ids.to(device)
486
+
487
+ # 6. Prepare timesteps
488
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
489
+ timesteps = self.scheduler.timesteps
490
+
491
+ # 7. Prepare latent variables
492
+ num_channels_latents = self.unet.config.in_channels
493
+ latents = self.prepare_latents(
494
+ batch_size * num_videos_per_prompt,
495
+ num_frames,
496
+ num_channels_latents,
497
+ height,
498
+ width,
499
+ image1_embeddings.dtype,
500
+ device,
501
+ generator,
502
+ latents,
503
+ )
504
+
505
+ # 8. Prepare guidance scale
506
+ guidance_scale = torch.linspace(min_guidance_scale, max_guidance_scale, num_frames).unsqueeze(0)
507
+ guidance_scale = guidance_scale.to(device, latents.dtype)
508
+ guidance_scale = guidance_scale.repeat(batch_size * num_videos_per_prompt, 1)
509
+ guidance_scale = _append_dims(guidance_scale, latents.ndim)
510
+
511
+ if weighted_average:
512
+ self._guidance_scale = guidance_scale
513
+ w = torch.linspace(1, 0, num_frames).unsqueeze(0).to(device, latents.dtype)
514
+ w = w.repeat(batch_size*num_videos_per_prompt, 1)
515
+ w = _append_dims(w, latents.ndim)
516
+ else:
517
+ self._guidance_scale = (guidance_scale+torch.flip(guidance_scale, (1,)))*0.5
518
+ w = 0.5
519
+
520
+ # 9. Denoising loop
521
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
522
+ self._num_timesteps = len(timesteps)
523
+ self.ori_unet = self.ori_unet.to(device)
524
+
525
+ noise_injection_step_threshold = int(num_inference_steps*noise_injection_ratio)
526
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
527
+ for i, t in enumerate(timesteps):
528
+
529
+ noise_pred = self.multidiffusion_step(latents, t,
530
+ image1_embeddings, image2_embeddings,
531
+ image1_latents, image2_latents, added_time_ids, w
532
+ )
533
+ # compute the previous noisy sample x_t -> x_t-1
534
+ latents = self.scheduler.step(noise_pred, t, latents).prev_sample
535
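+ # noise injection: during the first noise_injection_ratio fraction of steps, repeatedly re-add Gaussian noise with std sqrt(sigma_t**2 - sigma_tm1**2) and run an extra fused denoising step at the same timestep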
+ if i < noise_injection_step_threshold and noise_injection_steps > 0:
536
+ sigma_t = self.scheduler.sigmas[self.scheduler.step_index]
537
+ sigma_tm1 = self.scheduler.sigmas[self.scheduler.step_index+1]
538
+ sigma = torch.sqrt(sigma_t**2-sigma_tm1**2)
539
+ for j in range(noise_injection_steps):
540
+ noise = randn_tensor(latents.shape, device=latents.device, dtype=latents.dtype)
541
+ noise = noise * sigma
542
+ latents = latents + noise
543
+ noise_pred = self.multidiffusion_step(latents, t,
544
+ image1_embeddings, image2_embeddings,
545
+ image1_latents, image2_latents, added_time_ids, w
546
+ )
547
+ # compute the previous noisy sample x_t -> x_t-1
548
+ latents = self.scheduler.step(noise_pred, t, latents).prev_sample
549
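+ # manually advance the bundled custom scheduler's internal step index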
+ self.scheduler._step_index += 1
550
+
551
+ if callback_on_step_end is not None:
552
+ callback_kwargs = {}
553
+ for k in callback_on_step_end_tensor_inputs:
554
+ callback_kwargs[k] = locals()[k]
555
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
556
+
557
+ latents = callback_outputs.pop("latents", latents)
558
+
559
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
560
+ progress_bar.update()
561
+
562
+ if not output_type == "latent":
563
+ # cast back to fp16 if needed
564
+ if needs_upcasting:
565
+ self.vae.to(dtype=torch.float16)
566
+ frames = self.decode_latents(latents, num_frames, decode_chunk_size)
567
+ frames = tensor2vid(frames, self.image_processor, output_type=output_type)
568
+ else:
569
+ frames = latents
570
+
571
+ self.maybe_free_model_hooks()
572
+
573
+ if not return_dict:
574
+ return frames
575
+
576
+ return StableVideoDiffusionPipelineOutput(frames=frames)
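
A minimal call sketch for this pipeline. How the components are assembled into `pipe` (and which weights end up in `unet` versus the deep-copied `ori_unet`) is not shown in this file, so the construction of `pipe` is an assumption; the keyword arguments mirror the `__call__` signature above, and the example frames are the ones uploaded under `examples/`.

```py
import torch
from diffusers.utils import load_image, export_to_video

# `pipe` is assumed to be an already-assembled FrameInterpolationWithNoiseInjectionPipeline
frame1 = load_image("examples/example_001/frame1.png").resize((1024, 576))
frame2 = load_image("examples/example_001/frame2.png").resize((1024, 576))

frames = pipe(
    image1=frame1,
    image2=frame2,
    height=576,
    width=1024,
    num_inference_steps=25,
    weighted_average=True,       # per-frame blend of the two directional noise predictions
    noise_injection_steps=2,     # extra re-noise/denoise rounds per early step
    noise_injection_ratio=0.5,   # apply noise injection during the first half of the steps
    generator=torch.manual_seed(0),
).frames[0]

export_to_video(frames, "interpolated.mp4", fps=7)
```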
custom_diffusers/pipelines/pipeline_stable_video_diffusion_with_ref_attnmap.py ADDED
@@ -0,0 +1,514 @@
1
+ # Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_stable_video_diffusion.py
2
+ import inspect
3
+ from dataclasses import dataclass
4
+ from typing import Callable, Dict, List, Optional, Union
5
+ import copy
6
+ import numpy as np
7
+ import PIL.Image
8
+ import torch
9
+ from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
10
+
11
+ from diffusers import AutoencoderKLTemporalDecoder, UNetSpatioTemporalConditionModel
12
+ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
13
+ from diffusers.utils import logging
14
+ from diffusers.utils.torch_utils import is_compiled_module, randn_tensor
15
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
16
+ from diffusers.pipelines.stable_video_diffusion.pipeline_stable_video_diffusion import (
17
+ _append_dims,
18
+ tensor2vid,
19
+ _resize_with_antialiasing,
20
+ StableVideoDiffusionPipelineOutput
21
+ )
22
+
23
+ from ..schedulers.scheduling_euler_discrete import EulerDiscreteScheduler
24
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
25
+
26
+ class StableVideoDiffusionWithRefAttnMapPipeline(DiffusionPipeline):
27
+
28
+ model_cpu_offload_seq = "image_encoder->unet->vae"
29
+ _callback_tensor_inputs = ["latents"]
30
+
31
+ def __init__(
32
+ self,
33
+ vae: AutoencoderKLTemporalDecoder,
34
+ image_encoder: CLIPVisionModelWithProjection,
35
+ unet: UNetSpatioTemporalConditionModel,
36
+ scheduler: EulerDiscreteScheduler,
37
+ feature_extractor: CLIPImageProcessor,
38
+ ):
39
+ super().__init__()
40
+
41
+ self.register_modules(
42
+ vae=vae,
43
+ image_encoder=image_encoder,
44
+ unet=unet,
45
+ scheduler=scheduler,
46
+ feature_extractor=feature_extractor,
47
+ )
48
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
49
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
50
+
51
+ def _encode_image(
52
+ self,
53
+ image: PipelineImageInput,
54
+ device: Union[str, torch.device],
55
+ num_videos_per_prompt: int,
56
+ do_classifier_free_guidance: bool,
57
+ ) -> torch.FloatTensor:
58
+ dtype = next(self.image_encoder.parameters()).dtype
59
+
60
+ if not isinstance(image, torch.Tensor):
61
+ image = self.image_processor.pil_to_numpy(image)
62
+ image = self.image_processor.numpy_to_pt(image)
63
+
64
+ # We normalize the image before resizing to match the original implementation.
65
+ # Then we unnormalize it after resizing.
66
+ image = image * 2.0 - 1.0
67
+ image = _resize_with_antialiasing(image, (224, 224))
68
+ image = (image + 1.0) / 2.0
69
+
70
+ # Normalize the image for CLIP input
71
+ image = self.feature_extractor(
72
+ images=image,
73
+ do_normalize=True,
74
+ do_center_crop=False,
75
+ do_resize=False,
76
+ do_rescale=False,
77
+ return_tensors="pt",
78
+ ).pixel_values
79
+
80
+ image = image.to(device=device, dtype=dtype)
81
+ image_embeddings = self.image_encoder(image).image_embeds
82
+ image_embeddings = image_embeddings.unsqueeze(1)
83
+
84
+ # duplicate image embeddings for each generation per prompt, using mps friendly method
85
+ bs_embed, seq_len, _ = image_embeddings.shape
86
+ image_embeddings = image_embeddings.repeat(1, num_videos_per_prompt, 1)
87
+ image_embeddings = image_embeddings.view(bs_embed * num_videos_per_prompt, seq_len, -1)
88
+
89
+ if do_classifier_free_guidance:
90
+ negative_image_embeddings = torch.zeros_like(image_embeddings)
91
+
92
+ # For classifier free guidance, we need to do two forward passes.
93
+ # Here we concatenate the unconditional and conditional image embeddings into a single batch
94
+ # to avoid doing two forward passes
95
+ image_embeddings = torch.cat([negative_image_embeddings, image_embeddings])
96
+
97
+ return image_embeddings
98
+
99
+ def _encode_vae_image(
100
+ self,
101
+ image: torch.Tensor,
102
+ device: Union[str, torch.device],
103
+ num_videos_per_prompt: int,
104
+ do_classifier_free_guidance: bool,
105
+ ):
106
+ image = image.to(device=device)
107
+ image_latents = self.vae.encode(image).latent_dist.mode()
108
+
109
+ if do_classifier_free_guidance:
110
+ negative_image_latents = torch.zeros_like(image_latents)
111
+
112
+ # For classifier free guidance, we need to do two forward passes.
113
+ # Here we concatenate the unconditional and conditional image latents into a single batch
114
+ # to avoid doing two forward passes
115
+ image_latents = torch.cat([negative_image_latents, image_latents])
116
+
117
+ # duplicate image_latents for each generation per prompt, using mps friendly method
118
+ image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1)
119
+
120
+ return image_latents
121
+
122
+ def _get_add_time_ids(
123
+ self,
124
+ fps: int,
125
+ motion_bucket_id: int,
126
+ noise_aug_strength: float,
127
+ dtype: torch.dtype,
128
+ batch_size: int,
129
+ num_videos_per_prompt: int,
130
+ do_classifier_free_guidance: bool,
131
+ ):
132
+ add_time_ids = [fps, motion_bucket_id, noise_aug_strength]
133
+
134
+ passed_add_embed_dim = self.unet.config.addition_time_embed_dim * len(add_time_ids)
135
+ expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
136
+
137
+ if expected_add_embed_dim != passed_add_embed_dim:
138
+ raise ValueError(
139
+ f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
140
+ )
141
+
142
+ add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
143
+ add_time_ids = add_time_ids.repeat(batch_size * num_videos_per_prompt, 1)
144
+
145
+ if do_classifier_free_guidance:
146
+ add_time_ids = torch.cat([add_time_ids, add_time_ids])
147
+
148
+ return add_time_ids
149
+
150
+ def decode_latents(self, latents: torch.FloatTensor, num_frames: int, decode_chunk_size: int = 14):
151
+ # [batch, frames, channels, height, width] -> [batch*frames, channels, height, width]
152
+ latents = latents.flatten(0, 1)
153
+
154
+ latents = 1 / self.vae.config.scaling_factor * latents
155
+
156
+ forward_vae_fn = self.vae._orig_mod.forward if is_compiled_module(self.vae) else self.vae.forward
157
+ accepts_num_frames = "num_frames" in set(inspect.signature(forward_vae_fn).parameters.keys())
158
+
159
+ # decode decode_chunk_size frames at a time to avoid OOM
160
+ frames = []
161
+ for i in range(0, latents.shape[0], decode_chunk_size):
162
+ num_frames_in = latents[i : i + decode_chunk_size].shape[0]
163
+ decode_kwargs = {}
164
+ if accepts_num_frames:
165
+ # we only pass num_frames_in if it's expected
166
+ decode_kwargs["num_frames"] = num_frames_in
167
+
168
+ frame = self.vae.decode(latents[i : i + decode_chunk_size], **decode_kwargs).sample
169
+ frames.append(frame)
170
+ frames = torch.cat(frames, dim=0)
171
+
172
+ # [batch*frames, channels, height, width] -> [batch, channels, frames, height, width]
173
+ frames = frames.reshape(-1, num_frames, *frames.shape[1:]).permute(0, 2, 1, 3, 4)
174
+
175
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
176
+ frames = frames.float()
177
+ return frames
178
+
179
+ def check_inputs(self, image, height, width):
180
+ if (
181
+ not isinstance(image, torch.Tensor)
182
+ and not isinstance(image, PIL.Image.Image)
183
+ and not isinstance(image, list)
184
+ ):
185
+ raise ValueError(
186
+ "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
187
+ f" {type(image)}"
188
+ )
189
+
190
+ if height % 8 != 0 or width % 8 != 0:
191
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
192
+
193
+ def prepare_latents(
194
+ self,
195
+ batch_size: int,
196
+ num_frames: int,
197
+ num_channels_latents: int,
198
+ height: int,
199
+ width: int,
200
+ dtype: torch.dtype,
201
+ device: Union[str, torch.device],
202
+ generator: torch.Generator,
203
+ latents: Optional[torch.FloatTensor] = None,
204
+ ):
205
+ shape = (
206
+ batch_size,
207
+ num_frames,
208
+ num_channels_latents // 2,
209
+ height // self.vae_scale_factor,
210
+ width // self.vae_scale_factor,
211
+ )
212
+ if isinstance(generator, list) and len(generator) != batch_size:
213
+ raise ValueError(
214
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
215
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
216
+ )
217
+
218
+ if latents is None:
219
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
220
+ else:
221
+ latents = latents.to(device)
222
+
223
+ # scale the initial noise by the standard deviation required by the scheduler
224
+ latents = latents * self.scheduler.init_noise_sigma
225
+ return latents
226
+
227
+ @property
228
+ def guidance_scale(self):
229
+ return self._guidance_scale
230
+
231
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
232
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
233
+ # corresponds to doing no classifier free guidance.
234
+ @property
235
+ def do_classifier_free_guidance(self):
236
+ if isinstance(self.guidance_scale, (int, float)):
237
+ return self.guidance_scale > 1
238
+ return self.guidance_scale.max() > 1
239
+
240
+ @property
241
+ def num_timesteps(self):
242
+ return self._num_timesteps
243
+
244
+
245
+ @torch.no_grad()
246
+ def __call__(
247
+ self,
248
+ ref_unet: UNetSpatioTemporalConditionModel,
249
+ image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor],
250
+ ref_image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor],
251
+ height: int = 576,
252
+ width: int = 1024,
253
+ num_frames: Optional[int] = None,
254
+ num_inference_steps: int = 25,
255
+ min_guidance_scale: float = 1.0,
256
+ max_guidance_scale: float = 3.0,
257
+ fps: int = 7,
258
+ motion_bucket_id: int = 127,
259
+ noise_aug_strength: float = 0.02,
260
+ decode_chunk_size: Optional[int] = None,
261
+ num_videos_per_prompt: Optional[int] = 1,
262
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
263
+ latents: Optional[torch.FloatTensor] = None,
264
+ output_type: Optional[str] = "pil",
265
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
266
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
267
+ return_dict: bool = True,
268
+ ):
269
+ r"""
270
+ The call function to the pipeline for generation.
271
+
272
+ Args:
273
+ image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
274
+ Image or images to guide image generation. If you provide a tensor, it needs to be compatible with
275
+ [`CLIPImageProcessor`](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json).
276
+ height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
277
+ The height in pixels of the generated image.
278
+ width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
279
+ The width in pixels of the generated image.
280
+ num_frames (`int`, *optional*):
281
+ The number of video frames to generate. Defaults to 14 for `stable-video-diffusion-img2vid` and to 25 for `stable-video-diffusion-img2vid-xt`
282
+ num_inference_steps (`int`, *optional*, defaults to 25):
283
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
284
+ expense of slower inference.
285
+ min_guidance_scale (`float`, *optional*, defaults to 1.0):
286
+ The minimum guidance scale. Used for the classifier free guidance with first frame.
287
+ max_guidance_scale (`float`, *optional*, defaults to 3.0):
288
+ The maximum guidance scale. Used for the classifier free guidance with last frame.
289
+ fps (`int`, *optional*, defaults to 7):
290
+ Frames per second. The rate at which the generated images shall be exported to a video after generation.
291
+ Note that Stable Video Diffusion's UNet was micro-conditioned on fps-1 during training.
292
+ motion_bucket_id (`int`, *optional*, defaults to 127):
293
+ The motion bucket ID. Used as conditioning for the generation. The higher the number the more motion will be in the video.
294
+ noise_aug_strength (`float`, *optional*, defaults to 0.02):
295
+ The amount of noise added to the init image, the higher it is the less the video will look like the init image. Increase it for more motion.
296
+ decode_chunk_size (`int`, *optional*):
297
+ The number of frames to decode at a time. The higher the chunk size, the higher the temporal consistency
298
+ between frames, but also the higher the memory consumption. By default, the decoder will decode all frames at once
299
+ for maximal quality. Reduce `decode_chunk_size` to reduce memory usage.
300
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
301
+ The number of images to generate per prompt.
302
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
303
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
304
+ generation deterministic.
305
+ latents (`torch.FloatTensor`, *optional*):
306
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
307
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
308
+ tensor is generated by sampling using the supplied random `generator`.
309
+ output_type (`str`, *optional*, defaults to `"pil"`):
310
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
311
+ callback_on_step_end (`Callable`, *optional*):
312
+ A function that calls at the end of each denoising steps during the inference. The function is called
313
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
314
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
315
+ `callback_on_step_end_tensor_inputs`.
316
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
317
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
318
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
319
+ `._callback_tensor_inputs` attribute of your pipeline class.
320
+ return_dict (`bool`, *optional*, defaults to `True`):
321
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
322
+ plain tuple.
323
+
324
+ Returns:
325
+ [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] or `tuple`:
326
+ If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is returned,
327
+ otherwise a `tuple` is returned where the first element is a list of list with the generated frames.
328
+
329
+ Examples:
330
+
331
+ ```py
332
+ import torch
+ from diffusers import StableVideoDiffusionPipeline
333
+ from diffusers.utils import load_image, export_to_video
334
+
335
+ pipe = StableVideoDiffusionPipeline.from_pretrained("stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16")
336
+ pipe.to("cuda")
337
+
338
+ image = load_image("https://lh3.googleusercontent.com/y-iFOHfLTwkuQSUegpwDdgKmOjRSTvPxat63dQLB25xkTs4lhIbRUFeNBWZzYf370g=s1200")
339
+ image = image.resize((1024, 576))
340
+
341
+ frames = pipe(image, num_frames=25, decode_chunk_size=8).frames[0]
342
+ export_to_video(frames, "generated.mp4", fps=7)
343
+ ```
344
+ """
345
+ # 0. Default height and width to unet
346
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
347
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
348
+
349
+ num_frames = num_frames if num_frames is not None else self.unet.config.num_frames
350
+ decode_chunk_size = decode_chunk_size if decode_chunk_size is not None else num_frames
351
+
352
+ # 1. Check inputs. Raise error if not correct
353
+ self.check_inputs(image, height, width)
354
+ self.check_inputs(ref_image, height, width)
355
+
356
+ # 2. Define call parameters
357
+ if isinstance(image, PIL.Image.Image):
358
+ batch_size = 1
359
+ elif isinstance(image, list):
360
+ batch_size = len(image)
361
+ else:
362
+ batch_size = image.shape[0]
363
+ device = self._execution_device
364
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
365
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
366
+ # corresponds to doing no classifier free guidance.
367
+ self._guidance_scale = max_guidance_scale
368
+
369
+ # 3. Encode input image
370
+ image_embeddings = self._encode_image(image, device, num_videos_per_prompt, self.do_classifier_free_guidance)
371
+ ref_image_embeddings = self._encode_image(ref_image, device, num_videos_per_prompt, self.do_classifier_free_guidance)
372
+
373
+ # NOTE: Stable Video Diffusion was conditioned on fps - 1, which
374
+ # is why it is reduced here.
375
+ # See: https://github.com/Stability-AI/generative-models/blob/ed0997173f98eaf8f4edf7ba5fe8f15c6b877fd3/scripts/sampling/simple_video_sample.py#L188
376
+ fps = fps - 1
377
+
378
+ # 4. Encode input image using VAE
379
+ image = self.image_processor.preprocess(image, height=height, width=width).to(device)
380
+ ref_image = self.image_processor.preprocess(ref_image, height=height, width=width).to(device)
381
+ noise = randn_tensor(image.shape, generator=generator, device=image.device, dtype=image.dtype)
382
+ image = image + noise_aug_strength * noise
383
+ ref_image = ref_image + noise_aug_strength * noise
384
+
385
+ needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
386
+ if needs_upcasting:
387
+ self.vae.to(dtype=torch.float32)
388
+
389
+
390
+ # Repeat the image latents for each frame so we can concatenate them with the noise
391
+ # image_latents [batch, channels, height, width] -> [batch, num_frames, channels, height, width]
392
+ image_latent = self._encode_vae_image(image, device, num_videos_per_prompt, self.do_classifier_free_guidance)
393
+ image_latent = image_latent.to(image_embeddings.dtype)
394
+ image_latents = image_latent.unsqueeze(1).repeat(1, num_frames, 1, 1, 1)
395
+
396
+ ref_image_latent = self._encode_vae_image(ref_image, device, num_videos_per_prompt, self.do_classifier_free_guidance)
397
+ ref_image_latent = ref_image_latent.to(ref_image_embeddings.dtype)
398
+ ref_image_latents = ref_image_latent.unsqueeze(1).repeat(1, num_frames, 1, 1, 1)
399
+
400
+ # cast back to fp16 if needed
401
+ if needs_upcasting:
402
+ self.vae.to(dtype=torch.float16)
403
+
404
+ # 5. Get Added Time IDs
405
+ added_time_ids = self._get_add_time_ids(
406
+ fps,
407
+ motion_bucket_id,
408
+ noise_aug_strength,
409
+ image_embeddings.dtype,
410
+ batch_size,
411
+ num_videos_per_prompt,
412
+ self.do_classifier_free_guidance,
413
+ )
414
+ added_time_ids = added_time_ids.to(device)
415
+
416
+ # 6. Prepare timesteps
417
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
418
+ timesteps = self.scheduler.timesteps
419
+
420
+ # 7. Prepare latent variables
421
+ num_channels_latents = self.unet.config.in_channels
422
+ latents = self.prepare_latents(
423
+ batch_size * num_videos_per_prompt,
424
+ num_frames,
425
+ num_channels_latents,
426
+ height,
427
+ width,
428
+ image_embeddings.dtype,
429
+ device,
430
+ generator,
431
+ latents,
432
+ )
433
+
434
+ # 8. Prepare guidance scale
435
+ guidance_scale = torch.linspace(min_guidance_scale, max_guidance_scale, num_frames).unsqueeze(0)
436
+ guidance_scale = guidance_scale.to(device, latents.dtype)
437
+ guidance_scale = guidance_scale.repeat(batch_size * num_videos_per_prompt, 1)
438
+ guidance_scale = _append_dims(guidance_scale, latents.ndim)
439
+ self._guidance_scale = guidance_scale
440
+
441
+ # 9. Denoising loop
442
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
443
+ self._num_timesteps = len(timesteps)
444
+ ref_unet = ref_unet.to(device)
445
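+ # the reference denoising trajectory starts from the same initial noise as the main trajectory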
+ ref_latents = latents.clone()
446
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
447
+ for i, t in enumerate(timesteps):
448
+ # expand the latents if we are doing classifier free guidance
449
+ ref_latent_model_input = torch.cat([ref_latents] * 2) if self.do_classifier_free_guidance else ref_latents
450
+ ref_latent_model_input = self.scheduler.scale_model_input(ref_latent_model_input, t)
451
+
452
+ latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
453
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
454
+
455
+
456
+ # Concatenate image_latents over the channels dimension
457
+ ref_latent_model_input = torch.cat([ref_latent_model_input, ref_image_latents], dim=2)
458
+ latent_model_input = torch.cat([latent_model_input, image_latents], dim=2)
459
+
460
+ # predict the noise residual
461
+ noise_pred_ref = ref_unet(
462
+ ref_latent_model_input,
463
+ t,
464
+ encoder_hidden_states=ref_image_embeddings,
465
+ added_time_ids=added_time_ids,
466
+ return_dict=False,
467
+ )[0]
468
+ noise_pred = self.unet(
469
+ latent_model_input,
470
+ t,
471
+ encoder_hidden_states=image_embeddings,
472
+ added_time_ids=added_time_ids,
473
+ return_dict=False,
474
+ )[0]
475
+ # perform guidance
476
+ if self.do_classifier_free_guidance:
477
+ noise_pred_uncond_ref, noise_pred_cond_ref = noise_pred_ref.chunk(2)
478
+ noise_pred_ref = noise_pred_uncond_ref + self.guidance_scale * (noise_pred_cond_ref - noise_pred_uncond_ref)
479
+
480
+ noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
481
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)
482
+
483
+
484
+ # compute the previous noisy sample x_t -> x_t-1
485
+ ref_latents = self.scheduler.step(noise_pred_ref, t, ref_latents).prev_sample
486
+ latents = self.scheduler.step(noise_pred, t, latents).prev_sample
487
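+ # advance the shared step index once per iteration; the custom scheduler's step() does not
+ # increment it, so both step() calls above use the same sigma index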
+ self.scheduler._step_index += 1
488
+
489
+ if callback_on_step_end is not None:
490
+ callback_kwargs = {}
491
+ for k in callback_on_step_end_tensor_inputs:
492
+ callback_kwargs[k] = locals()[k]
493
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
494
+
495
+ latents = callback_outputs.pop("latents", latents)
496
+
497
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
498
+ progress_bar.update()
499
+
500
+ if not output_type == "latent":
501
+ # cast back to fp16 if needed
502
+ if needs_upcasting:
503
+ self.vae.to(dtype=torch.float16)
504
+ frames = self.decode_latents(latents, num_frames, decode_chunk_size)
505
+ frames = tensor2vid(frames, self.image_processor, output_type=output_type)
506
+ else:
507
+ frames = latents
508
+
509
+ self.maybe_free_model_hooks()
510
+
511
+ if not return_dict:
512
+ return frames
513
+
514
+ return StableVideoDiffusionPipelineOutput(frames=frames)
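A minimal usage sketch for the pipeline above. The class name, call signature (`ref_unet`, `image`, `ref_image`), and the custom scheduler module come from the code in this commit; the pipeline's own import path, the example frame files, and the choice of reference UNet are illustrative assumptions.

```py
import torch
from diffusers import UNetSpatioTemporalConditionModel
from diffusers.utils import load_image, export_to_video

from custom_diffusers.schedulers.scheduling_euler_discrete import EulerDiscreteScheduler
# hypothetical import path for the pipeline file above; adjust to its actual location in this repo
from custom_diffusers.pipelines.pipeline_stable_video_diffusion_with_ref_attnmap import (
    StableVideoDiffusionWithRefAttnMapPipeline,
)

repo_id = "stabilityai/stable-video-diffusion-img2vid-xt"
# use the custom scheduler from this repo: its step() does not advance the step index,
# which the pipeline's denoising loop increments manually
scheduler = EulerDiscreteScheduler.from_pretrained(repo_id, subfolder="scheduler")
pipe = StableVideoDiffusionWithRefAttnMapPipeline.from_pretrained(
    repo_id, scheduler=scheduler, variant="fp16", torch_dtype=torch.float16
).to("cuda")

# a reference UNet denoised in parallel with the main UNet (here, the base img2vid model)
ref_unet = UNetSpatioTemporalConditionModel.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid", subfolder="unet",
    variant="fp16", torch_dtype=torch.float16,
)

image = load_image("frame1.png").resize((1024, 576))      # conditions the main branch
ref_image = load_image("frame2.png").resize((1024, 576))  # conditions the reference branch

frames = pipe(
    ref_unet=ref_unet, image=image, ref_image=ref_image,
    num_frames=14, decode_chunk_size=8,
).frames[0]
export_to_video(frames, "generated.mp4", fps=7)
```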
custom_diffusers/schedulers/scheduling_euler_discrete.py ADDED
@@ -0,0 +1,466 @@
1
+ import math
2
+ from dataclasses import dataclass
3
+ from typing import List, Optional, Tuple, Union
4
+
5
+ import numpy as np
6
+ import torch
7
+
8
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
9
+ from diffusers.utils import BaseOutput, logging
10
+ from diffusers.utils.torch_utils import randn_tensor
11
+ from diffusers.schedulers.scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin
12
+
13
+
14
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
15
+
16
+ from diffusers.schedulers.scheduling_euler_discrete import (EulerDiscreteSchedulerOutput,
17
+ betas_for_alpha_bar,
18
+ rescale_zero_terminal_snr
19
+ )
20
+
21
+ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
22
+ """
23
+ Euler scheduler.
24
+
25
+ This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
26
+ methods the library implements for all schedulers such as loading and saving.
27
+
28
+ Args:
29
+ num_train_timesteps (`int`, defaults to 1000):
30
+ The number of diffusion steps to train the model.
31
+ beta_start (`float`, defaults to 0.0001):
32
+ The starting `beta` value of inference.
33
+ beta_end (`float`, defaults to 0.02):
34
+ The final `beta` value.
35
+ beta_schedule (`str`, defaults to `"linear"`):
36
+ The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
37
+ `linear` or `scaled_linear`.
38
+ trained_betas (`np.ndarray`, *optional*):
39
+ Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
40
+ prediction_type (`str`, defaults to `epsilon`, *optional*):
41
+ Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
42
+ `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
43
+ Video](https://imagen.research.google/video/paper.pdf) paper).
44
+ interpolation_type(`str`, defaults to `"linear"`, *optional*):
45
+ The interpolation type to compute intermediate sigmas for the scheduler denoising steps. Should be one of
46
+ `"linear"` or `"log_linear"`.
47
+ use_karras_sigmas (`bool`, *optional*, defaults to `False`):
48
+ Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`,
49
+ the sigmas are determined according to a sequence of noise levels {σi}.
50
+ timestep_spacing (`str`, defaults to `"linspace"`):
51
+ The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
52
+ Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
53
+ steps_offset (`int`, defaults to 0):
54
+ An offset added to the inference steps, as required by some model families.
55
+ rescale_betas_zero_snr (`bool`, defaults to `False`):
56
+ Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
57
+ dark samples instead of limiting it to samples with medium brightness. Loosely related to
58
+ [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
59
+ """
60
+
61
+ _compatibles = [e.name for e in KarrasDiffusionSchedulers]
62
+ order = 1
63
+
64
+ @register_to_config
65
+ def __init__(
66
+ self,
67
+ num_train_timesteps: int = 1000,
68
+ beta_start: float = 0.0001,
69
+ beta_end: float = 0.02,
70
+ beta_schedule: str = "linear",
71
+ trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
72
+ prediction_type: str = "epsilon",
73
+ interpolation_type: str = "linear",
74
+ use_karras_sigmas: Optional[bool] = False,
75
+ sigma_min: Optional[float] = None,
76
+ sigma_max: Optional[float] = None,
77
+ timestep_spacing: str = "linspace",
78
+ timestep_type: str = "discrete", # can be "discrete" or "continuous"
79
+ steps_offset: int = 0,
80
+ rescale_betas_zero_snr: bool = False,
81
+ ):
82
+ if trained_betas is not None:
83
+ self.betas = torch.tensor(trained_betas, dtype=torch.float32)
84
+ elif beta_schedule == "linear":
85
+ self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
86
+ elif beta_schedule == "scaled_linear":
87
+ # this schedule is very specific to the latent diffusion model.
88
+ self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
89
+ elif beta_schedule == "squaredcos_cap_v2":
90
+ # Glide cosine schedule
91
+ self.betas = betas_for_alpha_bar(num_train_timesteps)
92
+ else:
93
+ raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
94
+
95
+ if rescale_betas_zero_snr:
96
+ self.betas = rescale_zero_terminal_snr(self.betas)
97
+
98
+ self.alphas = 1.0 - self.betas
99
+ self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
100
+
101
+ if rescale_betas_zero_snr:
102
+ # Close to 0 without being 0 so first sigma is not inf
103
+ # FP16 smallest positive subnormal works well here
104
+ self.alphas_cumprod[-1] = 2**-24
105
+
106
+ sigmas = (((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5).flip(0)
107
+ timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy()
108
+ timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32)
109
+
110
+ # setable values
111
+ self.num_inference_steps = None
112
+
113
+ # TODO: Support the full EDM scalings for all prediction types and timestep types
114
+ if timestep_type == "continuous" and prediction_type == "v_prediction":
115
+ self.timesteps = torch.Tensor([0.25 * sigma.log() for sigma in sigmas])
116
+ else:
117
+ self.timesteps = timesteps
118
+
119
+ self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)])
120
+
121
+ self.is_scale_input_called = False
122
+ self.use_karras_sigmas = use_karras_sigmas
123
+
124
+ self._step_index = None
125
+ self._begin_index = None
126
+ self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication
127
+
128
+ @property
129
+ def init_noise_sigma(self):
130
+ # standard deviation of the initial noise distribution
131
+ max_sigma = max(self.sigmas) if isinstance(self.sigmas, list) else self.sigmas.max()
132
+ if self.config.timestep_spacing in ["linspace", "trailing"]:
133
+ return max_sigma
134
+
135
+ return (max_sigma**2 + 1) ** 0.5
136
+
137
+ @property
138
+ def step_index(self):
139
+ """
140
+ The index counter for the current timestep. Unlike the stock diffusers scheduler, `step()` does not advance it automatically here; the calling pipeline increments it once per denoising iteration.
141
+ """
142
+ return self._step_index
143
+
144
+ @property
145
+ def begin_index(self):
146
+ """
147
+ The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
148
+ """
149
+ return self._begin_index
150
+
151
+ # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
152
+ def set_begin_index(self, begin_index: int = 0):
153
+ """
154
+ Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
155
+
156
+ Args:
157
+ begin_index (`int`):
158
+ The begin index for the scheduler.
159
+ """
160
+ self._begin_index = begin_index
161
+
162
+ def scale_model_input(
163
+ self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor]
164
+ ) -> torch.FloatTensor:
165
+ """
166
+ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
167
+ current timestep. Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm.
168
+
169
+ Args:
170
+ sample (`torch.FloatTensor`):
171
+ The input sample.
172
+ timestep (`int`, *optional*):
173
+ The current timestep in the diffusion chain.
174
+
175
+ Returns:
176
+ `torch.FloatTensor`:
177
+ A scaled input sample.
178
+ """
179
+ if self.step_index is None:
180
+ self._init_step_index(timestep)
181
+
182
+ sigma = self.sigmas[self.step_index]
183
+ sample = sample / ((sigma**2 + 1) ** 0.5)
184
+
185
+ self.is_scale_input_called = True
186
+ return sample
187
+
188
+ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
189
+ """
190
+ Sets the discrete timesteps used for the diffusion chain (to be run before inference).
191
+
192
+ Args:
193
+ num_inference_steps (`int`):
194
+ The number of diffusion steps used when generating samples with a pre-trained model.
195
+ device (`str` or `torch.device`, *optional*):
196
+ The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
197
+ """
198
+ self.num_inference_steps = num_inference_steps
199
+
200
+ # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891
201
+ if self.config.timestep_spacing == "linspace":
202
+ timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=np.float32)[
203
+ ::-1
204
+ ].copy()
205
+ elif self.config.timestep_spacing == "leading":
206
+ step_ratio = self.config.num_train_timesteps // self.num_inference_steps
207
+ # creates integer timesteps by multiplying by ratio
208
+ # casting to int to avoid issues when num_inference_step is power of 3
209
+ timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.float32)
210
+ timesteps += self.config.steps_offset
211
+ elif self.config.timestep_spacing == "trailing":
212
+ step_ratio = self.config.num_train_timesteps / self.num_inference_steps
213
+ # creates integer timesteps by multiplying by ratio
214
+ # casting to int to avoid issues when num_inference_step is power of 3
215
+ timesteps = (np.arange(self.config.num_train_timesteps, 0, -step_ratio)).round().copy().astype(np.float32)
216
+ timesteps -= 1
217
+ else:
218
+ raise ValueError(
219
+ f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'."
220
+ )
221
+
222
+ sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
223
+ log_sigmas = np.log(sigmas)
224
+
225
+ if self.config.interpolation_type == "linear":
226
+ sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
227
+ elif self.config.interpolation_type == "log_linear":
228
+ sigmas = torch.linspace(np.log(sigmas[-1]), np.log(sigmas[0]), num_inference_steps + 1).exp().numpy()
229
+ else:
230
+ raise ValueError(
231
+ f"{self.config.interpolation_type} is not implemented. Please specify interpolation_type to either"
232
+ " 'linear' or 'log_linear'"
233
+ )
234
+
235
+ if self.use_karras_sigmas:
236
+ sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=self.num_inference_steps)
237
+ timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas])
238
+
239
+ sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device)
240
+
241
+ # TODO: Support the full EDM scalings for all prediction types and timestep types
242
+ if self.config.timestep_type == "continuous" and self.config.prediction_type == "v_prediction":
243
+ self.timesteps = torch.Tensor([0.25 * sigma.log() for sigma in sigmas]).to(device=device)
244
+ else:
245
+ self.timesteps = torch.from_numpy(timesteps.astype(np.float32)).to(device=device)
246
+
247
+ self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)])
248
+ self._step_index = None
249
+ self._begin_index = None
250
+ self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication
251
+
252
+ def _sigma_to_t(self, sigma, log_sigmas):
253
+ # get log sigma
254
+ log_sigma = np.log(np.maximum(sigma, 1e-10))
255
+
256
+ # get distribution
257
+ dists = log_sigma - log_sigmas[:, np.newaxis]
258
+
259
+ # get sigmas range
260
+ low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2)
261
+ high_idx = low_idx + 1
262
+
263
+ low = log_sigmas[low_idx]
264
+ high = log_sigmas[high_idx]
265
+
266
+ # interpolate sigmas
267
+ w = (low - log_sigma) / (low - high)
268
+ w = np.clip(w, 0, 1)
269
+
270
+ # transform interpolation to time range
271
+ t = (1 - w) * low_idx + w * high_idx
272
+ t = t.reshape(sigma.shape)
273
+ return t
274
+
275
+ # Copied from https://github.com/crowsonkb/k-diffusion/blob/686dbad0f39640ea25c8a8c6a6e56bb40eacefa2/k_diffusion/sampling.py#L17
276
+ def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor:
277
+ """Constructs the noise schedule of Karras et al. (2022)."""
278
+
279
+ # Hack to make sure that other schedulers which copy this function don't break
280
+ # TODO: Add this logic to the other schedulers
281
+ if hasattr(self.config, "sigma_min"):
282
+ sigma_min = self.config.sigma_min
283
+ else:
284
+ sigma_min = None
285
+
286
+ if hasattr(self.config, "sigma_max"):
287
+ sigma_max = self.config.sigma_max
288
+ else:
289
+ sigma_max = None
290
+
291
+ sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item()
292
+ sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item()
293
+
294
+ rho = 7.0 # 7.0 is the value used in the paper
295
+ ramp = np.linspace(0, 1, num_inference_steps)
296
+ min_inv_rho = sigma_min ** (1 / rho)
297
+ max_inv_rho = sigma_max ** (1 / rho)
298
+ sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
299
+ return sigmas
300
+
301
+ def index_for_timestep(self, timestep, schedule_timesteps=None):
302
+ if schedule_timesteps is None:
303
+ schedule_timesteps = self.timesteps
304
+
305
+ indices = (schedule_timesteps == timestep).nonzero()
306
+
307
+ # The sigma index that is taken for the **very** first `step`
308
+ # is always the second index (or the last index if there is only 1)
309
+ # This way we can ensure we don't accidentally skip a sigma in
310
+ # case we start in the middle of the denoising schedule (e.g. for image-to-image)
311
+ pos = 1 if len(indices) > 1 else 0
312
+
313
+ return indices[pos].item()
314
+
315
+ def _init_step_index(self, timestep):
316
+ if self.begin_index is None:
317
+ if isinstance(timestep, torch.Tensor):
318
+ timestep = timestep.to(self.timesteps.device)
319
+ self._step_index = self.index_for_timestep(timestep)
320
+ else:
321
+ self._step_index = self._begin_index
322
+
323
+ def step(
324
+ self,
325
+ model_output: torch.FloatTensor,
326
+ timestep: Union[float, torch.FloatTensor],
327
+ sample: torch.FloatTensor,
328
+ s_churn: float = 0.0,
329
+ s_tmin: float = 0.0,
330
+ s_tmax: float = float("inf"),
331
+ s_noise: float = 1.0,
332
+ generator: Optional[torch.Generator] = None,
333
+ return_dict: bool = True
334
+ ) -> Union[EulerDiscreteSchedulerOutput, Tuple]:
335
+ """
336
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
337
+ process from the learned model outputs (most often the predicted noise).
338
+
339
+ Args:
340
+ model_output (`torch.FloatTensor`):
341
+ The direct output from the learned diffusion model.
342
+ timestep (`float`):
343
+ The current discrete timestep in the diffusion chain.
344
+ sample (`torch.FloatTensor`):
345
+ A current instance of a sample created by the diffusion process.
346
+ s_churn (`float`):
+ Amount of extra noise ("churn") injected at each step; `0.0` gives a deterministic Euler step.
347
+ s_tmin (`float`):
+ Lower bound of the sigma range in which churn is applied.
348
+ s_tmax (`float`):
+ Upper bound of the sigma range in which churn is applied.
349
+ s_noise (`float`, defaults to 1.0):
350
+ Scaling factor for noise added to the sample.
351
+ generator (`torch.Generator`, *optional*):
352
+ A random number generator.
353
+ return_dict (`bool`):
354
+ Whether or not to return a [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or
355
+ tuple.
356
+
357
+ Returns:
358
+ [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or `tuple`:
359
+ If return_dict is `True`, [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] is
360
+ returned, otherwise a tuple is returned where the first element is the sample tensor.
361
+ """
362
+
363
+ if (
364
+ isinstance(timestep, int)
365
+ or isinstance(timestep, torch.IntTensor)
366
+ or isinstance(timestep, torch.LongTensor)
367
+ ):
368
+ raise ValueError(
369
+ (
370
+ "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
371
+ " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass"
372
+ " one of the `scheduler.timesteps` as a timestep."
373
+ ),
374
+ )
375
+
376
+ if not self.is_scale_input_called:
377
+ logger.warning(
378
+ "The `scale_model_input` function should be called before `step` to ensure correct denoising. "
379
+ "See `StableDiffusionPipeline` for a usage example."
380
+ )
381
+
382
+ if self.step_index is None:
383
+ self._init_step_index(timestep)
384
+
385
+ # Upcast to avoid precision issues when computing prev_sample
386
+ sample = sample.to(torch.float32)
387
+
388
+ sigma = self.sigmas[self.step_index]
389
+
390
+ gamma = min(s_churn / (len(self.sigmas) - 1), 2**0.5 - 1) if s_tmin <= sigma <= s_tmax else 0.0
391
+
392
+ noise = randn_tensor(
393
+ model_output.shape, dtype=model_output.dtype, device=model_output.device, generator=generator
394
+ )
395
+
396
+ eps = noise * s_noise
397
+ sigma_hat = sigma * (gamma + 1)
398
+
399
+ if gamma > 0:
400
+ sample = sample + eps * (sigma_hat**2 - sigma**2) ** 0.5
401
+
402
+ # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
403
+ # NOTE: "original_sample" should not be an expected prediction_type but is left in for
404
+ # backwards compatibility
405
+ if self.config.prediction_type == "original_sample" or self.config.prediction_type == "sample":
406
+ pred_original_sample = model_output
407
+ elif self.config.prediction_type == "epsilon":
408
+ pred_original_sample = sample - sigma_hat * model_output
409
+ elif self.config.prediction_type == "v_prediction":
410
+ # denoised = model_output * c_out + input * c_skip
411
+ pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1))
412
+ else:
413
+ raise ValueError(
414
+ f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or `v_prediction`"
415
+ )
416
+
417
+ # 2. Convert to an ODE derivative
418
+ derivative = (sample - pred_original_sample) / sigma_hat
419
+
420
+ dt = self.sigmas[self.step_index + 1] - sigma_hat
421
+
422
+ prev_sample = sample + derivative * dt
423
+
424
+ # Cast sample back to model compatible dtype
425
+ prev_sample = prev_sample.to(model_output.dtype)
426
+
427
+ # if increment_step_idx:
428
+ # # upon completion increase step index by one
429
+ # self._step_index += 1
430
+
431
+ if not return_dict:
432
+ return (prev_sample,)
433
+
434
+ return EulerDiscreteSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample)
435
+
436
+ def add_noise(
437
+ self,
438
+ original_samples: torch.FloatTensor,
439
+ noise: torch.FloatTensor,
440
+ timesteps: torch.FloatTensor,
441
+ ) -> torch.FloatTensor:
442
+ # Make sure sigmas and timesteps have the same device and dtype as original_samples
443
+ sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
444
+ if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
445
+ # mps does not support float64
446
+ schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32)
447
+ timesteps = timesteps.to(original_samples.device, dtype=torch.float32)
448
+ else:
449
+ schedule_timesteps = self.timesteps.to(original_samples.device)
450
+ timesteps = timesteps.to(original_samples.device)
451
+
452
+ # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index
453
+ if self.begin_index is None:
454
+ step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
455
+ else:
456
+ step_indices = [self.begin_index] * timesteps.shape[0]
457
+
458
+ sigma = sigmas[step_indices].flatten()
459
+ while len(sigma.shape) < len(original_samples.shape):
460
+ sigma = sigma.unsqueeze(-1)
461
+
462
+ noisy_samples = original_samples + noise * sigma
463
+ return noisy_samples
464
+
465
+ def __len__(self):
466
+ return self.config.num_train_timesteps
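For reference, a self-contained sketch of the update that `step()` performs for the default `epsilon` prediction type with no churn (`s_churn=0`, so `sigma_hat == sigma`). The sigma values and tensors are toy placeholders, not values taken from the scheduler.

```py
import torch

# toy stand-ins for one denoising step
sigma, sigma_next = 10.0, 8.0              # two consecutive entries of scheduler.sigmas
sample = torch.randn(1, 4, 8, 8) * sigma   # current noisy latents
model_output = torch.randn(1, 4, 8, 8)     # UNet epsilon prediction (already CFG-combined)

# epsilon parameterization: estimate of the clean sample
pred_original_sample = sample - sigma * model_output

# convert to an ODE derivative and take one explicit Euler step toward sigma_next
derivative = (sample - pred_original_sample) / sigma   # equals model_output here
dt = sigma_next - sigma                                # negative, since sigmas decrease
prev_sample = sample + derivative * dt

print(prev_sample.shape)  # torch.Size([1, 4, 8, 8])
```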
dataset/stable_video_dataset.py ADDED
@@ -0,0 +1,70 @@
1
+ import os
2
+ from glob import glob
3
+ import random
4
+ import numpy as np
5
+ from PIL import Image
6
+ import torch
7
+ from torchvision import transforms
8
+ from torch.utils.data.dataset import Dataset
9
+
10
+ class StableVideoDataset(Dataset):
11
+ def __init__(self,
12
+ video_data_dir,
13
+ max_num_videos=None,
14
+ frame_hight=576, frame_width=1024, num_frames=14,
15
+ is_reverse_video=True,
16
+ random_seed=42,
17
+ double_sampling_rate=False,
18
+ ):
19
+ self.video_data_dir = video_data_dir
20
+ video_names = sorted([video for video in os.listdir(video_data_dir)
21
+ if os.path.isdir(os.path.join(video_data_dir, video))])
22
+
23
+ self.length = min(len(video_names), max_num_videos) if max_num_videos is not None else len(video_names)
24
+
25
+ self.video_names = video_names[:self.length]
26
+ if double_sampling_rate:
27
+ self.sample_frames = num_frames*2-1
28
+ self.sample_stride = 2
29
+ else:
30
+ self.sample_frames = num_frames
31
+ self.sample_stride = 1
32
+
33
+ self.frame_width = frame_width
34
+ self.frame_height = frame_hight
35
+ self.pixel_transforms = transforms.Compose([
36
+ transforms.Resize((self.frame_height, self.frame_width), interpolation=transforms.InterpolationMode.BILINEAR),
37
+ transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
38
+ ])
39
+ self.is_reverse_video = is_reverse_video
40
+ np.random.seed(random_seed)
41
+
42
+ def get_batch(self, idx):
43
+ video_name = self.video_names[idx]
44
+ video_frame_paths = sorted(glob(os.path.join(self.video_data_dir, video_name, '*.png')))
45
+ start_idx = np.random.randint(len(video_frame_paths)-self.sample_frames+1)
46
+ video_frame_paths = video_frame_paths[start_idx:start_idx+self.sample_frames:self.sample_stride]
47
+ video_frames = [np.asarray(Image.open(frame_path).convert('RGB')).astype(np.float32)/255.0 for frame_path in video_frame_paths]
48
+ video_frames = np.stack(video_frames, axis=0)
49
+ pixel_values = torch.from_numpy(video_frames.transpose(0, 3, 1, 2))
50
+ return pixel_values
51
+
52
+ def __len__(self):
53
+ return self.length
54
+
55
+ def __getitem__(self, idx):
56
+ while True:
57
+ try:
58
+ pixel_values = self.get_batch(idx)
59
+ break
60
+
61
+ except Exception as e:
62
+ idx = random.randint(0, self.length-1)
63
+
64
+ pixel_values = self.pixel_transforms(pixel_values)
65
+ conditions = pixel_values[-1]
66
+ if self.is_reverse_video:
67
+ pixel_values = torch.flip(pixel_values, (0,))
68
+
69
+ sample = dict(pixel_values=pixel_values, conditions=conditions)
70
+ return sample
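A short usage sketch for the dataset above, assuming the repo root is on `PYTHONPATH` and that `video_data_dir` contains one sub-folder of PNG frames per clip (as `get_batch` expects); the directory name and loader settings are placeholders.

```py
from torch.utils.data import DataLoader

from dataset.stable_video_dataset import StableVideoDataset

dataset = StableVideoDataset(
    video_data_dir="data/videos",   # placeholder: data/videos/<clip_name>/*.png
    num_frames=14,
    is_reverse_video=True,          # reverse the clip so the conditioning frame becomes its first frame
)
loader = DataLoader(dataset, batch_size=1, shuffle=True, num_workers=2)

batch = next(iter(loader))
# pixel_values: [batch, 14, 3, 576, 1024] frames normalized to roughly [-1, 1]
# conditions:   [batch, 3, 576, 1024], the clip's last frame taken before the optional reversal
print(batch["pixel_values"].shape, batch["conditions"].shape)
```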
enviroment.yml ADDED
@@ -0,0 +1,45 @@
1
+ name: diffusers-0-27-0
2
+ channels:
3
+ - pytorch
4
+ - defaults
5
+ dependencies:
6
+ - python=3.8.5
7
+ - pip=20.3
8
+ - cudatoolkit=11.8
9
+ - pytorch=2.0.1
10
+ - torchvision=0.15.2
11
+ - numpy=1.23.1
12
+ - pip:
13
+ - diffusers==0.27.0
14
+ - albumentations==0.4.3
15
+ - opencv-python==4.6.0.66
16
+ - pudb==2019.2
17
+ - imageio==2.9.0
18
+ - imageio-ffmpeg==0.4.2
19
+ - omegaconf==2.1.1
20
+ - test-tube>=0.7.5
21
+ - einops==0.3.0
22
+ - torch-fidelity==0.3.0
23
+ - torchmetrics==0.11.0
24
+ - transformers==4.36.0
25
+ - webdataset==0.2.5
26
+ - open-clip-torch==2.7.0
27
+ - invisible-watermark>=0.1.5
28
+ - accelerate==0.25.0
29
+ - xformers==0.0.23
30
+ - peft==0.7.0
31
+ - torch-ema==0.3
32
+ - moviepy
33
+ - tensorboard
34
+ - Jinja2
35
+ - ftfy
36
+ - datasets
37
+ - wandb
38
+ - pytorch-fid
39
+ - notebook
40
+ - matplotlib
41
+ - kornia==0.7.2
42
+ - -e git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers
43
+ - -e git+https://github.com/openai/CLIP.git@main#egg=clip
44
+ - -e git+https://github.com/Stability-AI/stablediffusion.git@main#egg=stable-diffusion
45
+
eval/val/0010.png ADDED
eval/val/0022.png ADDED
eval/val/0023.png ADDED

Git LFS Details

  • SHA256: fbf804b9a829b708a4698cedd4b2cc70f9e6b16e1a671e5bff594394122db6e1
  • Pointer size: 133 Bytes
  • Size of remote file: 12.9 MB
eval/val/turtle.png ADDED

Git LFS Details

  • SHA256: bc9cdd3271757d37650e587245b747d09707dc65477294bbf1aac1c0a3985c92
  • Pointer size: 132 Bytes
  • Size of remote file: 1.12 MB
examples/.gitignore ADDED
@@ -0,0 +1 @@
1
+ results/
examples/example_001.gif ADDED

Git LFS Details

  • SHA256: b08620761ee8449b26900784c1e54b169100931dd7230d8aef13f2e1e0b7c284
  • Pointer size: 133 Bytes
  • Size of remote file: 10.6 MB
examples/example_001/frame1.png ADDED

Git LFS Details

  • SHA256: c3ab7448fba42a26f635205ea90a61d13f6836cbbad324ff609321f8e7bc9296
  • Pointer size: 132 Bytes
  • Size of remote file: 6.66 MB
examples/example_001/frame2.png ADDED

Git LFS Details

  • SHA256: d089752c7ce7195635d3e20b208fc1cc223ec43347b75ac9e53fad66964d275a
  • Pointer size: 132 Bytes
  • Size of remote file: 6.48 MB
examples/example_002.gif ADDED

Git LFS Details

  • SHA256: be15ea62b0445164414f12812c74b72e0e400e2bd827e0eedd6cc295e2eb4e4c
  • Pointer size: 132 Bytes
  • Size of remote file: 4.8 MB
examples/example_002/frame1.png ADDED

Git LFS Details

  • SHA256: 6b5af056e973dae58a713aaff80af4c133e4a30aae2f302c2e4d8e2dc5c8e005
  • Pointer size: 132 Bytes
  • Size of remote file: 8.94 MB
examples/example_002/frame2.png ADDED

Git LFS Details

  • SHA256: 023f2a6086ff8a08cf9c229d2cd46f5a446b79b49d12f44284463e8544beacc3
  • Pointer size: 133 Bytes
  • Size of remote file: 10.1 MB
examples/example_003.gif ADDED

Git LFS Details

  • SHA256: 3381db3eaff8b5e95f9f9bd7f28300bcdebae65db55a4d5a06af9e26623e9135
  • Pointer size: 132 Bytes
  • Size of remote file: 6.15 MB
examples/example_003/frame1.png ADDED
examples/example_003/frame2.png ADDED

Git LFS Details

  • SHA256: a0ae1a3a7ae144726ee7d68569d0fcbc9cefba54b1264ceb745ec9c7d00e532e
  • Pointer size: 132 Bytes
  • Size of remote file: 1.01 MB
examples/example_004.gif ADDED

Git LFS Details

  • SHA256: 265bf9ca119401185f94faa3671d4889473c4e80b6fa4ce11a2e0f2b77708bd7
  • Pointer size: 132 Bytes
  • Size of remote file: 6.37 MB
examples/example_004/frame1.png ADDED

Git LFS Details

  • SHA256: e41bd198625d72a5521e79e2cfc4fd29b29d1987f3876700c5e99f8616ee0dc7
  • Pointer size: 132 Bytes
  • Size of remote file: 5.86 MB
examples/example_004/frame2.png ADDED

Git LFS Details

  • SHA256: 189fdb1db0dbda49b5cdc135d955391a15e401687603b440a3958f0bf2750f80
  • Pointer size: 132 Bytes
  • Size of remote file: 6.89 MB
gradio_app.py ADDED
@@ -0,0 +1,137 @@
1
+ import os
+ import shutil
2
+ import gradio as gr
3
+ import torch
4
+
5
+ # import argparse
6
+
7
+ checkpoint_dir = "checkpoints/svd_reverse_motion_with_attnflip"
8
+
9
+ from diffusers.utils import load_image, export_to_video
10
+ from diffusers import UNetSpatioTemporalConditionModel
11
+ from custom_diffusers.pipelines.pipeline_frame_interpolation_with_noise_injection import FrameInterpolationWithNoiseInjectionPipeline
12
+ from custom_diffusers.schedulers.scheduling_euler_discrete import EulerDiscreteScheduler
13
+ from attn_ctrl.attention_control import (AttentionStore,
14
+ register_temporal_self_attention_control,
15
+ register_temporal_self_attention_flip_control,
16
+ )
17
+
18
+
19
+ pretrained_model_name_or_path = "stabilityai/stable-video-diffusion-img2vid-xt"
20
+ noise_scheduler = EulerDiscreteScheduler.from_pretrained(pretrained_model_name_or_path, subfolder="scheduler")
21
+
22
+ pipe = FrameInterpolationWithNoiseInjectionPipeline.from_pretrained(
23
+ pretrained_model_name_or_path,
24
+ scheduler=noise_scheduler,
25
+ variant="fp16",
26
+ torch_dtype=torch.float16,
27
+ )
28
+ ref_unet = pipe.ori_unet
29
+
30
+ state_dict = pipe.unet.state_dict()
31
+ # compute the weight delta between the fine-tuned reverse-motion UNet and the original img2vid UNet
+ # for the selected temporal-attention projections, and merge it into the pipeline's UNet loaded above
32
+ finetuned_unet = UNetSpatioTemporalConditionModel.from_pretrained(
33
+ checkpoint_dir,
34
+ subfolder="unet",
35
+ torch_dtype=torch.float16,
36
+ )
37
+ assert finetuned_unet.config.num_frames==14
38
+ ori_unet = UNetSpatioTemporalConditionModel.from_pretrained(
39
+ "stabilityai/stable-video-diffusion-img2vid",
40
+ subfolder="unet",
41
+ variant='fp16',
42
+ torch_dtype=torch.float16,
43
+ )
44
+
45
+ finetuned_state_dict = finetuned_unet.state_dict()
46
+ ori_state_dict = ori_unet.state_dict()
47
+ for name, param in finetuned_state_dict.items():
48
+ if 'temporal_transformer_blocks.0.attn1.to_v' in name or "temporal_transformer_blocks.0.attn1.to_out.0" in name:
49
+ delta_w = param - ori_state_dict[name]
50
+ state_dict[name] = state_dict[name] + delta_w
51
+ pipe.unet.load_state_dict(state_dict)
52
+
53
+ controller_ref= AttentionStore()
54
+ register_temporal_self_attention_control(ref_unet, controller_ref)
55
+
56
+ controller = AttentionStore()
57
+ register_temporal_self_attention_flip_control(pipe.unet, controller, controller_ref)
58
+
59
+ device = "cuda"
60
+ pipe = pipe.to(device)
61
+
62
+ def check_outputs_folder(folder_path):
63
+ # Check if the folder exists
64
+ if os.path.exists(folder_path) and os.path.isdir(folder_path):
65
+ # Delete all contents inside the folder
66
+ for filename in os.listdir(folder_path):
67
+ file_path = os.path.join(folder_path, filename)
68
+ try:
69
+ if os.path.isfile(file_path) or os.path.islink(file_path):
70
+ os.unlink(file_path) # Remove file or link
71
+ elif os.path.isdir(file_path):
72
+ shutil.rmtree(file_path) # Remove directory
73
+ except Exception as e:
74
+ print(f'Failed to delete {file_path}. Reason: {e}')
75
+ else:
76
+ print(f'The folder {folder_path} does not exist.')
77
+
78
+ def infer(frame1_path, frame2_path):
79
+
80
+ seed = 42
81
+ num_inference_steps = 25
82
+ noise_injection_steps = 0
83
+ noise_injection_ratio = 0.5
84
+ weighted_average = True
85
+
86
+ generator = torch.Generator(device)
87
+ if seed is not None:
88
+ generator = generator.manual_seed(seed)
89
+
90
+
91
+ frame1 = load_image(frame1_path)
92
+ frame1 = frame1.resize((1024, 576))
93
+
94
+ frame2 = load_image(frame2_path)
95
+ frame2 = frame2.resize((1024, 576))
96
+
97
+ frames = pipe(image1=frame1, image2=frame2,
98
+ num_inference_steps=num_inference_steps, # 50
99
+ generator=generator,
100
+ weighted_average=weighted_average, # True
101
+ noise_injection_steps=noise_injection_steps, # 0
102
+ noise_injection_ratio= noise_injection_ratio, # 0.5
103
+ ).frames[0]
104
+
105
+ out_dir = "result"
106
+
107
+ check_outputs_folder(out_dir)
108
+ os.makedirs(out_dir, exist_ok=True)
109
+ out_path = "result/video_result.mp4"
110
+
111
+ if out_path.endswith('.gif'):
112
+ frames[0].save(out_path, save_all=True, append_images=frames[1:], duration=142, loop=0)
113
+ else:
114
+ export_to_video(frames, out_path, fps=7)
115
+
116
+ return out_path
117
+
118
+ with gr.Blocks() as demo:
119
+
120
+ with gr.Column():
121
+ gr.Markdown("# Keyframe Interpolation with Stable Video Diffusion")
122
+ with gr.Row():
123
+ with gr.Column():
124
+ image_input1 = gr.Image(type="filepath")
125
+ image_input2 = gr.Image(type="filepath")
126
+ submit_btn = gr.Button("Submit")
127
+ with gr.Column():
128
+ output = gr.Video()
129
+
130
+ submit_btn.click(
131
+ fn = infer,
132
+ inputs = [image_input1, image_input2],
133
+ outputs = [output],
134
+ show_api = False
135
+ )
136
+
137
+ demo.queue().launch(show_api=False, show_error=True)
keyframe_interpolation.py ADDED
@@ -0,0 +1,98 @@
1
+ import os
2
+ import torch
3
+ import argparse
4
+ import copy
5
+ from diffusers.utils import load_image, export_to_video
6
+ from diffusers import UNetSpatioTemporalConditionModel
7
+ from custom_diffusers.pipelines.pipeline_frame_interpolation_with_noise_injection import FrameInterpolationWithNoiseInjectionPipeline
8
+ from custom_diffusers.schedulers.scheduling_euler_discrete import EulerDiscreteScheduler
9
+ from attn_ctrl.attention_control import (AttentionStore,
10
+ register_temporal_self_attention_control,
11
+ register_temporal_self_attention_flip_control,
12
+ )
13
+
14
+ def main(args):
15
+
16
+ noise_scheduler = EulerDiscreteScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
17
+ pipe = FrameInterpolationWithNoiseInjectionPipeline.from_pretrained(
18
+ args.pretrained_model_name_or_path,
19
+ scheduler=noise_scheduler,
20
+ variant="fp16",
21
+ torch_dtype=torch.float16,
22
+ )
23
+ ref_unet = pipe.ori_unet
24
+
25
+
26
+ state_dict = pipe.unet.state_dict()
27
+ # compute the weight delta between the fine-tuned reverse-motion UNet and the original img2vid UNet
+ # for the selected temporal-attention projections, and merge it into the pipeline's UNet loaded above
28
+ finetuned_unet = UNetSpatioTemporalConditionModel.from_pretrained(
29
+ args.checkpoint_dir,
30
+ subfolder="unet",
31
+ torch_dtype=torch.float16,
32
+ )
33
+ assert finetuned_unet.config.num_frames==14
34
+ ori_unet = UNetSpatioTemporalConditionModel.from_pretrained(
35
+ "stabilityai/stable-video-diffusion-img2vid",
36
+ subfolder="unet",
37
+ variant='fp16',
38
+ torch_dtype=torch.float16,
39
+ )
40
+
41
+ finetuned_state_dict = finetuned_unet.state_dict()
42
+ ori_state_dict = ori_unet.state_dict()
43
+ for name, param in finetuned_state_dict.items():
44
+ if 'temporal_transformer_blocks.0.attn1.to_v' in name or "temporal_transformer_blocks.0.attn1.to_out.0" in name:
45
+ delta_w = param - ori_state_dict[name]
46
+ state_dict[name] = state_dict[name] + delta_w
47
+ pipe.unet.load_state_dict(state_dict)
48
+
49
+ controller_ref= AttentionStore()
50
+ register_temporal_self_attention_control(ref_unet, controller_ref)
51
+
52
+ controller = AttentionStore()
53
+ register_temporal_self_attention_flip_control(pipe.unet, controller, controller_ref)
54
+
55
+ pipe = pipe.to(args.device)
56
+
57
+ # run inference
58
+ generator = torch.Generator(device=args.device)
59
+ if args.seed is not None:
60
+ generator = generator.manual_seed(args.seed)
61
+
62
+
63
+ frame1 = load_image(args.frame1_path)
64
+ frame1 = frame1.resize((1024, 576))
65
+
66
+ frame2 = load_image(args.frame2_path)
67
+ frame2 = frame2.resize((1024, 576))
68
+
69
+ frames = pipe(image1=frame1, image2=frame2,
70
+ num_inference_steps=args.num_inference_steps,
71
+ generator=generator,
72
+ weighted_average=args.weighted_average,
73
+ noise_injection_steps=args.noise_injection_steps,
74
+ noise_injection_ratio= args.noise_injection_ratio,
75
+ ).frames[0]
76
+
77
+ if args.out_path.endswith('.gif'):
78
+ frames[0].save(args.out_path, save_all=True, append_images=frames[1:], duration=142, loop=0)
79
+ else:
80
+ export_to_video(frames, args.out_path, fps=7)
81
+
82
+ if __name__ == '__main__':
83
+ parser = argparse.ArgumentParser()
84
+ parser.add_argument("--pretrained_model_name_or_path", type=str, default="stabilityai/stable-video-diffusion-img2vid-xt")
85
+ parser.add_argument("--checkpoint_dir", type=str, required=True)
86
+ parser.add_argument('--frame1_path', type=str, required=True)
87
+ parser.add_argument('--frame2_path', type=str, required=True)
88
+ parser.add_argument('--out_path', type=str, required=True)
89
+ parser.add_argument('--seed', type=int, default=42)
90
+ parser.add_argument('--num_inference_steps', type=int, default=50)
91
+ parser.add_argument('--weighted_average', action='store_true')
92
+ parser.add_argument('--noise_injection_steps', type=int, default=0)
93
+ parser.add_argument('--noise_injection_ratio', type=float, default=0.5)
94
+ parser.add_argument('--device', type=str, default='cuda:0')
95
+ args = parser.parse_args()
96
+ out_dir = os.path.dirname(args.out_path)
97
+ os.makedirs(out_dir, exist_ok=True)
98
+ main(args)
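Note: the script transfers only a weight delta, not the full fine-tuned checkpoint. A minimal sketch of that delta-w transfer on plain tensors (the key name is taken from the script above; the shapes and values are made up for illustration):

# Illustration of the delta-w transfer used in keyframe_interpolation.py, on toy tensors.
import torch

def transfer_delta_w(target_sd, finetuned_sd, original_sd, key_filters):
    # Add (finetuned - original) onto the target weights for every key matching a filter.
    merged = dict(target_sd)
    for name, param in finetuned_sd.items():
        if any(f in name for f in key_filters):
            merged[name] = merged[name] + (param - original_sd[name])
    return merged

key = "temporal_transformer_blocks.0.attn1.to_v.weight"
target = {key: torch.zeros(4, 4)}        # stands in for the SVD-XT UNet weight
original = {key: torch.ones(4, 4)}       # stands in for the original 14-frame SVD UNet weight
finetuned = {key: torch.ones(4, 4) * 3}  # stands in for the fine-tuned reverse-motion weight
merged = transfer_delta_w(target, finetuned, original, [key])
assert torch.allclose(merged[key], torch.full((4, 4), 2.0))  # delta_w = 2 was added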
keyframe_interpolation.sh ADDED
@@ -0,0 +1,26 @@
+ #!/bin/bash
+ noise_injection_steps=5
+ noise_injection_ratio=0.5
+ EVAL_DIR=examples
+ CHECKPOINT_DIR=checkpoints/svd_reverse_motion_with_attnflip
+ MODEL_NAME=stabilityai/stable-video-diffusion-img2vid-xt
+ OUT_DIR=results
+
+ mkdir -p $OUT_DIR
+ for example_dir in $(ls -d $EVAL_DIR/*)
+ do
+     example_name=$(basename $example_dir)
+     echo $example_name
+
+     out_fn=$OUT_DIR/$example_name'.gif'
+     python keyframe_interpolation.py \
+         --frame1_path=$example_dir/frame1.png \
+         --frame2_path=$example_dir/frame2.png \
+         --pretrained_model_name_or_path=$MODEL_NAME \
+         --checkpoint_dir=$CHECKPOINT_DIR \
+         --noise_injection_steps=$noise_injection_steps \
+         --noise_injection_ratio=$noise_injection_ratio \
+         --out_path=$out_fn
+ done
+
+
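For reference, the same batch run can be driven from Python rather than the shell loop above; a sketch only, assuming the examples/ folders and checkpoint directory from this repository are in place:

# Sketch: batch keyframe interpolation over the example folders via subprocess.
import subprocess
from pathlib import Path

EVAL_DIR = Path("examples")
OUT_DIR = Path("results")
OUT_DIR.mkdir(parents=True, exist_ok=True)

for example_dir in sorted(p for p in EVAL_DIR.iterdir() if p.is_dir()):
    subprocess.run([
        "python", "keyframe_interpolation.py",
        f"--frame1_path={example_dir / 'frame1.png'}",
        f"--frame2_path={example_dir / 'frame2.png'}",
        "--pretrained_model_name_or_path=stabilityai/stable-video-diffusion-img2vid-xt",
        "--checkpoint_dir=checkpoints/svd_reverse_motion_with_attnflip",
        "--noise_injection_steps=5",
        "--noise_injection_ratio=0.5",
        f"--out_path={OUT_DIR / (example_dir.name + '.gif')}",
    ], check=True)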
requirements.txt ADDED
@@ -0,0 +1,35 @@
+ torch==2.0.1
+ torchvision==0.15.2
+ numpy==1.23.1
+ diffusers==0.27.0
+ albumentations==0.4.3
+ opencv-python==4.6.0.66
+ pudb==2019.2
+ imageio==2.9.0
+ imageio-ffmpeg==0.4.2
+ omegaconf==2.1.1
+ test-tube>=0.7.5
+ einops==0.3.0
+ torch-fidelity==0.3.0
+ torchmetrics==0.11.0
+ transformers==4.36.0
+ webdataset==0.2.5
+ open-clip-torch==2.7.0
+ invisible-watermark>=0.1.5
+ accelerate==0.25.0
+ xformers==0.0.23
+ peft==0.7.0
+ torch-ema==0.3
+ moviepy
+ tensorboard
+ Jinja2
+ ftfy
+ datasets
+ wandb
+ pytorch-fid
+ notebook
+ matplotlib
+ kornia==0.7.2
+ -e git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers
+ -e git+https://github.com/openai/CLIP.git@main#egg=clip
+ -e git+https://github.com/Stability-AI/stablediffusion.git@main#egg=stable-diffusion
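Several of the pins above are interdependent (torch, diffusers, transformers, xformers), so a quick post-install sanity check can save debugging time; a small sketch:

# Sketch: confirm the installed versions match the pins in requirements.txt.
from importlib.metadata import version

pins = {"torch": "2.0.1", "diffusers": "0.27.0", "transformers": "4.36.0", "accelerate": "0.25.0", "xformers": "0.0.23"}
for pkg, expected in pins.items():
    installed = version(pkg)
    print(f"{pkg}: {installed}" + ("" if installed == expected else f" (expected {expected})"))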
train_reverse_motion_with_attnflip.py ADDED
@@ -0,0 +1,591 @@
1
+ """Fine-tuning script for Stable Video Diffusion for image2video with support for LoRA."""
2
+ import logging
3
+ import math
4
+ import os
5
+ import shutil
6
+ from glob import glob
7
+ from pathlib import Path
8
+ from PIL import Image
9
+
10
+ import accelerate
11
+ import datasets
12
+ import numpy as np
13
+ import torch
14
+ import torch.nn.functional as F
15
+ import torch.utils.checkpoint
16
+
17
+ from einops import rearrange
18
+ import transformers
19
+ from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
20
+
21
+ from accelerate import Accelerator
22
+ from accelerate.logging import get_logger
23
+ from accelerate.utils import ProjectConfiguration, set_seed
24
+ from packaging import version
25
+ from tqdm.auto import tqdm
26
+ import copy
27
+
28
+ import diffusers
29
+ from diffusers import AutoencoderKLTemporalDecoder
30
+ from diffusers import UNetSpatioTemporalConditionModel
31
+ from diffusers.optimization import get_scheduler
32
+ from diffusers.training_utils import cast_training_params
33
+ from diffusers.utils import check_min_version, is_wandb_available
34
+ from diffusers.utils.import_utils import is_xformers_available
35
+ from diffusers.utils.torch_utils import is_compiled_module
36
+ from diffusers.pipelines.stable_video_diffusion.pipeline_stable_video_diffusion import _resize_with_antialiasing
37
+
38
+
39
+ from custom_diffusers.pipelines.pipeline_stable_video_diffusion_with_ref_attnmap import StableVideoDiffusionWithRefAttnMapPipeline
40
+ from custom_diffusers.schedulers.scheduling_euler_discrete import EulerDiscreteScheduler
41
+ from attn_ctrl.attention_control import (AttentionStore,
42
+ register_temporal_self_attention_control,
43
+ register_temporal_self_attention_flip_control,
44
+ )
45
+ from utils.parse_args import parse_args
46
+ from dataset.stable_video_dataset import StableVideoDataset
47
+
48
+ logger = get_logger(__name__, log_level="INFO")
49
+
50
+ def rand_log_normal(shape, loc=0., scale=1., device='cpu', dtype=torch.float32):
51
+ """Draws samples from a lognormal distribution."""
52
+ u = torch.rand(shape, dtype=dtype, device=device) * (1 - 2e-7) + 1e-7
53
+ return torch.distributions.Normal(loc, scale).icdf(u).exp()
54
+
55
+ def main():
56
+ args = parse_args()
57
+
58
+ logging_dir = Path(args.output_dir, args.logging_dir)
59
+
60
+ accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
61
+
62
+ accelerator = Accelerator(
63
+ gradient_accumulation_steps=args.gradient_accumulation_steps,
64
+ mixed_precision=args.mixed_precision,
65
+ log_with=args.report_to,
66
+ project_config=accelerator_project_config,
67
+ )
68
+ if args.report_to == "wandb":
69
+ if not is_wandb_available():
70
+ raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
71
+ import wandb
72
+
73
+ # Make one log on every process with the configuration for debugging.
74
+ logging.basicConfig(
75
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
76
+ datefmt="%m/%d/%Y %H:%M:%S",
77
+ level=logging.INFO,
78
+ )
79
+ logger.info(accelerator.state, main_process_only=False)
80
+ if accelerator.is_local_main_process:
81
+ datasets.utils.logging.set_verbosity_warning()
82
+ transformers.utils.logging.set_verbosity_warning()
83
+ diffusers.utils.logging.set_verbosity_info()
84
+ else:
85
+ datasets.utils.logging.set_verbosity_error()
86
+ transformers.utils.logging.set_verbosity_error()
87
+ diffusers.utils.logging.set_verbosity_error()
88
+
89
+ # If passed along, set the training seed now.
90
+ if args.seed is not None:
91
+ set_seed(args.seed)
92
+
93
+ # Handle the repository creation
94
+ if accelerator.is_main_process:
95
+ if args.output_dir is not None:
96
+ os.makedirs(args.output_dir, exist_ok=True)
97
+
98
+ # Load scheduler, tokenizer and models.
99
+ noise_scheduler = EulerDiscreteScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
100
+ feature_extractor = CLIPImageProcessor.from_pretrained(args.pretrained_model_name_or_path, subfolder="feature_extractor")
101
+ image_encoder = CLIPVisionModelWithProjection.from_pretrained(
102
+ args.pretrained_model_name_or_path, subfolder="image_encoder", variant=args.variant
103
+ )
104
+ vae = AutoencoderKLTemporalDecoder.from_pretrained(
105
+ args.pretrained_model_name_or_path, subfolder="vae", variant=args.variant
106
+ )
107
+ unet = UNetSpatioTemporalConditionModel.from_pretrained(
108
+ args.pretrained_model_name_or_path, subfolder="unet", low_cpu_mem_usage=True, variant=args.variant
109
+ )
110
+ ref_unet = copy.deepcopy(unet)
111
+
112
+ # register customized attn processors
113
+ controller_ref = AttentionStore()
114
+ register_temporal_self_attention_control(ref_unet, controller_ref)
115
+
116
+ controller = AttentionStore()
117
+ register_temporal_self_attention_flip_control(unet, controller, controller_ref)
118
+
119
+ # freeze parameters of models to save more memory
120
+ ref_unet.requires_grad_(False)
121
+ unet.requires_grad_(False)
122
+ vae.requires_grad_(False)
123
+ image_encoder.requires_grad_(False)
124
+
125
+ # For mixed precision training we cast all non-trainable weights (vae, image_encoder and the reference unet) to half-precision
126
+ # as these weights are only used for inference, keeping weights in full precision is not required.
127
+ weight_dtype = torch.float32
128
+ if accelerator.mixed_precision == "fp16":
129
+ weight_dtype = torch.float16
130
+ elif accelerator.mixed_precision == "bf16":
131
+ weight_dtype = torch.bfloat16
132
+
133
+ # Move unet, vae and image_encoder to device and cast to weight_dtype
134
+ # unet.to(accelerator.device, dtype=weight_dtype)
135
+ vae.to(accelerator.device, dtype=weight_dtype)
136
+ image_encoder.to(accelerator.device, dtype=weight_dtype)
137
+ ref_unet.to(accelerator.device, dtype=weight_dtype)
138
+
139
+ unet_train_params_list = []
140
+ # Select the parameters to train: only the temporal self-attention to_v and to_out.0 projections are unfrozen; adjust the filter below to train a different subset.
141
+ for name, para in unet.named_parameters():
142
+ if 'temporal_transformer_blocks.0.attn1.to_v.weight' in name or 'temporal_transformer_blocks.0.attn1.to_out.0.weight' in name:
143
+ unet_train_params_list.append(para)
144
+ para.requires_grad = True
145
+ else:
146
+ para.requires_grad = False
147
+
148
+
149
+ if args.mixed_precision == "fp16":
150
+ # only upcast trainable parameters into fp32
151
+ cast_training_params(unet, dtype=torch.float32)
152
+
153
+ if args.enable_xformers_memory_efficient_attention:
154
+ if is_xformers_available():
155
+ import xformers
156
+
157
+ xformers_version = version.parse(xformers.__version__)
158
+ if xformers_version == version.parse("0.0.16"):
159
+ logger.warn(
160
+ "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
161
+ )
162
+ unet.enable_xformers_memory_efficient_attention()
163
+ else:
164
+ raise ValueError("xformers is not available. Make sure it is installed correctly")
165
+
166
+ # `accelerate` 0.16.0 will have better support for customized saving
167
+ if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
168
+ # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
169
+ def save_model_hook(models, weights, output_dir):
170
+ if accelerator.is_main_process:
171
+ for i, model in enumerate(models):
172
+ model.save_pretrained(os.path.join(output_dir, "unet"))
173
+
174
+ # make sure to pop weight so that corresponding model is not saved again
175
+ weights.pop()
176
+
177
+ def load_model_hook(models, input_dir):
178
+ for _ in range(len(models)):
179
+ # pop models so that they are not loaded again
180
+ model = models.pop()
181
+
182
+ # load diffusers style into model
183
+ load_model = UNetSpatioTemporalConditionModel.from_pretrained(input_dir, subfolder="unet")
184
+ model.register_to_config(**load_model.config)
185
+
186
+ model.load_state_dict(load_model.state_dict())
187
+ del load_model
188
+
189
+ accelerator.register_save_state_pre_hook(save_model_hook)
190
+ accelerator.register_load_state_pre_hook(load_model_hook)
191
+
192
+ if args.gradient_checkpointing:
193
+ unet.enable_gradient_checkpointing()
194
+
195
+ if args.gradient_checkpointing:
196
+ unet.enable_gradient_checkpointing()
197
+
198
+ if accelerator.is_main_process:
199
+ rec_txt1 = open('frozen_param.txt', 'w')
200
+ rec_txt2 = open('train_param.txt', 'w')
201
+ for name, para in unet.named_parameters():
202
+ if para.requires_grad is False:
203
+ rec_txt1.write(f'{name}\n')
204
+ else:
205
+ rec_txt2.write(f'{name}\n')
206
+ rec_txt1.close()
207
+ rec_txt2.close()
208
+
209
+ # Enable TF32 for faster training on Ampere GPUs,
210
+ # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
211
+ if args.allow_tf32:
212
+ torch.backends.cuda.matmul.allow_tf32 = True
213
+
214
+ if args.scale_lr:
215
+ args.learning_rate = (
216
+ args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
217
+ )
218
+
219
+ # Initialize the optimizer
220
+ optimizer = torch.optim.AdamW(
221
+ unet_train_params_list,
222
+ lr=args.learning_rate,
223
+ betas=(args.adam_beta1, args.adam_beta2),
224
+ weight_decay=args.adam_weight_decay,
225
+ eps=args.adam_epsilon,
226
+ )
227
+
228
+ def unwrap_model(model):
229
+ model = accelerator.unwrap_model(model)
230
+ model = model._orig_mod if is_compiled_module(model) else model
231
+ return model
232
+
233
+ train_dataset = StableVideoDataset(video_data_dir=args.train_data_dir,
234
+ max_num_videos=args.max_train_samples,
235
+ num_frames=args.num_frames,
236
+ is_reverse_video=True,
237
+ double_sampling_rate=args.double_sampling_rate)
238
+ def collate_fn(examples):
239
+ pixel_values = torch.stack([example["pixel_values"] for example in examples])
240
+ pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
241
+ conditions = torch.stack([example["conditions"] for example in examples])
242
+ conditions = conditions.to(memory_format=torch.contiguous_format).float()
243
+ return {"pixel_values": pixel_values, "conditions": conditions}
244
+
245
+ # DataLoaders creation:
246
+ train_dataloader = torch.utils.data.DataLoader(
247
+ train_dataset,
248
+ shuffle=True,
249
+ collate_fn=collate_fn,
250
+ batch_size=args.train_batch_size,
251
+ num_workers=args.dataloader_num_workers,
252
+ )
253
+
254
+ # Validation data
255
+ if args.validation_data_dir is not None:
256
+ validation_image_paths = sorted(glob(os.path.join(args.validation_data_dir, '*.png')))
257
+ num_validation_images = min(args.num_validation_images, len(validation_image_paths))
258
+ validation_image_paths = validation_image_paths[:num_validation_images]
259
+ validation_images = [Image.open(image_path).convert('RGB').resize((1024, 576)) for image_path in validation_image_paths]
260
+
261
+
262
+ # Scheduler and math around the number of training steps.
263
+ overrode_max_train_steps = False
264
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
265
+ if args.max_train_steps is None:
266
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
267
+ overrode_max_train_steps = True
268
+
269
+ lr_scheduler = get_scheduler(
270
+ args.lr_scheduler,
271
+ optimizer=optimizer,
272
+ num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
273
+ num_training_steps=args.max_train_steps * accelerator.num_processes,
274
+ )
275
+
276
+ # Prepare everything with our `accelerator`.
277
+ unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
278
+ unet, optimizer, train_dataloader, lr_scheduler
279
+ )
280
+
281
+ # We need to recalculate our total training steps as the size of the training dataloader may have changed.
282
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
283
+ if overrode_max_train_steps:
284
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
285
+ # Afterwards we recalculate our number of training epochs
286
+ args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
287
+
288
+ # We need to initialize the trackers we use, and also store our configuration.
289
+ # The trackers initializes automatically on the main process.
290
+ if accelerator.is_main_process:
291
+ accelerator.init_trackers("image2video-reverse-fine-tune", config=vars(args))
292
+
293
+ # Train!
294
+ total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
295
+
296
+ logger.info("***** Running training *****")
297
+ logger.info(f" Num examples = {len(train_dataset)}")
298
+ logger.info(f" Num Epochs = {args.num_train_epochs}")
299
+ logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
300
+ logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
301
+ logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
302
+ logger.info(f" Total optimization steps = {args.max_train_steps}")
303
+ global_step = 0
304
+ first_epoch = 0
305
+
306
+ # Potentially load in the weights and states from a previous save
307
+ if args.resume_from_checkpoint:
308
+ if args.resume_from_checkpoint != "latest":
309
+ path = os.path.basename(args.resume_from_checkpoint)
310
+ else:
311
+ # Get the most recent checkpoint
312
+ dirs = os.listdir(args.output_dir)
313
+ dirs = [d for d in dirs if d.startswith("checkpoint")]
314
+ dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
315
+ path = dirs[-1] if len(dirs) > 0 else None
316
+
317
+ if path is None:
318
+ accelerator.print(
319
+ f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
320
+ )
321
+ args.resume_from_checkpoint = None
322
+ initial_global_step = 0
323
+ else:
324
+ accelerator.print(f"Resuming from checkpoint {path}")
325
+ accelerator.load_state(os.path.join(args.output_dir, path))
326
+ global_step = int(path.split("-")[1])
327
+
328
+ initial_global_step = global_step
329
+ first_epoch = global_step // num_update_steps_per_epoch
330
+ else:
331
+ initial_global_step = 0
332
+
333
+ progress_bar = tqdm(
334
+ range(0, args.max_train_steps),
335
+ initial=initial_global_step,
336
+ desc="Steps",
337
+ # Only show the progress bar once on each machine.
338
+ disable=not accelerator.is_local_main_process,
339
+ )
340
+
341
+ # default motion param setting
342
+ def _get_add_time_ids(
343
+ dtype,
344
+ batch_size,
345
+ fps=6,
346
+ motion_bucket_id=127,
347
+ noise_aug_strength=0.02,
348
+ ):
349
+ add_time_ids = [fps, motion_bucket_id, noise_aug_strength]
350
+ passed_add_embed_dim = unet.module.config.addition_time_embed_dim * \
351
+ len(add_time_ids)
352
+ expected_add_embed_dim = unet.module.add_embedding.linear_1.in_features
353
+ assert (expected_add_embed_dim == passed_add_embed_dim)
354
+
355
+ add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
356
+ add_time_ids = add_time_ids.repeat(batch_size, 1)
357
+ return add_time_ids
358
+
359
+ def compute_image_embeddings(image):
360
+ image = _resize_with_antialiasing(image, (224, 224))
361
+ image = (image + 1.0) / 2.0
362
+ # Normalize the image for CLIP input
363
+ image = feature_extractor(
364
+ images=image,
365
+ do_normalize=True,
366
+ do_center_crop=False,
367
+ do_resize=False,
368
+ do_rescale=False,
369
+ return_tensors="pt",
370
+ ).pixel_values
371
+
372
+ image = image.to(accelerator.device).to(dtype=weight_dtype)
373
+ image_embeddings = image_encoder(image).image_embeds
374
+ image_embeddings = image_embeddings.unsqueeze(1)
375
+ return image_embeddings
376
+
377
+ noise_aug_strength = 0.02
378
+ fps=7
379
+ for epoch in range(first_epoch, args.num_train_epochs):
380
+ unet.train()
381
+ train_loss = 0.0
382
+ for step, batch in enumerate(train_dataloader):
383
+ with accelerator.accumulate(unet):
384
+ # Get the image embedding for conditioning
385
+ encoder_hidden_states = compute_image_embeddings(batch["conditions"])
386
+ encoder_hidden_states_ref = compute_image_embeddings(batch["pixel_values"][:, -1])
387
+
388
+ batch["conditions"] = batch["conditions"].to(accelerator.device).to(dtype=weight_dtype)
389
+ batch["pixel_values"] = batch["pixel_values"].to(accelerator.device).to(dtype=weight_dtype)
390
+
391
+ # Get the image latent for input conditioning
392
+ noise = torch.randn_like(batch["conditions"])
393
+ conditions = batch["conditions"] + noise_aug_strength * noise
394
+ conditions_latent = vae.encode(conditions).latent_dist.mode()
395
+ conditions_latent = conditions_latent.unsqueeze(1).repeat(1, args.num_frames, 1, 1, 1)
396
+
397
+ conditions_ref = batch["pixel_values"][:, -1] + noise_aug_strength * noise
398
+ conditions_latent_ref = vae.encode(conditions_ref).latent_dist.mode()
399
+ conditions_latent_ref = conditions_latent_ref.unsqueeze(1).repeat(1, args.num_frames, 1, 1, 1)
400
+
401
+ # Convert frames to latent space
402
+ pixel_values = rearrange(batch["pixel_values"], "b f c h w -> (b f) c h w")
403
+ latents = vae.encode(pixel_values).latent_dist.sample()
404
+ latents = latents * vae.config.scaling_factor
405
+ latents = rearrange(latents, "(b f) c h w -> b f c h w", f=args.num_frames)
406
+ latents_ref = torch.flip(latents, dims=(1,))
407
+
408
+ # Sample noise that we'll add to the latents
409
+ noise = torch.randn_like(latents)
410
+ if args.noise_offset:
411
+ # https://www.crosslabs.org//blog/diffusion-with-offset-noise
412
+ noise += args.noise_offset * torch.randn(
413
+ (latents.shape[0], latents.shape[1], latents.shape[2], 1, 1), device=latents.device
414
+ )
415
+
416
+ bsz = latents.shape[0]
417
+ # Sample a random timestep for each image
418
+ # P_mean=0.7 P_std=1.6
419
+ sigmas = rand_log_normal(shape=[bsz,], loc=0.7, scale=1.6).to(latents.device)
420
+ # EDM preconditioning: the UNet is conditioned on c_noise = 0.25 * log(sigma) as its timestep
421
+ # (the forward-diffusion noising itself happens a few lines below)
422
+ sigmas = sigmas[:, None, None, None, None]
423
+ timesteps = torch.Tensor(
424
+ [0.25 * sigma.log() for sigma in sigmas]).to(accelerator.device)
425
+
426
+ # Add noise to the latents according to the noise magnitude at each timestep
427
+ # (this is the forward diffusion process)
428
+ noisy_latents = latents + noise * sigmas
429
+ noisy_latents_inp = noisy_latents / ((sigmas**2 + 1) ** 0.5)
430
+ noisy_latents_inp = torch.cat([noisy_latents_inp, conditions_latent], dim=2)
431
+
432
+ noisy_latents_ref = latents_ref + torch.flip(noise, dims=(1,)) * sigmas
433
+ noisy_latents_ref_inp = noisy_latents_ref / ((sigmas**2 + 1) ** 0.5)
434
+ noisy_latents_ref_inp = torch.cat([noisy_latents_ref_inp, conditions_latent_ref], dim=2)
435
+
436
+ # Get the target for loss depending on the prediction type
437
+ target = latents
438
+ # Predict the noise residual and compute loss
439
+ added_time_ids = _get_add_time_ids(encoder_hidden_states.dtype, bsz).to(accelerator.device)
440
+ ref_model_pred = ref_unet(noisy_latents_ref_inp.to(weight_dtype), timesteps.to(weight_dtype),
441
+ encoder_hidden_states=encoder_hidden_states_ref,
442
+ added_time_ids=added_time_ids,
443
+ return_dict=False)[0]
444
+ model_pred = unet(noisy_latents_inp, timesteps,
445
+ encoder_hidden_states=encoder_hidden_states,
446
+ added_time_ids=added_time_ids,
447
+ return_dict=False)[0] # v-prediction
448
+ # Denoise the latents
449
+ c_out = -sigmas / ((sigmas**2 + 1)**0.5)
450
+ c_skip = 1 / (sigmas**2 + 1)
451
+ denoised_latents = model_pred * c_out + c_skip * noisy_latents
452
+ weighing = (1 + sigmas ** 2) * (sigmas**-2.0)
453
+
454
+ # MSE loss
455
+ loss = torch.mean(
456
+ (weighing.float() * (denoised_latents.float() -
457
+ target.float()) ** 2).reshape(target.shape[0], -1),
458
+ dim=1,
459
+ )
460
+ loss = loss.mean()
461
+ # Gather the losses across all processes for logging (if we use distributed training).
462
+ avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
463
+ train_loss += avg_loss.item() / args.gradient_accumulation_steps
464
+
465
+ # Backpropagate
466
+ accelerator.backward(loss)
467
+ if accelerator.sync_gradients:
468
+ params_to_clip = unet_train_params_list
469
+ accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
470
+ optimizer.step()
471
+ lr_scheduler.step()
472
+ optimizer.zero_grad()
473
+
474
+ # Checks if the accelerator has performed an optimization step behind the scenes
475
+ if accelerator.sync_gradients:
476
+ progress_bar.update(1)
477
+ global_step += 1
478
+ accelerator.log({"train_loss": train_loss}, step=global_step)
479
+ train_loss = 0.0
480
+
481
+ if global_step % args.checkpointing_steps == 0:
482
+ if accelerator.is_main_process:
483
+ # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
484
+ if args.checkpoints_total_limit is not None:
485
+ checkpoints = os.listdir(args.output_dir)
486
+ checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
487
+ checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
488
+
489
+ # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
490
+ if len(checkpoints) >= args.checkpoints_total_limit:
491
+ num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
492
+ removing_checkpoints = checkpoints[0:num_to_remove]
493
+
494
+ logger.info(
495
+ f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
496
+ )
497
+ logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
498
+
499
+ for removing_checkpoint in removing_checkpoints:
500
+ removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
501
+ shutil.rmtree(removing_checkpoint)
502
+
503
+ save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
504
+ accelerator.save_state(save_path)
505
+ logger.info(f"Saved state to {save_path}")
506
+
507
+ logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
508
+ progress_bar.set_postfix(**logs)
509
+
510
+ if global_step >= args.max_train_steps:
511
+ break
512
+
513
+ if accelerator.is_main_process:
514
+ if args.validation_data_dir is not None and epoch % args.validation_epochs == 0:
515
+ logger.info(
516
+ f"Running validation... \n Generating videos for {args.num_validation_images} validation images from"
517
+ f" {args.validation_data_dir}."
518
+ )
519
+ # create pipeline
520
+ pipeline = StableVideoDiffusionWithRefAttnMapPipeline.from_pretrained(
521
+ args.pretrained_model_name_or_path,
522
+ scheduler=noise_scheduler,
523
+ unet=unwrap_model(unet),
524
+ variant=args.variant,
525
+ torch_dtype=weight_dtype,
526
+ )
527
+ pipeline = pipeline.to(accelerator.device)
528
+ pipeline.set_progress_bar_config(disable=True)
529
+
530
+ # run inference
531
+ generator = torch.Generator(device=accelerator.device)
532
+ if args.seed is not None:
533
+ generator = generator.manual_seed(args.seed)
534
+ videos = []
535
+ with torch.cuda.amp.autocast():
536
+ for val_idx in range(num_validation_images):
537
+ val_img = validation_images[val_idx]
538
+ videos.append(
539
+ pipeline(ref_unet=ref_unet, image=val_img, ref_image=val_img, num_inference_steps=50, generator=generator, output_type='pt').frames[0]
540
+ )
541
+
542
+ for tracker in accelerator.trackers:
543
+ if tracker.name == "tensorboard":
544
+ videos = torch.stack(videos)
545
+ tracker.writer.add_video("validation", videos, epoch, fps=fps)
546
+
547
+ del pipeline
548
+ torch.cuda.empty_cache()
549
+
550
+ # Save the fine-tuned UNet together with the full pipeline
551
+ accelerator.wait_for_everyone()
552
+ if accelerator.is_main_process:
553
+ unet = unet.to(torch.float32)
554
+
555
+ unwrapped_unet = unwrap_model(unet)
556
+ pipeline = StableVideoDiffusionWithRefAttnMapPipeline.from_pretrained(
557
+ args.pretrained_model_name_or_path,
558
+ scheduler=noise_scheduler,
559
+ unet=unwrapped_unet,
560
+ variant=args.variant,
561
+ )
562
+ pipeline.save_pretrained(args.output_dir)
563
+ # Final inference
564
+ # Load previous pipeline
565
+ if args.validation_data_dir is not None:
566
+ pipeline = pipeline.to(accelerator.device)
567
+ pipeline.torch_dtype = weight_dtype
568
+ # run inference
569
+ generator = torch.Generator(device=accelerator.device)
570
+ if args.seed is not None:
571
+ generator = generator.manual_seed(args.seed)
572
+ videos = []
573
+ with torch.cuda.amp.autocast():
574
+ for val_idx in range(num_validation_images):
575
+ val_img = validation_images[val_idx]
576
+ videos.append(
577
+ pipeline(ref_unet=ref_unet, image=val_img, ref_image=val_img, num_inference_steps=50, generator=generator, output_type='pt').frames[0]
578
+ )
579
+
580
+
581
+ for tracker in accelerator.trackers:
582
+ if len(videos) != 0:
583
+ if tracker.name == "tensorboard":
584
+ videos = torch.stack(videos)
585
+ tracker.writer.add_video("validation", videos, epoch, fps=fps)
586
+
587
+ accelerator.end_training()
588
+
589
+
590
+ if __name__ == "__main__":
591
+ main()
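The training loss above follows the EDM-style preconditioning used for SVD: sigmas are drawn from a log-normal with P_mean=0.7 and P_std=1.6, the UNet is conditioned on 0.25*log(sigma), and its prediction is mapped back to a denoised sample with c_out and c_skip before a sigma-weighted MSE. A stripped-down sketch of just that arithmetic on dummy tensors (shapes are illustrative, and a random tensor stands in for the UNet output):

# Sketch of the EDM-style weighting from the training loop above, on dummy latents.
import torch

def rand_log_normal(shape, loc=0.7, scale=1.6):
    u = torch.rand(shape) * (1 - 2e-7) + 1e-7
    return torch.distributions.Normal(loc, scale).icdf(u).exp()

latents = torch.randn(2, 14, 4, 8, 8)             # (batch, frames, channels, h, w), toy sizes
noise = torch.randn_like(latents)
sigmas = rand_log_normal([latents.shape[0]])[:, None, None, None, None]

noisy = latents + noise * sigmas
unet_input = noisy / (sigmas**2 + 1) ** 0.5       # c_in scaling of the noisy latents
c_noise = 0.25 * sigmas.log()                     # timestep conditioning passed to the UNet

model_pred = torch.randn_like(latents)            # stand-in for the UNet prediction
c_out = -sigmas / (sigmas**2 + 1) ** 0.5
c_skip = 1 / (sigmas**2 + 1)
denoised = model_pred * c_out + c_skip * noisy

weighing = (1 + sigmas**2) * sigmas**-2.0
loss = (weighing * (denoised - latents) ** 2).reshape(latents.shape[0], -1).mean(1).mean()
print(loss)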
train_reverse_motion_with_attnflip.sh ADDED
@@ -0,0 +1,20 @@
+ MODEL_NAME=stabilityai/stable-video-diffusion-img2vid
+ TRAIN_DIR=../keyframe_interpolation_data/synthetic_videos_frames
+ VALIDATION_DIR=eval/val
+ accelerate launch --mixed_precision="fp16" train_reverse_motion_with_attnflip.py \
+     --pretrained_model_name_or_path=$MODEL_NAME \
+     --variant "fp16" \
+     --num_frames 14 \
+     --train_data_dir=$TRAIN_DIR \
+     --validation_data_dir=$VALIDATION_DIR \
+     --max_train_samples=100 \
+     --train_batch_size=1 \
+     --gradient_accumulation_steps 1 \
+     --num_train_epochs=1000 --checkpointing_steps=2000 \
+     --validation_epochs=50 \
+     --learning_rate=1e-04 --lr_scheduler="constant" --lr_warmup_steps=0 \
+     --seed=42 \
+     --double_sampling_rate \
+     --output_dir="checkpoints/svd_reverse_motion_with_attnflip" \
+     --cache_dir="checkpoints/svd_reverse_motion_with_attnflip_cache" \
+     --report_to="tensorboard"
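If a run is interrupted, training can be resumed by adding --resume_from_checkpoint latest to the command above; the training script then picks the checkpoint directory with the highest step number. That selection logic in isolation, as a sketch (directory contents are hypothetical):

# Sketch: the "latest" checkpoint selection used by train_reverse_motion_with_attnflip.py.
import os

def latest_checkpoint(output_dir):
    dirs = [d for d in os.listdir(output_dir) if d.startswith("checkpoint")]
    dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
    return os.path.join(output_dir, dirs[-1]) if dirs else None

print(latest_checkpoint("checkpoints/svd_reverse_motion_with_attnflip"))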
utils/parse_args.py ADDED
@@ -0,0 +1,224 @@
1
+ import os
2
+ import argparse
3
+ def parse_args():
4
+ parser = argparse.ArgumentParser(description="Simple example of a training script.")
5
+ parser.add_argument(
6
+ "--pretrained_model_name_or_path",
7
+ type=str,
8
+ default=None,
9
+ required=True,
10
+ help="Path to pretrained model or model identifier from huggingface.co/models.",
11
+ )
12
+ parser.add_argument(
13
+ "--variant",
14
+ type=str,
15
+ default=None,
16
+ help="Variant of the model files of the pretrained model identifier from huggingface.co/models, e.g. fp16",
17
+ )
18
+ parser.add_argument(
19
+ "--num_frames",
20
+ type=int,
21
+ default=25,
22
+ help="Number of frames that should be generated in the video.",
23
+ )
24
+ parser.add_argument(
25
+ "--train_data_dir",
26
+ type=str,
27
+ default=None,
28
+ required=True,
29
+ help=(
30
+ "A folder containing the training data: the extracted video frames used for fine-tuning,"
31
+ " with one sub-directory of frames per video (see dataset/stable_video_dataset.py for the"
32
+ " expected layout)."
33
+ ),
34
+ )
35
+ parser.add_argument(
36
+ "--max_train_samples",
37
+ type=int,
38
+ default=None,
39
+ help=(
40
+ "For debugging purposes or quicker training, truncate the number of training examples to this "
41
+ "value if set."
42
+ ),
43
+ )
44
+ parser.add_argument(
45
+ "--double_sampling_rate",
46
+ action="store_true",
47
+ help=(
48
+ "Whether or not to sample training frames at a doubled sampling rate."
49
+ ),
50
+ )
51
+ parser.add_argument(
52
+ "--validation_data_dir", type=str, default=None, help="A folder of validation images used during training for inference."
53
+ )
54
+ parser.add_argument(
55
+ "--num_validation_images",
56
+ type=int,
57
+ default=4,
58
+ help="Number of validation images for which videos are generated during validation.",
59
+ )
60
+ parser.add_argument(
61
+ "--validation_epochs",
62
+ type=int,
63
+ default=1,
64
+ help=(
65
+ "Run fine-tuning validation every X epochs. The validation process consists of generating a video"
66
+ " for each of `args.num_validation_images` images from `args.validation_data_dir`."
67
+ ),
68
+ )
69
+ parser.add_argument(
70
+ "--output_dir",
71
+ type=str,
72
+ default="sd-model-finetuned-lora",
73
+ help="The output directory where the model predictions and checkpoints will be written.",
74
+ )
75
+ parser.add_argument(
76
+ "--cache_dir",
77
+ type=str,
78
+ default=None,
79
+ help="The directory where the downloaded models and datasets will be stored.",
80
+ )
81
+ parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
82
+ parser.add_argument(
83
+ "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
84
+ )
85
+ parser.add_argument("--num_train_epochs", type=int, default=100)
86
+ parser.add_argument(
87
+ "--max_train_steps",
88
+ type=int,
89
+ default=None,
90
+ help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
91
+ )
92
+ parser.add_argument(
93
+ "--gradient_accumulation_steps",
94
+ type=int,
95
+ default=1,
96
+ help="Number of update steps to accumulate before performing a backward/update pass.",
97
+ )
98
+ parser.add_argument(
99
+ "--gradient_checkpointing",
100
+ action="store_true",
101
+ help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
102
+ )
103
+ parser.add_argument(
104
+ "--learning_rate",
105
+ type=float,
106
+ default=1e-4,
107
+ help="Initial learning rate (after the potential warmup period) to use.",
108
+ )
109
+ parser.add_argument(
110
+ "--scale_lr",
111
+ action="store_true",
112
+ default=False,
113
+ help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
114
+ )
115
+ parser.add_argument(
116
+ "--lr_scheduler",
117
+ type=str,
118
+ default="constant",
119
+ help=(
120
+ 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
121
+ ' "constant", "constant_with_warmup"]'
122
+ ),
123
+ )
124
+ parser.add_argument(
125
+ "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
126
+ )
127
+ parser.add_argument(
128
+ "--allow_tf32",
129
+ action="store_true",
130
+ help=(
131
+ "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
132
+ " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
133
+ ),
134
+ )
135
+ parser.add_argument(
136
+ "--dataloader_num_workers",
137
+ type=int,
138
+ default=0,
139
+ help=(
140
+ "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
141
+ ),
142
+ )
143
+ parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
144
+ parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
145
+ parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
146
+ parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
147
+ parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
148
+ parser.add_argument(
149
+ "--prediction_type",
150
+ type=str,
151
+ default=None,
152
+ help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediction_type` is chosen.",
153
+ )
154
+ parser.add_argument(
155
+ "--logging_dir",
156
+ type=str,
157
+ default="logs",
158
+ help=(
159
+ "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
160
+ " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
161
+ ),
162
+ )
163
+ parser.add_argument(
164
+ "--mixed_precision",
165
+ type=str,
166
+ default=None,
167
+ choices=["no", "fp16", "bf16"],
168
+ help=(
169
+ "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
170
+ " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the"
171
+ " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
172
+ ),
173
+ )
174
+ parser.add_argument(
175
+ "--report_to",
176
+ type=str,
177
+ default="tensorboard",
178
+ help=(
179
+ 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
180
+ ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
181
+ ),
182
+ )
183
+ parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
184
+ parser.add_argument(
185
+ "--checkpointing_steps",
186
+ type=int,
187
+ default=500,
188
+ help=(
189
+ "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
190
+ " training using `--resume_from_checkpoint`."
191
+ ),
192
+ )
193
+ parser.add_argument(
194
+ "--checkpoints_total_limit",
195
+ type=int,
196
+ default=None,
197
+ help=("Max number of checkpoints to store."),
198
+ )
199
+ parser.add_argument(
200
+ "--resume_from_checkpoint",
201
+ type=str,
202
+ default=None,
203
+ help=(
204
+ "Whether training should be resumed from a previous checkpoint. Use a path saved by"
205
+ ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
206
+ ),
207
+ )
208
+ parser.add_argument(
209
+ "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
210
+ )
211
+ parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.")
212
+ parser.add_argument(
213
+ "--rank",
214
+ type=int,
215
+ default=4,
216
+ help=("The dimension of the LoRA update matrices."),
217
+ )
218
+
219
+ args = parser.parse_args()
220
+ env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
221
+ if env_local_rank != -1 and env_local_rank != args.local_rank:
222
+ args.local_rank = env_local_rank
223
+
224
+ return args
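A minimal sketch of exercising parse_args programmatically (e.g. from a notebook or a test); the argument values below are placeholders taken from the training shell script:

# Sketch: call parse_args with a synthetic argv instead of the real command line.
import sys
from utils.parse_args import parse_args

sys.argv = [
    "train_reverse_motion_with_attnflip.py",
    "--pretrained_model_name_or_path", "stabilityai/stable-video-diffusion-img2vid",
    "--train_data_dir", "../keyframe_interpolation_data/synthetic_videos_frames",
    "--num_frames", "14",
    "--output_dir", "checkpoints/svd_reverse_motion_with_attnflip",
]
args = parse_args()
print(args.num_frames, args.output_dir)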