# Page header captured together with this file (not part of the configuration):
#   Spaces: PAIR / StreamingSVD
#   Running on Zero
#   File size: 11,862 Bytes
#   8fd2f2f

# pytorch_lightning==2.2.2
# Seed value for the run.
seed_everything: 33
# Trainer settings: one device on one node, 16-bit mixed precision, and
# logging switched off.
trainer:
  accelerator: auto
  strategy: auto
  # Quoted, so this is the string '1' rather than the integer 1.
  devices: '1'
  num_nodes: 1
  precision: 16-mixed
  # Canonical lowercase boolean (previously `False`); parses to the same value.
  logger: false
# Model definition. `class_path` names the class to build and `init_args`
# holds its arguments; the same class_path/init_args layout is used for the
# nested entries in this file.
model:
  class_path: diffusion_trainer.streaming_svd.StreamingSVD
  init_args:
    # VFI parameters: a local checkpoint path plus a remote location for it
    # (a Google Drive share link).
    # NOTE(review): presumably the remote link is the download source when the
    # local file is missing — confirm in VFIParams.
    vfi:
      # Trailing whitespace after the class path removed.
      class_path: modules.params.vfi.VFIParams
      init_args:
        ckpt_path_local: checkpoint/VFI/ours.pkl
        ckpt_path_global: https://drive.google.com/file/d/1XCNoyhA1RX3m8W-XJK8H8inH47l36kxP/view?usp=sharing
    # I2V-enhance parameters: a local checkpoint directory plus a remote
    # identifier (`ali-vilab/i2vgen-xl`).
    # NOTE(review): the identifier looks like a Hugging Face repo id — confirm
    # how I2VEnhanceParams resolves it.
    i2v_enhance:
      # Single space after the colon (was two).
      class_path: modules.params.i2v_enhance.I2VEnhanceParams
      init_args:
        ckpt_path_local: checkpoint/i2v_enhance/
        ckpt_path_global: ali-vilab/i2vgen-xl
    # Generic module loader. `module_names` lists eight modules, and each name
    # has a matching entry under `module_config`.
    module_loader:
      class_path: modules.loader.module_loader.GenericModuleLoader
      init_args:
        pipeline_repo: stabilityai/stable-video-diffusion-img2vid-xt
        pipeline_obj: streamingt2v_pipeline
        # Empty string, not null.
        set_prediction_type: ''
        # The listed order places each module after the modules it names in
        # its `dependent_modules` (network_config -> model -> controlnet).
        # NOTE(review): confirm whether the loader relies on this order.
        module_names:
        - network_config
        - model
        - controlnet
        - denoiser
        - conditioner
        - first_stage_model
        - sampler
        - svd_pipeline
        module_config:
          # ControlNet entry, described through ModuleLoaderConfig:
          # `loader_cls_path` names the class and `cls_func` is `from_unet`.
          # NOTE(review): presumably `cls_func` is the factory method called on
          # that class — confirm in ModuleLoaderConfig.
          controlnet:
            class_path: modules.loader.module_loader_config.ModuleLoaderConfig
            init_args:
              loader_cls_path: models.control.controlnet.ControlNet
              cls_func: from_unet
              cls_func_fast_dev_run: ''
              kwargs_diffusers: null
              model_params:
                merging_mode: addition
                zero_conv_mode: Identity
                # `none` is a plain string here; YAML null would be `null`.
                frame_expansion: none
                downsample_controlnet_cond: true
                use_image_encoder_normalization: true
                use_controlnet_mask: false
                condition_encoder: ''
                conditioning_embedding_out_channels:
                - 32
                - 96
                - 256
                - 512
              kwargs_diff_trainer_params: null
              args: []
              # Maps the name `model` to the `model` entry of module_config.
              dependent_modules:
                model: model
              dependent_modules_cloned: null
              # Empty path: no state-dict file is named for this module.
              state_dict_path: ''
              strict_loading: true
              state_dict_filters: []
          # VideoUNet entry: instantiated directly from `class_path` with the
          # `init_args` below. The `model` entry refers to it by name as its
          # `diffusion_model`.
          network_config:
            class_path: models.diffusion.video_model.VideoUNet
            init_args:
              in_channels: 8
              model_channels: 320
              out_channels: 4
              num_res_blocks: 2
              num_conditional_frames: null
              attention_resolutions:
              - 4
              - 2
              - 1
              dropout: 0.0
              channel_mult:
              - 1
              - 2
              - 4
              - 4
              conv_resample: true
              dims: 2
              # String value, not a class count.
              num_classes: sequential
              # Canonical lowercase boolean (previously `False`), matching the
              # other booleans in this block; parses to the same value.
              use_checkpoint: false
              # NOTE(review): -1 presumably means "derive the head count from
              # num_head_channels" — confirm in VideoUNet.
              num_heads: -1
              num_head_channels: 64
              num_heads_upsample: -1
              use_scale_shift_norm: false
              resblock_updown: false
              transformer_depth: 1
              transformer_depth_middle: null
              context_dim: 1024
              time_downup: false
              time_context_dim: null
              extra_ff_mix_layer: true
              use_spatial_context: true
              merge_strategy: learned_with_images
              merge_factor: 0.5
              spatial_transformer_attn_type: softmax-xformers
              video_kernel_size:
              - 3
              - 1
              - 1
              use_linear_in_transformer: true
              adm_in_channels: 768
              disable_temporal_crossattention: false
              max_ddpm_temb_period: 10000
              merging_mode: attention_cross_attention
              controlnet_mode: true
              use_apm: false
          # Wrapper entry: ModuleLoaderConfig naming OpenAIWrapper as the class,
          # with no `cls_func`.
          model:
            class_path: modules.loader.module_loader_config.ModuleLoaderConfig
            init_args:
              loader_cls_path: models.svd.sgm.modules.diffusionmodules.wrappers.OpenAIWrapper
              cls_func: ''
              cls_func_fast_dev_run: ''
              kwargs_diffusers:
                compile_model: false
              model_params: null
              model_params_fast_dev_run: null
              kwargs_diff_trainer_params: null
              args: []
              # Maps the name `diffusion_model` to the `network_config` entry
              # of module_config.
              dependent_modules:
                diffusion_model: network_config
              dependent_modules_cloned: null
              # Empty path: no state-dict file is named for this module.
              state_dict_path: ''
              strict_loading: true
              state_dict_filters: []
          # Denoiser entry: its only argument is a scaling config that names
          # the scaling class through `target`.
          denoiser:
            class_path: models.svd.sgm.modules.diffusionmodules.denoiser.Denoiser
            init_args:
              scaling_config:
                target: models.svd.sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise
          # Sampler entry: EulerEDMSampler with 30 steps, an AlignYourSteps
          # discretization config and a LinearPredictionGuider guider config.
          sampler:
            class_path: models.svd.sgm.modules.diffusionmodules.sampling.EulerEDMSampler
            init_args:
              s_churn: 0.0
              s_tmin: 0.0
              # `.inf` is the YAML float for positive infinity, not a string.
              s_tmax: .inf
              s_noise: 1.0
              discretization_config:
                target: models.diffusion.discretizer.AlignYourSteps
                params:
                  sigma_max: 700.0
              num_steps: 30
              guider_config:
                target: models.svd.sgm.modules.diffusionmodules.guiders.LinearPredictionGuider
                params:
                  max_scale: 3.0
                  min_scale: 1.5
                  num_frames: 25
              verbose: false
              device: cuda
          # Conditioner entry: GeneralConditioner with five embedders, all
          # marked `is_trainable: false`. Their input keys are
          # cond_frames_without_noise, fps_id, motion_bucket_id, cond_frames
          # and cond_aug.
          conditioner:
            class_path: models.svd.sgm.modules.GeneralConditioner
            init_args:
              emb_models:
              # Embedder 1: OpenCLIP image prediction embedder wrapping a
              # frozen OpenCLIP image embedder.
              - is_trainable: false
                input_key: cond_frames_without_noise
                target: models.svd.sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
                params:
                  n_cond_frames: 1
                  n_copies: 1
                  open_clip_embedding_config:
                    target: models.svd.sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
                    params:
                      freeze: true
              # Embedders 2 and 3: ConcatTimestepEmbedderND with outdim 256.
              - input_key: fps_id
                is_trainable: false
                target: models.svd.sgm.modules.encoders.modules.ConcatTimestepEmbedderND
                params:
                  outdim: 256
              - input_key: motion_bucket_id
                is_trainable: false
                target: models.svd.sgm.modules.encoders.modules.ConcatTimestepEmbedderND
                params:
                  outdim: 256
              # Embedder 4: video prediction embedder whose encoder is an
              # AutoencoderKLModeOnly, with encoder autocast disabled.
              - input_key: cond_frames
                is_trainable: false
                target: models.svd.sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
                params:
                  disable_encoder_autocast: true
                  n_cond_frames: 1
                  n_copies: 1
                  is_ae: true
                  encoder_config:
                    target: models.svd.sgm.models.autoencoder.AutoencoderKLModeOnly
                    params:
                      embed_dim: 4
                      monitor: val/rec_loss
                      ddconfig:
                        # Differs from first_stage_model, which uses `vanilla`.
                        attn_type: vanilla-xformers
                        double_z: true
                        z_channels: 4
                        resolution: 256
                        in_channels: 3
                        out_ch: 3
                        ch: 128
                        ch_mult:
                        - 1
                        - 2
                        - 4
                        - 4
                        num_res_blocks: 2
                        attn_resolutions: []
                        dropout: 0.0
                      lossconfig:
                        target: torch.nn.Identity
              # Embedder 5: ConcatTimestepEmbedderND with outdim 256.
              - input_key: cond_aug
                is_trainable: false
                target: models.svd.sgm.modules.encoders.modules.ConcatTimestepEmbedderND
                params:
                  outdim: 256
          # First-stage autoencoder entry: AutoencodingEngine with an Encoder
          # and a VideoDecoder. The two `params` mappings are identical except
          # that the decoder also sets `video_kernel_size`.
          first_stage_model:
            class_path: models.svd.sgm.AutoencodingEngine
            init_args:
              encoder_config:
                target: models.svd.sgm.modules.diffusionmodules.model.Encoder
                params:
                  attn_type: vanilla
                  double_z: true
                  z_channels: 4
                  resolution: 256
                  in_channels: 3
                  out_ch: 3
                  ch: 128
                  ch_mult:
                  - 1
                  - 2
                  - 4
                  - 4
                  num_res_blocks: 2
                  attn_resolutions: []
                  dropout: 0.0
              decoder_config:
                target: models.svd.sgm.modules.autoencoding.temporal_ae.VideoDecoder
                params:
                  attn_type: vanilla
                  double_z: true
                  z_channels: 4
                  resolution: 256
                  in_channels: 3
                  out_ch: 3
                  ch: 128
                  ch_mult:
                  - 1
                  - 2
                  - 4
                  - 4
                  num_res_blocks: 2
                  attn_resolutions: []
                  dropout: 0.0
                  video_kernel_size:
                  - 3
                  - 1
                  - 1
              # The loss target is torch.nn.Identity.
              loss_config:
                target: torch.nn.Identity
              regularizer_config:
                target: models.svd.sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
              # The optimizer, trainable-parameter, checkpoint, EMA and monitor
              # options below are all null.
              optimizer_config: null
              lr_g_factor: 1.0
              trainable_ae_params: null
              ae_optimizer_args: null
              trainable_disc_params: null
              disc_optimizer_args: null
              disc_start_iter: 0
              diff_boost_factor: 3.0
              ckpt_engine: null
              ckpt_path: null
              additional_decode_keys: null
              ema_decay: null
              monitor: null
              input_key: jpg
          # SVD pipeline entry: ModuleLoaderConfig naming
          # diffusers.StableVideoDiffusionPipeline with `cls_func`
          # `from_pretrained`. The single positional argument is the same repo
          # id as `pipeline_repo` in the module_loader init_args.
          svd_pipeline:
            class_path: modules.loader.module_loader_config.ModuleLoaderConfig
            init_args:
              loader_cls_path: diffusers.StableVideoDiffusionPipeline
              cls_func: from_pretrained
              cls_func_fast_dev_run: ''
              kwargs_diffusers:
                # Plain YAML string.
                # NOTE(review): presumably the loader converts it to the torch
                # dtype — confirm.
                torch_dtype: torch.float16
                variant: fp16
                use_safetensors: true
              model_params: null
              model_params_fast_dev_run: null
              kwargs_diff_trainer_params: null
              args:
              - stabilityai/stable-video-diffusion-img2vid-xt
              dependent_modules: null
              dependent_modules_cloned: null
              # Empty path: no state-dict file is named for this module.
              state_dict_path: ''
              strict_loading: true
              state_dict_filters: []
        root_cls: null
    # Diffusion-trainer parameters: a scale factor, the StreamingSVD checkpoint
    # descriptor (local path plus remote location), and a flag that disables
    # first-stage autocast.
    diff_trainer_params:
      class_path: modules.params.diffusion_trainer.params_streaming_diff_trainer.DiffusionTrainerParams
      init_args:
        scale_factor: 0.18215
        streamingsvd_ckpt:
          class_path: modules.params.diffusion_trainer.params_streaming_diff_trainer.CheckpointDescriptor
          init_args:
            ckpt_path_local: checkpoint/StreamingSVD/model.safetensors
            # NOTE(review): looks like a Hugging Face `<repo>/resolve/main/<file>`
            # path without scheme or host — confirm how it is resolved.
            ckpt_path_global: PAIR/StreamingSVD/resolve/main/model.safetensors
        disable_first_stage_autocast: true
    # Inference-time parameters (T2VInferenceParams).
    inference_params:
      class_path: modules.params.diffusion.inference_params.T2VInferenceParams
      init_args:
        # Number of autoregressive generations for StreamingSVD.
        n_autoregressive_generations: 2
        # NOTE(review): the original author asked "is this used?" — confirm
        # whether this value is read anywhere.
        num_conditional_frames: 7
        # Take the (number + 1)-th frame as the CLIP encoding for StreamingSVD.
        # The value is a quoted string, not an integer.
        anchor_frames: '6'
        # If true, the seed is reset on every generation.
        reset_seed_per_generation: true