Spaces:
Running
on
Zero
Running
on
Zero
# pytorch_lightning==2.2.2 | |
seed_everything: 33 | |
trainer: | |
accelerator: auto | |
strategy: auto | |
devices: '1' | |
num_nodes: 1 | |
precision: 16-mixed | |
logger: False | |
model: | |
class_path: diffusion_trainer.streaming_svd.StreamingSVD | |
init_args: | |
vfi: | |
class_path: modules.params.vfi.VFIParams | |
init_args: | |
ckpt_path_local: checkpoint/VFI/ours.pkl | |
ckpt_path_global: https://drive.google.com/file/d/1XCNoyhA1RX3m8W-XJK8H8inH47l36kxP/view?usp=sharing | |
i2v_enhance: | |
class_path: modules.params.i2v_enhance.I2VEnhanceParams | |
init_args: | |
ckpt_path_local: checkpoint/i2v_enhance/ | |
ckpt_path_global: ali-vilab/i2vgen-xl | |
module_loader: | |
class_path: modules.loader.module_loader.GenericModuleLoader | |
init_args: | |
pipeline_repo: stabilityai/stable-video-diffusion-img2vid-xt | |
pipeline_obj: streamingt2v_pipeline | |
set_prediction_type: '' | |
module_names: | |
- network_config | |
- model | |
- controlnet | |
- denoiser | |
- conditioner | |
- first_stage_model | |
- sampler | |
- svd_pipeline | |
module_config: | |
controlnet: | |
class_path: modules.loader.module_loader_config.ModuleLoaderConfig | |
init_args: | |
loader_cls_path: models.control.controlnet.ControlNet | |
cls_func: from_unet | |
cls_func_fast_dev_run: '' | |
kwargs_diffusers: null | |
model_params: | |
merging_mode: addition | |
zero_conv_mode: Identity | |
frame_expansion: none | |
downsample_controlnet_cond: true | |
use_image_encoder_normalization: true | |
use_controlnet_mask: false | |
condition_encoder: '' | |
conditioning_embedding_out_channels: | |
- 32 | |
- 96 | |
- 256 | |
- 512 | |
kwargs_diff_trainer_params: null | |
args: [] | |
dependent_modules: | |
model: model | |
dependent_modules_cloned: null | |
state_dict_path: '' | |
strict_loading: true | |
state_dict_filters: [] | |
network_config: | |
class_path: models.diffusion.video_model.VideoUNet | |
init_args: | |
in_channels: 8 | |
model_channels: 320 | |
out_channels: 4 | |
num_res_blocks: 2 | |
num_conditional_frames: null | |
attention_resolutions: | |
- 4 | |
- 2 | |
- 1 | |
dropout: 0.0 | |
channel_mult: | |
- 1 | |
- 2 | |
- 4 | |
- 4 | |
conv_resample: true | |
dims: 2 | |
num_classes: sequential | |
use_checkpoint: False | |
num_heads: -1 | |
num_head_channels: 64 | |
num_heads_upsample: -1 | |
use_scale_shift_norm: false | |
resblock_updown: false | |
transformer_depth: 1 | |
transformer_depth_middle: null | |
context_dim: 1024 | |
time_downup: false | |
time_context_dim: null | |
extra_ff_mix_layer: true | |
use_spatial_context: true | |
merge_strategy: learned_with_images | |
merge_factor: 0.5 | |
spatial_transformer_attn_type: softmax-xformers | |
video_kernel_size: | |
- 3 | |
- 1 | |
- 1 | |
use_linear_in_transformer: true | |
adm_in_channels: 768 | |
disable_temporal_crossattention: false | |
max_ddpm_temb_period: 10000 | |
merging_mode: attention_cross_attention | |
controlnet_mode: true | |
use_apm: false | |
model: | |
class_path: modules.loader.module_loader_config.ModuleLoaderConfig | |
init_args: | |
loader_cls_path: models.svd.sgm.modules.diffusionmodules.wrappers.OpenAIWrapper | |
cls_func: '' | |
cls_func_fast_dev_run: '' | |
kwargs_diffusers: | |
compile_model: false | |
model_params: null | |
model_params_fast_dev_run: null | |
kwargs_diff_trainer_params: null | |
args: [] | |
dependent_modules: | |
diffusion_model: network_config | |
dependent_modules_cloned: null | |
state_dict_path: '' | |
strict_loading: true | |
state_dict_filters: [] | |
denoiser: | |
class_path: models.svd.sgm.modules.diffusionmodules.denoiser.Denoiser | |
init_args: | |
scaling_config: | |
target: models.svd.sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise | |
sampler: | |
class_path: models.svd.sgm.modules.diffusionmodules.sampling.EulerEDMSampler | |
init_args: | |
s_churn: 0.0 | |
s_tmin: 0.0 | |
s_tmax: .inf | |
s_noise: 1.0 | |
discretization_config: | |
target: models.diffusion.discretizer.AlignYourSteps | |
params: | |
sigma_max: 700.0 | |
num_steps: 30 | |
guider_config: | |
target: models.svd.sgm.modules.diffusionmodules.guiders.LinearPredictionGuider | |
params: | |
max_scale: 3.0 | |
min_scale: 1.5 | |
num_frames: 25 | |
verbose: false | |
device: cuda | |
conditioner: | |
class_path: models.svd.sgm.modules.GeneralConditioner | |
init_args: | |
emb_models: | |
- is_trainable: false | |
input_key: cond_frames_without_noise | |
target: models.svd.sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder | |
params: | |
n_cond_frames: 1 | |
n_copies: 1 | |
open_clip_embedding_config: | |
target: models.svd.sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder | |
params: | |
freeze: true | |
- input_key: fps_id | |
is_trainable: false | |
target: models.svd.sgm.modules.encoders.modules.ConcatTimestepEmbedderND | |
params: | |
outdim: 256 | |
- input_key: motion_bucket_id | |
is_trainable: false | |
target: models.svd.sgm.modules.encoders.modules.ConcatTimestepEmbedderND | |
params: | |
outdim: 256 | |
- input_key: cond_frames | |
is_trainable: false | |
target: models.svd.sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder | |
params: | |
disable_encoder_autocast: true | |
n_cond_frames: 1 | |
n_copies: 1 | |
is_ae: true | |
encoder_config: | |
target: models.svd.sgm.models.autoencoder.AutoencoderKLModeOnly | |
params: | |
embed_dim: 4 | |
monitor: val/rec_loss | |
ddconfig: | |
attn_type: vanilla-xformers | |
double_z: true | |
z_channels: 4 | |
resolution: 256 | |
in_channels: 3 | |
out_ch: 3 | |
ch: 128 | |
ch_mult: | |
- 1 | |
- 2 | |
- 4 | |
- 4 | |
num_res_blocks: 2 | |
attn_resolutions: [] | |
dropout: 0.0 | |
lossconfig: | |
target: torch.nn.Identity | |
- input_key: cond_aug | |
is_trainable: false | |
target: models.svd.sgm.modules.encoders.modules.ConcatTimestepEmbedderND | |
params: | |
outdim: 256 | |
first_stage_model: | |
class_path: models.svd.sgm.AutoencodingEngine | |
init_args: | |
encoder_config: | |
target: models.svd.sgm.modules.diffusionmodules.model.Encoder | |
params: | |
attn_type: vanilla | |
double_z: true | |
z_channels: 4 | |
resolution: 256 | |
in_channels: 3 | |
out_ch: 3 | |
ch: 128 | |
ch_mult: | |
- 1 | |
- 2 | |
- 4 | |
- 4 | |
num_res_blocks: 2 | |
attn_resolutions: [] | |
dropout: 0.0 | |
decoder_config: | |
target: models.svd.sgm.modules.autoencoding.temporal_ae.VideoDecoder | |
params: | |
attn_type: vanilla | |
double_z: true | |
z_channels: 4 | |
resolution: 256 | |
in_channels: 3 | |
out_ch: 3 | |
ch: 128 | |
ch_mult: | |
- 1 | |
- 2 | |
- 4 | |
- 4 | |
num_res_blocks: 2 | |
attn_resolutions: [] | |
dropout: 0.0 | |
video_kernel_size: | |
- 3 | |
- 1 | |
- 1 | |
loss_config: | |
target: torch.nn.Identity | |
regularizer_config: | |
target: models.svd.sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer | |
optimizer_config: null | |
lr_g_factor: 1.0 | |
trainable_ae_params: null | |
ae_optimizer_args: null | |
trainable_disc_params: null | |
disc_optimizer_args: null | |
disc_start_iter: 0 | |
diff_boost_factor: 3.0 | |
ckpt_engine: null | |
ckpt_path: null | |
additional_decode_keys: null | |
ema_decay: null | |
monitor: null | |
input_key: jpg | |
svd_pipeline: | |
class_path: modules.loader.module_loader_config.ModuleLoaderConfig | |
init_args: | |
loader_cls_path: diffusers.StableVideoDiffusionPipeline | |
cls_func: from_pretrained | |
cls_func_fast_dev_run: '' | |
kwargs_diffusers: | |
torch_dtype: torch.float16 | |
variant: fp16 | |
use_safetensors: true | |
model_params: null | |
model_params_fast_dev_run: null | |
kwargs_diff_trainer_params: null | |
args: | |
- stabilityai/stable-video-diffusion-img2vid-xt | |
dependent_modules: null | |
dependent_modules_cloned: null | |
state_dict_path: '' | |
strict_loading: true | |
state_dict_filters: [] | |
root_cls: null | |
diff_trainer_params: | |
class_path: modules.params.diffusion_trainer.params_streaming_diff_trainer.DiffusionTrainerParams | |
init_args: | |
scale_factor: 0.18215 | |
streamingsvd_ckpt: | |
class_path: modules.params.diffusion_trainer.params_streaming_diff_trainer.CheckpointDescriptor | |
init_args: | |
ckpt_path_local: checkpoint/StreamingSVD/model.safetensors | |
ckpt_path_global: PAIR/StreamingSVD/resolve/main/model.safetensors | |
disable_first_stage_autocast: true | |
inference_params: | |
class_path: modules.params.diffusion.inference_params.T2VInferenceParams | |
init_args: | |
n_autoregressive_generations: 2 # Number of autoregressive generation steps performed by StreamingSVD | |
num_conditional_frames: 7 # TODO: verify whether this parameter is still used | |
anchor_frames: '6' # For a value N, the (N+1)-th frame (here the 7th) is used as the CLIP encoding for StreamingSVD | |
reset_seed_per_generation: true # If true, the seed is reset on every generation | |