# Configuration entry named `sd_autoencoder`: declares a component of type
# `autoencoderkl`, the arguments used to build it (`args`), and a weights
# file path (`pth`).
# NOTE(review): how `type`, `args` and `pth` are consumed is not visible in
# this file — confirm against the loader that reads this entry.
sd_autoencoder:
  type: autoencoderkl
  args:
    # Same value as `ddconfig.z_channels` below (both 4).
    # Presumably the two must be kept in sync — TODO confirm.
    embed_dim: 4
    # Presumably the name of the metric tracked during training/validation;
    # verify against the consumer.
    monitor: val/rec_loss
    # Nested settings block; presumably the encoder/decoder architecture
    # hyper-parameters — TODO confirm.
    ddconfig:
      double_z: true
      z_channels: 4
      resolution: 256
      # 3 channels in and 3 channels out (presumably RGB — TODO confirm).
      in_channels: 3
      out_ch: 3
      ch: 128
      # Four entries. Presumably per-level multipliers applied to `ch`;
      # verify against the model code.
      ch_mult: [1, 2, 4, 4]
      num_res_blocks: 2
      # Explicit empty list (not null).
      attn_resolutions: []
      dropout: 0.0
      # Disabled option kept for reference; it is a comment and has no
      # effect unless uncommented and re-indented under `ddconfig`.
#       use_video_arch: true
    lossconfig:
      # torch.nn.Identity is PyTorch's pass-through module. Presumably used
      # here as a placeholder so no real loss is built — TODO confirm.
      target: torch.nn.Identity
  # Relative path to a `.pth` file; presumably the pretrained weights for
  # this autoencoder, resolved relative to the working directory — verify.
  pth: pretrained/kl-f8.pth