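# NOTE(review): the lines below look like residue from the web page this file
# was copied from; they are commented out so they are not read as YAML content.
# Diffusers
# TensorBoard
# controlnet
# Nacholmo's picture
# Update 1500.yaml
# 3393251
# raw
# history blame
# 2.92 kB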
# Model configuration for the `cldm.cldm.ControlLDM` target.
#
# Layout: every component is described by a `target` (dotted class path) and a
# sibling `params` mapping holding that component's settings.
# NOTE(review): presumably the loader instantiates each `target` with its
# `params` as keyword arguments — confirm against the consuming code.
# NOTE(review): the nesting below was reconstructed from key order because the
# original lines carried no indentation; verify it against the loader.
# Booleans are written as lowercase `true`/`false` throughout.
model:
  target: cldm.cldm.ControlLDM
  params:
    # Top-level settings of the ControlLDM target.
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    control_key: "hint"
    image_size: 64
    channels: 4
    cond_stage_trainable: false
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: false
    only_mid_control: false

    # Control branch. Apart from `hint_channels` (here) and `out_channels`
    # (in `unet_config`), its settings repeat those of `unet_config`.
    control_stage_config:
      target: cldm.cldm.ControlNet
      params:
        image_size: 32  # unused
        in_channels: 4
        hint_channels: 3
        model_channels: 320
        attention_resolutions: [4, 2, 1]
        num_res_blocks: 2
        channel_mult: [1, 2, 4, 4]
        num_heads: 8
        use_spatial_transformer: true
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: true
        legacy: false

    # U-Net component.
    unet_config:
      target: cldm.cldm.ControlledUnetModel
      params:
        image_size: 32  # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [4, 2, 1]
        num_res_blocks: 2
        channel_mult: [1, 2, 4, 4]
        num_heads: 8
        use_spatial_transformer: true
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: true
        legacy: false

    # First-stage autoencoder component.
    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
            - 1
            - 2
            - 4
            - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    # Conditioning-stage component.
    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
      # NOTE(review): the keys below (`_class_name: ControlNetModel`,
      # `_diffusers_version`, ...) look like a diffusers-style model config
      # rather than settings for the target above. They are kept under this
      # `params` only because they directly follow it in the original —
      # confirm the intended placement.
      params:
        _class_name: ControlNetModel
        _diffusers_version: "0.18.0.dev0"
        act_fn: silu
        attention_head_dim: 8
        block_out_channels: [320, 640, 1280, 1280]
        class_embed_type: null
        conditioning_embedding_out_channels: [16, 32, 96, 256]
        controlnet_conditioning_channel_order: rgb
        cross_attention_dim: 768
        down_block_types:
          - "CrossAttnDownBlock2D"
          - "CrossAttnDownBlock2D"
          - "CrossAttnDownBlock2D"
          - "DownBlock2D"
        downsample_padding: 1
        flip_sin_to_cos: true
        freq_shift: 0
        global_pool_conditions: false
        in_channels: 4
        layers_per_block: 2
        mid_block_scale_factor: 1
        # Written with a decimal point so YAML 1.1 parsers (e.g. PyYAML) read
        # a float; the bare form `1e-05` is resolved as a string there.
        norm_eps: 1.0e-05
        norm_num_groups: 32
        num_class_embeds: null
        only_cross_attention: false
        projection_class_embeddings_input_dim: null
        resnet_time_scale_shift: "default"
        upcast_attention: false
        use_linear_projection: false