RamAnanth1 commited on
Commit
a93b5aa
1 Parent(s): 38da27c

Upload 5 files

Browse files
configs/stable-diffusion/test_keypose.yaml ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: test_keypose
2
+ model:
3
+ base_learning_rate: 1.0e-04
4
+ target: ldm.models.diffusion.ddpm.LatentDiffusion
5
+ params:
6
+ linear_start: 0.00085
7
+ linear_end: 0.0120
8
+ num_timesteps_cond: 1
9
+ log_every_t: 200
10
+ timesteps: 1000
11
+ first_stage_key: "jpg"
12
+ cond_stage_key: "txt"
13
+ image_size: 64
14
+ channels: 4
15
+ cond_stage_trainable: false # Note: different from the one we trained before
16
+ conditioning_key: crossattn
17
+ monitor: val/loss_simple_ema
18
+ scale_factor: 0.18215
19
+ use_ema: False
20
+
21
+ scheduler_config: # 10000 warmup steps
22
+ target: ldm.lr_scheduler.LambdaLinearScheduler
23
+ params:
24
+ warm_up_steps: [ 10000 ]
25
+ cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
26
+ f_start: [ 1.e-6 ]
27
+ f_max: [ 1. ]
28
+ f_min: [ 1. ]
29
+
30
+ unet_config:
31
+ target: ldm.modules.diffusionmodules.openaimodel.UNetModel
32
+ params:
33
+ image_size: 32 # unused
34
+ in_channels: 4
35
+ out_channels: 4
36
+ model_channels: 320
37
+ attention_resolutions: [ 4, 2, 1 ]
38
+ num_res_blocks: 2
39
+ channel_mult: [ 1, 2, 4, 4 ]
40
+ num_heads: 8
41
+ use_spatial_transformer: True
42
+ transformer_depth: 1
43
+ context_dim: 768
44
+ use_checkpoint: True
45
+ legacy: False
46
+
47
+ first_stage_config:
48
+ target: ldm.models.autoencoder.AutoencoderKL
49
+ params:
50
+ embed_dim: 4
51
+ monitor: val/rec_loss
52
+ ddconfig:
53
+ double_z: true
54
+ z_channels: 4
55
+ resolution: 256
56
+ in_channels: 3
57
+ out_ch: 3
58
+ ch: 128
59
+ ch_mult:
60
+ - 1
61
+ - 2
62
+ - 4
63
+ - 4
64
+ num_res_blocks: 2
65
+ attn_resolutions: []
66
+ dropout: 0.0
67
+ lossconfig:
68
+ target: torch.nn.Identity
69
+
70
+ cond_stage_config: #__is_unconditional__
71
+ target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
72
+ params:
73
+ version: models/clip-vit-large-patch14
74
+
75
+ logger:
76
+ print_freq: 100
77
+ save_checkpoint_freq: !!float 1e4
78
+ use_tb_logger: true
79
+ wandb:
80
+ project: ~
81
+ resume_id: ~
82
+ dist_params:
83
+ backend: nccl
84
+ port: 29500
85
+ training:
86
+ lr: !!float 1e-5
87
+ save_freq: 1e4
configs/stable-diffusion/test_mask.yaml ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: test_mask
2
+ model:
3
+ base_learning_rate: 1.0e-04
4
+ target: ldm.models.diffusion.ddpm.LatentDiffusion
5
+ params:
6
+ linear_start: 0.00085
7
+ linear_end: 0.0120
8
+ num_timesteps_cond: 1
9
+ log_every_t: 200
10
+ timesteps: 1000
11
+ first_stage_key: "jpg"
12
+ cond_stage_key: "txt"
13
+ image_size: 64
14
+ channels: 4
15
+ cond_stage_trainable: false # Note: different from the one we trained before
16
+ conditioning_key: crossattn
17
+ monitor: val/loss_simple_ema
18
+ scale_factor: 0.18215
19
+ use_ema: False
20
+
21
+ scheduler_config: # 10000 warmup steps
22
+ target: ldm.lr_scheduler.LambdaLinearScheduler
23
+ params:
24
+ warm_up_steps: [ 10000 ]
25
+ cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
26
+ f_start: [ 1.e-6 ]
27
+ f_max: [ 1. ]
28
+ f_min: [ 1. ]
29
+
30
+ unet_config:
31
+ target: ldm.modules.diffusionmodules.openaimodel.UNetModel
32
+ params:
33
+ image_size: 32 # unused
34
+ in_channels: 4
35
+ out_channels: 4
36
+ model_channels: 320
37
+ attention_resolutions: [ 4, 2, 1 ]
38
+ num_res_blocks: 2
39
+ channel_mult: [ 1, 2, 4, 4 ]
40
+ num_heads: 8
41
+ use_spatial_transformer: True
42
+ transformer_depth: 1
43
+ context_dim: 768
44
+ use_checkpoint: True
45
+ legacy: False
46
+
47
+ first_stage_config:
48
+ target: ldm.models.autoencoder.AutoencoderKL
49
+ params:
50
+ embed_dim: 4
51
+ monitor: val/rec_loss
52
+ ddconfig:
53
+ double_z: true
54
+ z_channels: 4
55
+ resolution: 256
56
+ in_channels: 3
57
+ out_ch: 3
58
+ ch: 128
59
+ ch_mult:
60
+ - 1
61
+ - 2
62
+ - 4
63
+ - 4
64
+ num_res_blocks: 2
65
+ attn_resolutions: []
66
+ dropout: 0.0
67
+ lossconfig:
68
+ target: torch.nn.Identity
69
+
70
+ cond_stage_config: #__is_unconditional__
71
+ target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
72
+ params:
73
+ version: models/clip-vit-large-patch14
74
+
75
+ logger:
76
+ print_freq: 100
77
+ save_checkpoint_freq: !!float 1e4
78
+ use_tb_logger: true
79
+ wandb:
80
+ project: ~
81
+ resume_id: ~
82
+ dist_params:
83
+ backend: nccl
84
+ port: 29500
85
+ training:
86
+ lr: !!float 1e-5
87
+ save_freq: 1e4
configs/stable-diffusion/test_mask_sketch.yaml ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: test_mask_sketch
2
+ model:
3
+ base_learning_rate: 1.0e-04
4
+ target: ldm.models.diffusion.ddpm.LatentDiffusion
5
+ params:
6
+ linear_start: 0.00085
7
+ linear_end: 0.0120
8
+ num_timesteps_cond: 1
9
+ log_every_t: 200
10
+ timesteps: 1000
11
+ first_stage_key: "jpg"
12
+ cond_stage_key: "txt"
13
+ image_size: 64
14
+ channels: 4
15
+ cond_stage_trainable: false # Note: different from the one we trained before
16
+ conditioning_key: crossattn
17
+ monitor: val/loss_simple_ema
18
+ scale_factor: 0.18215
19
+ use_ema: False
20
+
21
+ scheduler_config: # 10000 warmup steps
22
+ target: ldm.lr_scheduler.LambdaLinearScheduler
23
+ params:
24
+ warm_up_steps: [ 10000 ]
25
+ cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
26
+ f_start: [ 1.e-6 ]
27
+ f_max: [ 1. ]
28
+ f_min: [ 1. ]
29
+
30
+ unet_config:
31
+ target: ldm.modules.diffusionmodules.openaimodel.UNetModel
32
+ params:
33
+ image_size: 32 # unused
34
+ in_channels: 4
35
+ out_channels: 4
36
+ model_channels: 320
37
+ attention_resolutions: [ 4, 2, 1 ]
38
+ num_res_blocks: 2
39
+ channel_mult: [ 1, 2, 4, 4 ]
40
+ num_heads: 8
41
+ use_spatial_transformer: True
42
+ transformer_depth: 1
43
+ context_dim: 768
44
+ use_checkpoint: True
45
+ legacy: False
46
+
47
+ first_stage_config:
48
+ target: ldm.models.autoencoder.AutoencoderKL
49
+ params:
50
+ embed_dim: 4
51
+ monitor: val/rec_loss
52
+ ddconfig:
53
+ double_z: true
54
+ z_channels: 4
55
+ resolution: 256
56
+ in_channels: 3
57
+ out_ch: 3
58
+ ch: 128
59
+ ch_mult:
60
+ - 1
61
+ - 2
62
+ - 4
63
+ - 4
64
+ num_res_blocks: 2
65
+ attn_resolutions: []
66
+ dropout: 0.0
67
+ lossconfig:
68
+ target: torch.nn.Identity
69
+
70
+ cond_stage_config: #__is_unconditional__
71
+ target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
72
+ params:
73
+ version: models/clip-vit-large-patch14
74
+
75
+ logger:
76
+ print_freq: 100
77
+ save_checkpoint_freq: !!float 1e4
78
+ use_tb_logger: true
79
+ wandb:
80
+ project: ~
81
+ resume_id: ~
82
+ dist_params:
83
+ backend: nccl
84
+ port: 29500
85
+ training:
86
+ lr: !!float 1e-5
87
+ save_freq: 1e4
configs/stable-diffusion/test_sketch.yaml ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: test_sketch
2
+ model:
3
+ base_learning_rate: 1.0e-04
4
+ target: ldm.models.diffusion.ddpm.LatentDiffusion
5
+ params:
6
+ linear_start: 0.00085
7
+ linear_end: 0.0120
8
+ num_timesteps_cond: 1
9
+ log_every_t: 200
10
+ timesteps: 1000
11
+ first_stage_key: "jpg"
12
+ cond_stage_key: "txt"
13
+ image_size: 64
14
+ channels: 4
15
+ cond_stage_trainable: false # Note: different from the one we trained before
16
+ conditioning_key: crossattn
17
+ monitor: val/loss_simple_ema
18
+ scale_factor: 0.18215
19
+ use_ema: False
20
+
21
+ scheduler_config: # 10000 warmup steps
22
+ target: ldm.lr_scheduler.LambdaLinearScheduler
23
+ params:
24
+ warm_up_steps: [ 10000 ]
25
+ cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
26
+ f_start: [ 1.e-6 ]
27
+ f_max: [ 1. ]
28
+ f_min: [ 1. ]
29
+
30
+ unet_config:
31
+ target: ldm.modules.diffusionmodules.openaimodel.UNetModel
32
+ params:
33
+ image_size: 32 # unused
34
+ in_channels: 4
35
+ out_channels: 4
36
+ model_channels: 320
37
+ attention_resolutions: [ 4, 2, 1 ]
38
+ num_res_blocks: 2
39
+ channel_mult: [ 1, 2, 4, 4 ]
40
+ num_heads: 8
41
+ use_spatial_transformer: True
42
+ transformer_depth: 1
43
+ context_dim: 768
44
+ use_checkpoint: True
45
+ legacy: False
46
+
47
+ first_stage_config:
48
+ target: ldm.models.autoencoder.AutoencoderKL
49
+ params:
50
+ embed_dim: 4
51
+ monitor: val/rec_loss
52
+ ddconfig:
53
+ double_z: true
54
+ z_channels: 4
55
+ resolution: 256
56
+ in_channels: 3
57
+ out_ch: 3
58
+ ch: 128
59
+ ch_mult:
60
+ - 1
61
+ - 2
62
+ - 4
63
+ - 4
64
+ num_res_blocks: 2
65
+ attn_resolutions: []
66
+ dropout: 0.0
67
+ lossconfig:
68
+ target: torch.nn.Identity
69
+
70
+ cond_stage_config: #__is_unconditional__
71
+ target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
72
+ params:
73
+ version: models/clip-vit-large-patch14
74
+
75
+ logger:
76
+ print_freq: 100
77
+ save_checkpoint_freq: !!float 1e4
78
+ use_tb_logger: true
79
+ wandb:
80
+ project: ~
81
+ resume_id: ~
82
+ dist_params:
83
+ backend: nccl
84
+ port: 29500
85
+ training:
86
+ lr: !!float 1e-5
87
+ save_freq: 1e4
configs/stable-diffusion/test_sketch_edit.yaml ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: test_sketch_edit
2
+ model:
3
+ base_learning_rate: 1.0e-04
4
+ target: ldm.models.diffusion.ddpm.LatentDiffusion
5
+ params:
6
+ linear_start: 0.00085
7
+ linear_end: 0.0120
8
+ num_timesteps_cond: 1
9
+ log_every_t: 200
10
+ timesteps: 1000
11
+ first_stage_key: "jpg"
12
+ cond_stage_key: "txt"
13
+ image_size: 64
14
+ channels: 4
15
+ cond_stage_trainable: false # Note: different from the one we trained before
16
+ conditioning_key: crossattn
17
+ monitor: val/loss_simple_ema
18
+ scale_factor: 0.18215
19
+ use_ema: False
20
+
21
+ scheduler_config: # 10000 warmup steps
22
+ target: ldm.lr_scheduler.LambdaLinearScheduler
23
+ params:
24
+ warm_up_steps: [ 10000 ]
25
+ cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
26
+ f_start: [ 1.e-6 ]
27
+ f_max: [ 1. ]
28
+ f_min: [ 1. ]
29
+
30
+ unet_config:
31
+ target: ldm.modules.diffusionmodules.openaimodel.UNetModel
32
+ params:
33
+ image_size: 32 # unused
34
+ in_channels: 4
35
+ out_channels: 4
36
+ model_channels: 320
37
+ attention_resolutions: [ 4, 2, 1 ]
38
+ num_res_blocks: 2
39
+ channel_mult: [ 1, 2, 4, 4 ]
40
+ num_heads: 8
41
+ use_spatial_transformer: True
42
+ transformer_depth: 1
43
+ context_dim: 768
44
+ use_checkpoint: True
45
+ legacy: False
46
+
47
+ first_stage_config:
48
+ target: ldm.models.autoencoder.AutoencoderKL
49
+ params:
50
+ embed_dim: 4
51
+ monitor: val/rec_loss
52
+ ddconfig:
53
+ double_z: true
54
+ z_channels: 4
55
+ resolution: 256
56
+ in_channels: 3
57
+ out_ch: 3
58
+ ch: 128
59
+ ch_mult:
60
+ - 1
61
+ - 2
62
+ - 4
63
+ - 4
64
+ num_res_blocks: 2
65
+ attn_resolutions: []
66
+ dropout: 0.0
67
+ lossconfig:
68
+ target: torch.nn.Identity
69
+
70
+ cond_stage_config: #__is_unconditional__
71
+ target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
72
+ params:
73
+ version: models/clip-vit-large-patch14
74
+
75
+ logger:
76
+ print_freq: 100
77
+ save_checkpoint_freq: !!float 1e4
78
+ use_tb_logger: true
79
+ wandb:
80
+ project: ~
81
+ resume_id: ~
82
+ dist_params:
83
+ backend: nccl
84
+ port: 29500
85
+ training:
86
+ lr: !!float 1e-5
87
+ save_freq: 1e4