radames committed
Commit cebd32a · Parent: ece5a96

Upload 3 files

Files changed (3)
  1. app.py +126 -24
  2. lcm_txt2img/pipeline.py +760 -0
  3. requirements.txt +23 -0
app.py CHANGED
@@ -1,28 +1,130 @@
  import gradio as gr
  from PIL import Image

- def predict(merge_ratio, guidance, steps, sharpness, prompt1, prompt2, seed):
-     result_image = Image.new('RGB', [512,512], (seed))
-     print(merge_ratio, guidance, steps, sharpness, prompt1, prompt2, seed)
-     return result_image
-
-
- with gr.Blocks() as demo:
-     with gr.Row():
-         with gr.Column():
-             image = gr.Image(type="pil")
-         with gr.Column():
-             merge_ratio = gr.Slider(minimum=0, maximum=50, step=1, label="Merge Ratio")
-             guidance = gr.Slider(label="Guidance")
-             steps = gr.Slider(label="Steps")
-             sharpness = gr.Slider(minimum=0, maximum=50, step=1, label="sharpness")
-             seed = gr.Slider(randomize=True, minimum=0, maximum=12013012031030)
-             prompt1 = gr.Textbox(label="Prompt 1")
-             prompt2 = gr.Textbox(label="Prompt 2")
-             generate_bt = gr.Button("Generate")
-
-     inputs = [merge_ratio, guidance, steps, sharpness, prompt1, prompt2, seed]
-     generate_bt.click(predict, inputs=inputs, outputs=image, show_progress=False)
-     seed.change(predict, inputs=inputs, outputs=image, show_progress=False)
  if __name__ == "__main__":
-     demo.launch()
 
  import gradio as gr
  from PIL import Image
+ import torch
+ from diffusers import DiffusionPipeline, AutoencoderTiny
+ import os

+ SAFETY_CHECKER = os.environ.get("SAFETY_CHECKER", None)
+ TORCH_COMPILE = os.environ.get("TORCH_COMPILE", None)
+
+ if SAFETY_CHECKER:
+     pipe = DiffusionPipeline.from_pretrained(
+         "SimianLuo/LCM_Dreamshaper_v7",
+         custom_pipeline="lcm_txt2img",
+         scheduler=None,
+     )
+ else:
+     pipe = DiffusionPipeline.from_pretrained(
+         "SimianLuo/LCM_Dreamshaper_v7",
+         custom_pipeline="lcm_txt2img",
+         scheduler=None,
+         safety_checker=None,
+     )
+ pipe.to(device="cuda", dtype=torch.float16)
+ pipe.vae = AutoencoderTiny.from_pretrained(
+     "madebyollin/taesd", device="cuda", torch_dtype=torch.float16
+ )
+ pipe.vae = pipe.vae.cuda()
+ pipe.unet.to(memory_format=torch.channels_last)
+ pipe.set_progress_bar_config(disable=True)
+
+ if TORCH_COMPILE:
+     pipe.text_encoder = torch.compile(pipe.text_encoder, mode="max-autotune")
+     pipe.tokenizer = torch.compile(pipe.tokenizer, mode="max-autotune")
+     pipe.unet = torch.compile(pipe.unet, mode="max-autotune")
+     pipe.vae = torch.compile(pipe.vae, mode="max-autotune")
+
+
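Since `torch.compile` compiles lazily on the first call, the first request is noticeably slow when `TORCH_COMPILE` is set. A minimal warm-up sketch (a hypothetical helper, not part of this commit) that could run once at startup, reusing the same call signature the app uses below:

```python
# Hypothetical warm-up pass: triggers torch.compile tracing/compilation of the
# UNet, text encoder and VAE before the first user request reaches the app.
def warmup(pipe):
    with torch.inference_mode():
        pipe(
            prompt1="warmup",
            prompt2="warmup",
            sv=50,
            sharpness=1.0,
            width=512,
            height=512,
            num_inference_steps=2,
            guidance_scale=8.0,
            lcm_origin_steps=50,
            output_type="pil",
            return_dict=False,
        )
```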
+ def predict(prompt1, prompt2, merge_ratio, guidance, steps, sharpness, seed=1231231):
+     torch.manual_seed(seed)
+     img = pipe(
+         prompt1=prompt1,
+         prompt2=prompt2,
+         sv=merge_ratio,
+         sharpness=sharpness,
+         width=512,
+         height=512,
+         num_inference_steps=steps,
+         guidance_scale=guidance,
+         lcm_origin_steps=50,
+         output_type="pil",
+         return_dict=False,
+     )
+     return img
+
+
+ css="""
57
+ #container{
58
+ margin: 0 auto;
59
+ max-width: 80rem;
60
+ }
61
+ #intro{
62
+ max-width: 32rem;
63
+ text-align: center;
64
+ margin: 0 auto;
65
+ }
66
+ """
67
+ with gr.Blocks(css=css) as demo:
68
+ with gr.Column(elem_id="container"):
69
+ gr.Markdown(
70
+ """# SDZoom
71
+
72
+ Welcome to sdzoom, a testbed application designed for optimizing and experimenting with various
73
+ configurations to achieve the fastest Stable Diffusion (SD) pipelines.
74
+ RTSD leverages the expertise provided by Latent Consistency Models (LCM). For more information about LCM,
75
+ visit their website at [Latent Consistency Models](https://latent-consistency-models.github.io/).
76
+
77
+ """, elem_id="intro"
78
+ )
79
+ with gr.Row():
80
+ with gr.Column():
81
+ image = gr.Image(type="pil")
82
+ with gr.Column():
83
+ merge_ratio = gr.Slider(
84
+ value=50, minimum=1, maximum=100, step=1, label="Merge Ratio"
85
+ )
86
+ guidance = gr.Slider(
87
+ label="Guidance", minimum=1, maximum=50, value=10.0, step=0.01
88
+ )
89
+ steps = gr.Slider(label="Steps", value=4, minimum=2, maximum=20, step=1)
90
+ sharpness = gr.Slider(
91
+ value=1.0, minimum=0, maximum=1, step=0.001, label="Sharpness"
92
+ )
93
+ seed = gr.Slider(randomize=True, minimum=0, maximum=12013012031030, label="Seed")
94
+ prompt1 = gr.Textbox(label="Prompt 1")
95
+ prompt2 = gr.Textbox(label="Prompt 2")
96
+ generate_bt = gr.Button("Generate")
97
+
98
+ inputs = [prompt1, prompt2, merge_ratio, guidance, steps, sharpness, seed]
99
+ gr.Examples(
100
+ examples=[
101
+ ["Elon Musk", "Mark Zuckerberg", 50, 10.0, 4, 1.0, 1231231],
102
+ ["Elon Musk", "Bill Gates", 50, 10.0, 4, 1.0, 53453],
103
+ [
104
+ "Asian women, intricate jewlery in her hair, 8k",
105
+ "Tom Cruise, intricate jewlery in her hair, 8k",
106
+ 50,
107
+ 10.0,
108
+ 4,
109
+ 1.0,
110
+ 542343,
111
+ ],
112
+ ],
113
+ fn=predict,
114
+ inputs=inputs,
115
+ outputs=image,
116
+ )
117
+ generate_bt.click(fn=predict, inputs=inputs, outputs=image, show_progress=False)
118
+ seed.change(fn=predict, inputs=inputs, outputs=image, show_progress=False)
119
+ merge_ratio.change(
120
+ fn=predict, inputs=inputs, outputs=image, show_progress=False
121
+ )
122
+ guidance.change(fn=predict, inputs=inputs, outputs=image, show_progress=False)
123
+ steps.change(fn=predict, inputs=inputs, outputs=image, show_progress=False)
124
+ sharpness.change(fn=predict, inputs=inputs, outputs=image, show_progress=False)
125
+ prompt1.change(fn=predict, inputs=inputs, outputs=image, show_progress=False)
126
+ prompt2.change(fn=predict, inputs=inputs, outputs=image, show_progress=False)
127
+
128
+ demo.queue()
129
  if __name__ == "__main__":
130
+ demo.launch()
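Because the pipeline is called with `output_type="pil"` and `return_dict=False`, `predict` returns a `PIL.Image`, so the handler can also be exercised directly as a quick smoke test (hypothetical snippet, assumes a CUDA GPU and that the model downloads above succeeded):

```python
# Direct call to the Gradio handler defined above; saves one blended image.
img = predict("Elon Musk", "Mark Zuckerberg", merge_ratio=50, guidance=10.0, steps=4, sharpness=1.0, seed=1231231)
img.save("blend.png")
```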
lcm_txt2img/pipeline.py ADDED
@@ -0,0 +1,760 @@
1
+ # Copyright 2023 Stanford University Team and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion
16
+ # and https://github.com/hojonathanho/diffusion
17
+
18
+ import math
19
+ from dataclasses import dataclass
20
+ from typing import Any, Dict, List, Optional, Tuple, Union
21
+
22
+ import numpy as np
23
+ import torch
24
+ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
25
+
26
+ from diffusers import AutoencoderKL, ConfigMixin, DiffusionPipeline, SchedulerMixin, UNet2DConditionModel, logging
27
+ from diffusers.configuration_utils import register_to_config
28
+ from diffusers.image_processor import VaeImageProcessor
29
+ from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
30
+ from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
31
+ from diffusers.utils import BaseOutput
32
+
33
+
34
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
35
+
36
+
37
+ class LatentConsistencyModelPipeline(DiffusionPipeline):
38
+ _optional_components = ["scheduler"]
39
+
40
+ def __init__(
41
+ self,
42
+ vae: AutoencoderKL,
43
+ text_encoder: CLIPTextModel,
44
+ tokenizer: CLIPTokenizer,
45
+ unet: UNet2DConditionModel,
46
+ scheduler: "LCMScheduler",
47
+ safety_checker: StableDiffusionSafetyChecker,
48
+ feature_extractor: CLIPImageProcessor,
49
+ requires_safety_checker: bool = True,
50
+ ):
51
+ super().__init__()
52
+
53
+ scheduler = (
54
+ scheduler
55
+ if scheduler is not None
56
+ else LCMScheduler(
57
+ beta_start=0.00085, beta_end=0.0120, beta_schedule="scaled_linear", prediction_type="epsilon"
58
+ )
59
+ )
60
+
61
+ self.register_modules(
62
+ vae=vae,
63
+ text_encoder=text_encoder,
64
+ tokenizer=tokenizer,
65
+ unet=unet,
66
+ scheduler=scheduler,
67
+ safety_checker=safety_checker,
68
+ feature_extractor=feature_extractor,
69
+ )
70
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
71
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
72
+
73
+ def _encode_prompt(
74
+ self,
75
+ prompt,
76
+ device,
77
+ num_images_per_prompt,
78
+ prompt_embeds: None,
79
+ ):
80
+ r"""
81
+ Encodes the prompt into text encoder hidden states.
82
+ Args:
83
+ prompt (`str` or `List[str]`, *optional*):
84
+ prompt to be encoded
85
+ device: (`torch.device`):
86
+ torch device
87
+ num_images_per_prompt (`int`):
88
+ number of images that should be generated per prompt
89
+ prompt_embeds (`torch.FloatTensor`, *optional*):
90
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
91
+ provided, text embeddings will be generated from `prompt` input argument.
92
+ """
93
+
94
+ if prompt is not None and isinstance(prompt, str):
95
+ pass
96
+ elif prompt is not None and isinstance(prompt, list):
97
+ len(prompt)
98
+ else:
99
+ prompt_embeds.shape[0]
100
+
101
+ if prompt_embeds is None:
102
+ text_inputs = self.tokenizer(
103
+ prompt,
104
+ padding="max_length",
105
+ max_length=self.tokenizer.model_max_length,
106
+ truncation=True,
107
+ return_tensors="pt",
108
+ )
109
+ text_input_ids = text_inputs.input_ids
110
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
111
+
112
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
113
+ text_input_ids, untruncated_ids
114
+ ):
115
+ removed_text = self.tokenizer.batch_decode(
116
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
117
+ )
118
+ logger.warning(
119
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
120
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
121
+ )
122
+
123
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
124
+ attention_mask = text_inputs.attention_mask.to(device)
125
+ else:
126
+ attention_mask = None
127
+
128
+ prompt_embeds = self.text_encoder(
129
+ text_input_ids.to(device),
130
+ attention_mask=attention_mask,
131
+ )
132
+ prompt_embeds = prompt_embeds[0]
133
+
134
+ if self.text_encoder is not None:
135
+ prompt_embeds_dtype = self.text_encoder.dtype
136
+ elif self.unet is not None:
137
+ prompt_embeds_dtype = self.unet.dtype
138
+ else:
139
+ prompt_embeds_dtype = prompt_embeds.dtype
140
+
141
+ prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
142
+
143
+ bs_embed, seq_len, _ = prompt_embeds.shape
144
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
145
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
146
+ prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
147
+
148
+ # Don't need to get uncond prompt embedding because of LCM Guided Distillation
149
+ return prompt_embeds
150
+
151
+ def run_safety_checker(self, image, device, dtype):
152
+ if self.safety_checker is None:
153
+ has_nsfw_concept = None
154
+ else:
155
+ if torch.is_tensor(image):
156
+ feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
157
+ else:
158
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
159
+ safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
160
+ image, has_nsfw_concept = self.safety_checker(
161
+ images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
162
+ )
163
+ return image, has_nsfw_concept
164
+
165
+ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, latents=None):
166
+ #print(f"{batch_size}, {num_channels_latents}, {height // self.vae_scale_factor}, {width // self.vae_scale_factor}, {latents}")
167
+ #print(f"init_noise_sigma = {self.scheduler.init_noise_sigma}")
168
+ shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
169
+ if latents is None:
170
+ latents = torch.randn(shape, dtype=dtype).to(device)
171
+ else:
172
+ latents = latents.to(device)
173
+ # scale the initial noise by the standard deviation required by the scheduler
174
+ latents = latents * self.scheduler.init_noise_sigma
175
+ return latents
176
+
177
+ def get_w_embedding(self, w, embedding_dim=512, dtype=torch.float32):
178
+ """
179
+ see https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
180
+ Args:
181
+ timesteps: torch.Tensor: generate embedding vectors at these timesteps
182
+ embedding_dim: int: dimension of the embeddings to generate
183
+ dtype: data type of the generated embeddings
184
+ Returns:
185
+ embedding vectors with shape `(len(timesteps), embedding_dim)`
186
+ """
187
+ assert len(w.shape) == 1
188
+ w = w * 1000.0
189
+
190
+ half_dim = embedding_dim // 2
191
+ emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
192
+ emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
193
+ emb = w.to(dtype)[:, None] * emb[None, :]
194
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
195
+ if embedding_dim % 2 == 1: # zero pad
196
+ emb = torch.nn.functional.pad(emb, (0, 1))
197
+ assert emb.shape == (w.shape[0], embedding_dim)
198
+ return emb
199
+
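This embedding is how the guidance scale reaches the UNet (as `timestep_cond` in the sampling loop below), replacing the usual two-pass classifier-free guidance. A small shape check, assuming the pipeline instance is available as `pipe`:

```python
import torch

w = torch.tensor([8.0])                        # one guidance scale per image in the batch
emb = pipe.get_w_embedding(w, embedding_dim=256)
print(emb.shape)                               # torch.Size([1, 256]); 256 matches the value used in __call__
```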
200
+ @torch.no_grad()
201
+ def __call__(
202
+ self,
203
+ prompt1: Union[str, List[str]] = None,
204
+ prompt2: Union[str, List[str]] = None,
205
+ sv: float = .5,
206
+ sharpness: float = 1.,
207
+ height: Optional[int] = 768,
208
+ width: Optional[int] = 768,
209
+ guidance_scale: float = 7.5,
210
+ num_images_per_prompt: Optional[int] = 1,
211
+ latents: Optional[torch.FloatTensor] = None,
212
+ num_inference_steps: int = 4,
213
+ lcm_origin_steps: int = 50,
214
+ prompt_embeds: Optional[torch.FloatTensor] = None,
215
+ output_type: Optional[str] = "pil",
216
+ return_dict: bool = True,
217
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
218
+ ):
219
+ # 0. Default height and width to unet
220
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
221
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
222
+
223
+ # 2. Define call parameters
224
+ #if prompt is not None and isinstance(prompt, str):
225
+ # batch_size = 1
226
+ #elif prompt is not None and isinstance(prompt, list):
227
+ # batch_size = len(prompt)
228
+ #else:
229
+ # batch_size = prompt_embeds.shape[0]
230
+
231
+ batch_size = 1
232
+
233
+ device = self._execution_device
234
+ # do_classifier_free_guidance = guidance_scale > 0.0 # In LCM Implementation: cfg_noise = noise_cond + cfg_scale * (noise_cond - noise_uncond) , (cfg_scale > 0.0 using CFG)
235
+
236
+ # 3. Encode input prompt
237
+ print(f"prompt1 = {prompt1}")
238
+ pe1 = self._encode_prompt(
239
+ prompt1,
240
+ device,
241
+ num_images_per_prompt,
242
+ prompt_embeds=prompt_embeds,
243
+ )
244
+
245
+ print(f"prompt2 = {prompt2}")
246
+ pe2 = self._encode_prompt(
247
+ prompt2,
248
+ device,
249
+ num_images_per_prompt,
250
+ prompt_embeds=None,
251
+ )
252
+
253
+ prompt_embeds = (100-sv)/100 * pe1 + sv/100 * pe2
254
+
255
+ # 4. Prepare timesteps
256
+ self.scheduler.set_timesteps(num_inference_steps, lcm_origin_steps)
257
+ timesteps = self.scheduler.timesteps
258
+
259
+ # 5. Prepare latent variable
260
+ num_channels_latents = self.unet.config.in_channels
261
+ latents = self.prepare_latents(
262
+ batch_size * num_images_per_prompt,
263
+ num_channels_latents,
264
+ height,
265
+ width,
266
+ prompt_embeds.dtype,
267
+ device,
268
+ latents,
269
+ )
270
+ bs = batch_size * num_images_per_prompt
271
+
272
+ # 6. Get Guidance Scale Embedding
273
+ w = torch.tensor(guidance_scale).repeat(bs)
274
+ w_embedding = self.get_w_embedding(w, embedding_dim=256).to(device=device, dtype=latents.dtype)
275
+
276
+ # 7. LCM MultiStep Sampling Loop:
277
+ #import time
278
+ #tsLenm1 = len(timesteps) - 1
279
+ #tm0 = time.time()
280
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
281
+ for i, t in enumerate(timesteps):
282
+ ts = torch.full((bs,), t, device=device, dtype=torch.long)
283
+ latents = latents.to(prompt_embeds.dtype)
284
+
285
+ # model prediction (v-prediction, eps, x)
286
+ model_pred = self.unet(
287
+ latents,
288
+ ts,
289
+ timestep_cond=w_embedding,
290
+ encoder_hidden_states=prompt_embeds,
291
+ cross_attention_kwargs=cross_attention_kwargs,
292
+ return_dict=False,
293
+ )[0]
294
+
295
+ # compute the previous noisy sample x_t -> x_t-1
296
+ latents, denoised = self.scheduler.step(model_pred, i, t, latents, return_dict=False)
297
+
298
+ # # call the callback, if provided
299
+ # if i == len(timesteps) - 1:
300
+ #if i == tsLenm1:
301
+ # print('SYNC')
302
+ # torch.cuda.synchronize()
303
+ progress_bar.update()
304
+ #print(f"unet time = {time.time() - tm0}")
305
+
306
+ denoised /= sharpness
307
+
308
+ #denoised = denoised.to(prompt_embeds.dtype)
309
+ if not output_type == "latent":
310
+ image = self.vae.decode(denoised / self.vae.config.scaling_factor, return_dict=False)[0]
311
+ #image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
312
+ has_nsfw_concept = None
313
+ else:
314
+ image = denoised
315
+ has_nsfw_concept = None
316
+
317
+ if has_nsfw_concept is None:
318
+ do_denormalize = [True] * image.shape[0]
319
+ else:
320
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
321
+
322
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
323
+
324
+ if not return_dict:
325
+ #return (image, has_nsfw_concept)
326
+ print(f"image[0] is a {type(image[0])}")
327
+ return image[0]
328
+
329
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
330
+
331
+
332
+ @dataclass
333
+ # Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM
334
+ class LCMSchedulerOutput(BaseOutput):
335
+ """
336
+ Output class for the scheduler's `step` function output.
337
+ Args:
338
+ prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
339
+ Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
340
+ denoising loop.
341
+ pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
342
+ The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
343
+ `pred_original_sample` can be used to preview progress or for guidance.
344
+ """
345
+
346
+ prev_sample: torch.FloatTensor
347
+ denoised: Optional[torch.FloatTensor] = None
348
+
349
+
350
+ # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
351
+ def betas_for_alpha_bar(
352
+ num_diffusion_timesteps,
353
+ max_beta=0.999,
354
+ alpha_transform_type="cosine",
355
+ ):
356
+ """
357
+ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
358
+ (1-beta) over time from t = [0,1].
359
+ Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
360
+ to that part of the diffusion process.
361
+ Args:
362
+ num_diffusion_timesteps (`int`): the number of betas to produce.
363
+ max_beta (`float`): the maximum beta to use; use values lower than 1 to
364
+ prevent singularities.
365
+ alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
366
+ Choose from `cosine` or `exp`
367
+ Returns:
368
+ betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
369
+ """
370
+ if alpha_transform_type == "cosine":
371
+
372
+ def alpha_bar_fn(t):
373
+ return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
374
+
375
+ elif alpha_transform_type == "exp":
376
+
377
+ def alpha_bar_fn(t):
378
+ return math.exp(t * -12.0)
379
+
380
+ else:
381
+ raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
382
+
383
+ betas = []
384
+ for i in range(num_diffusion_timesteps):
385
+ t1 = i / num_diffusion_timesteps
386
+ t2 = (i + 1) / num_diffusion_timesteps
387
+ betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
388
+ return torch.tensor(betas, dtype=torch.float32)
389
+
390
+
391
+ def rescale_zero_terminal_snr(betas):
392
+ """
393
+ Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1)
394
+ Args:
395
+ betas (`torch.FloatTensor`):
396
+ the betas that the scheduler is being initialized with.
397
+ Returns:
398
+ `torch.FloatTensor`: rescaled betas with zero terminal SNR
399
+ """
400
+ # Convert betas to alphas_bar_sqrt
401
+ alphas = 1.0 - betas
402
+ alphas_cumprod = torch.cumprod(alphas, dim=0)
403
+ alphas_bar_sqrt = alphas_cumprod.sqrt()
404
+
405
+ # Store old values.
406
+ alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
407
+ alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
408
+
409
+ # Shift so the last timestep is zero.
410
+ alphas_bar_sqrt -= alphas_bar_sqrt_T
411
+
412
+ # Scale so the first timestep is back to the old value.
413
+ alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
414
+
415
+ # Convert alphas_bar_sqrt to betas
416
+ alphas_bar = alphas_bar_sqrt**2 # Revert sqrt
417
+ alphas = alphas_bar[1:] / alphas_bar[:-1] # Revert cumprod
418
+ alphas = torch.cat([alphas_bar[0:1], alphas])
419
+ betas = 1 - alphas
420
+
421
+ return betas
422
+
423
+
424
+ class LCMScheduler(SchedulerMixin, ConfigMixin):
425
+ """
426
+ `LCMScheduler` extends the denoising procedure introduced in denoising diffusion probabilistic models (DDPMs) with
427
+ non-Markovian guidance.
428
+ This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
429
+ methods the library implements for all schedulers such as loading and saving.
430
+ Args:
431
+ num_train_timesteps (`int`, defaults to 1000):
432
+ The number of diffusion steps to train the model.
433
+ beta_start (`float`, defaults to 0.0001):
434
+ The starting `beta` value of inference.
435
+ beta_end (`float`, defaults to 0.02):
436
+ The final `beta` value.
437
+ beta_schedule (`str`, defaults to `"linear"`):
438
+ The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
439
+ `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
440
+ trained_betas (`np.ndarray`, *optional*):
441
+ Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
442
+ clip_sample (`bool`, defaults to `True`):
443
+ Clip the predicted sample for numerical stability.
444
+ clip_sample_range (`float`, defaults to 1.0):
445
+ The maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
446
+ set_alpha_to_one (`bool`, defaults to `True`):
447
+ Each diffusion step uses the alphas product value at that step and at the previous one. For the final step
448
+ there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
449
+ otherwise it uses the alpha value at step 0.
450
+ steps_offset (`int`, defaults to 0):
451
+ An offset added to the inference steps. You can use a combination of `offset=1` and
452
+ `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable
453
+ Diffusion.
454
+ prediction_type (`str`, defaults to `epsilon`, *optional*):
455
+ Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
456
+ `sample` (directly predicts the noisy sample) or `v_prediction` (see section 2.4 of [Imagen
457
+ Video](https://imagen.research.google/video/paper.pdf) paper).
458
+ thresholding (`bool`, defaults to `False`):
459
+ Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
460
+ as Stable Diffusion.
461
+ dynamic_thresholding_ratio (`float`, defaults to 0.995):
462
+ The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
463
+ sample_max_value (`float`, defaults to 1.0):
464
+ The threshold value for dynamic thresholding. Valid only when `thresholding=True`.
465
+ timestep_spacing (`str`, defaults to `"leading"`):
466
+ The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
467
+ Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
468
+ rescale_betas_zero_snr (`bool`, defaults to `False`):
469
+ Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
470
+ dark samples instead of limiting it to samples with medium brightness. Loosely related to
471
+ [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
472
+ """
473
+
474
+ # _compatibles = [e.name for e in KarrasDiffusionSchedulers]
475
+ order = 1
476
+
477
+ @register_to_config
478
+ def __init__(
479
+ self,
480
+ num_train_timesteps: int = 1000,
481
+ beta_start: float = 0.0001,
482
+ beta_end: float = 0.02,
483
+ beta_schedule: str = "linear",
484
+ trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
485
+ clip_sample: bool = True,
486
+ set_alpha_to_one: bool = True,
487
+ steps_offset: int = 0,
488
+ prediction_type: str = "epsilon",
489
+ thresholding: bool = False,
490
+ dynamic_thresholding_ratio: float = 0.995,
491
+ clip_sample_range: float = 1.0,
492
+ sample_max_value: float = 1.0,
493
+ timestep_spacing: str = "leading",
494
+ rescale_betas_zero_snr: bool = False,
495
+ ):
496
+ if trained_betas is not None:
497
+ self.betas = torch.tensor(trained_betas, dtype=torch.float32)
498
+ elif beta_schedule == "linear":
499
+ self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
500
+ elif beta_schedule == "scaled_linear":
501
+ # this schedule is very specific to the latent diffusion model.
502
+ self.betas = (
503
+ torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
504
+ )
505
+ elif beta_schedule == "squaredcos_cap_v2":
506
+ # Glide cosine schedule
507
+ self.betas = betas_for_alpha_bar(num_train_timesteps)
508
+ else:
509
+ raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
510
+
511
+ # Rescale for zero SNR
512
+ if rescale_betas_zero_snr:
513
+ self.betas = rescale_zero_terminal_snr(self.betas)
514
+
515
+ self.alphas = 1.0 - self.betas
516
+ self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
517
+
518
+ # At every step in ddim, we are looking into the previous alphas_cumprod
519
+ # For the final step, there is no previous alphas_cumprod because we are already at 0
520
+ # `set_alpha_to_one` decides whether we set this parameter simply to one or
521
+ # whether we use the final alpha of the "non-previous" one.
522
+ self.final_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0]
523
+
524
+ # standard deviation of the initial noise distribution
525
+ self.init_noise_sigma = 1.0
526
+
527
+ # setable values
528
+ self.num_inference_steps = None
529
+ self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64))
530
+
531
+ def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor:
532
+ """
533
+ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
534
+ current timestep.
535
+ Args:
536
+ sample (`torch.FloatTensor`):
537
+ The input sample.
538
+ timestep (`int`, *optional*):
539
+ The current timestep in the diffusion chain.
540
+ Returns:
541
+ `torch.FloatTensor`:
542
+ A scaled input sample.
543
+ """
544
+ return sample
545
+
546
+ def _get_variance(self, timestep, prev_timestep):
547
+ alpha_prod_t = self.alphas_cumprod[timestep]
548
+ alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
549
+ beta_prod_t = 1 - alpha_prod_t
550
+ beta_prod_t_prev = 1 - alpha_prod_t_prev
551
+
552
+ variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
553
+
554
+ return variance
555
+
556
+ # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
557
+ def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor:
558
+ """
559
+ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
560
+ prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
561
+ s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
562
+ pixels from saturation at each step. We find that dynamic thresholding results in significantly better
563
+ photorealism as well as better image-text alignment, especially when using very large guidance weights."
564
+ https://arxiv.org/abs/2205.11487
565
+ """
566
+ dtype = sample.dtype
567
+ batch_size, channels, height, width = sample.shape
568
+
569
+ if dtype not in (torch.float32, torch.float64):
570
+ sample = sample.float() # upcast for quantile calculation, and clamp not implemented for cpu half
571
+
572
+ # Flatten sample for doing quantile calculation along each image
573
+ sample = sample.reshape(batch_size, channels * height * width)
574
+
575
+ abs_sample = sample.abs() # "a certain percentile absolute pixel value"
576
+
577
+ s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
578
+ s = torch.clamp(
579
+ s, min=1, max=self.config.sample_max_value
580
+ ) # When clamped to min=1, equivalent to standard clipping to [-1, 1]
581
+
582
+ s = s.unsqueeze(1) # (batch_size, 1) because clamp will broadcast along dim=0
583
+ sample = torch.clamp(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s"
584
+
585
+ sample = sample.reshape(batch_size, channels, height, width)
586
+ sample = sample.to(dtype)
587
+
588
+ return sample
589
+
590
+ def set_timesteps(self, num_inference_steps: int, lcm_origin_steps: int, device: Union[str, torch.device] = None):
591
+ """
592
+ Sets the discrete timesteps used for the diffusion chain (to be run before inference).
593
+ Args:
594
+ num_inference_steps (`int`):
595
+ The number of diffusion steps used when generating samples with a pre-trained model.
596
+ """
597
+
598
+ if num_inference_steps > self.config.num_train_timesteps:
599
+ raise ValueError(
600
+ f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:"
601
+ f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle"
602
+ f" maximal {self.config.num_train_timesteps} timesteps."
603
+ )
604
+
605
+ self.num_inference_steps = num_inference_steps
606
+
607
+ # LCM Timesteps Setting: # Linear Spacing
608
+ c = self.config.num_train_timesteps // lcm_origin_steps
609
+ lcm_origin_timesteps = np.asarray(list(range(1, lcm_origin_steps + 1))) * c - 1 # LCM Training Steps Schedule
610
+ skipping_step = len(lcm_origin_timesteps) // num_inference_steps
611
+ timesteps = lcm_origin_timesteps[::-skipping_step][:num_inference_steps] # LCM Inference Steps Schedule
612
+
613
+ self.timesteps = torch.from_numpy(timesteps.copy()).to(device)
614
+
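As a concrete example of the schedule this produces: with the default `num_train_timesteps=1000`, `lcm_origin_steps=50` and `num_inference_steps=4`, the computation above reduces to the following (reproduced outside the class purely for illustration):

```python
import numpy as np

num_train_timesteps, lcm_origin_steps, num_inference_steps = 1000, 50, 4
c = num_train_timesteps // lcm_origin_steps                                  # 20
lcm_origin_timesteps = np.asarray(list(range(1, lcm_origin_steps + 1))) * c - 1  # 19, 39, ..., 999
skipping_step = len(lcm_origin_timesteps) // num_inference_steps             # 12
print(lcm_origin_timesteps[::-skipping_step][:num_inference_steps])          # [999 759 519 279]
```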
615
+ def get_scalings_for_boundary_condition_discrete(self, t):
616
+ self.sigma_data = 0.5 # Default: 0.5
617
+
618
+ # By dividing 0.1: This is almost a delta function at t=0.
619
+ c_skip = self.sigma_data**2 / ((t / 0.1) ** 2 + self.sigma_data**2)
620
+ c_out = (t / 0.1) / ((t / 0.1) ** 2 + self.sigma_data**2) ** 0.5
621
+ return c_skip, c_out
622
+
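These scalings enforce the consistency-model boundary condition: as `t` approaches 0, `c_skip` tends to 1 and `c_out` to 0, so the output collapses onto the (already clean) sample, while at large `t` the `pred_x0` term dominates. A quick numeric check of the formula above:

```python
sigma_data = 0.5

def scalings(t: float):
    c_skip = sigma_data**2 / ((t / 0.1) ** 2 + sigma_data**2)
    c_out = (t / 0.1) / ((t / 0.1) ** 2 + sigma_data**2) ** 0.5
    return c_skip, c_out

print(scalings(0))    # (1.0, 0.0) -> pure skip connection at t = 0
print(scalings(999))  # (~2.5e-9, ~1.0) -> almost entirely the x0 prediction
```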
623
+ def step(
624
+ self,
625
+ model_output: torch.FloatTensor,
626
+ timeindex: int,
627
+ timestep: int,
628
+ sample: torch.FloatTensor,
629
+ eta: float = 0.0,
630
+ use_clipped_model_output: bool = False,
631
+ generator=None,
632
+ variance_noise: Optional[torch.FloatTensor] = None,
633
+ return_dict: bool = True,
634
+ ) -> Union[LCMSchedulerOutput, Tuple]:
635
+ """
636
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
637
+ process from the learned model outputs (most often the predicted noise).
638
+ Args:
639
+ model_output (`torch.FloatTensor`):
640
+ The direct output from learned diffusion model.
641
+ timestep (`float`):
642
+ The current discrete timestep in the diffusion chain.
643
+ sample (`torch.FloatTensor`):
644
+ A current instance of a sample created by the diffusion process.
645
+ eta (`float`):
646
+ The weight of noise for added noise in diffusion step.
647
+ use_clipped_model_output (`bool`, defaults to `False`):
648
+ If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary
649
+ because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no
650
+ clipping has happened, "corrected" `model_output` would coincide with the one provided as input and
651
+ `use_clipped_model_output` has no effect.
652
+ generator (`torch.Generator`, *optional*):
653
+ A random number generator.
654
+ variance_noise (`torch.FloatTensor`):
655
+ Alternative to generating noise with `generator` by directly providing the noise for the variance
656
+ itself. Useful for methods such as [`CycleDiffusion`].
657
+ return_dict (`bool`, *optional*, defaults to `True`):
658
+ Whether or not to return a [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] or `tuple`.
659
+ Returns:
660
+ [`~schedulers.scheduling_utils.LCMSchedulerOutput`] or `tuple`:
661
+ If return_dict is `True`, [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] is returned, otherwise a
662
+ tuple is returned where the first element is the sample tensor.
663
+ """
664
+ if self.num_inference_steps is None:
665
+ raise ValueError(
666
+ "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
667
+ )
668
+
669
+ # 1. get previous step value
670
+ prev_timeindex = timeindex + 1
671
+ if prev_timeindex < len(self.timesteps):
672
+ prev_timestep = self.timesteps[prev_timeindex]
673
+ else:
674
+ prev_timestep = timestep
675
+
676
+ # 2. compute alphas, betas
677
+ alpha_prod_t = self.alphas_cumprod[timestep]
678
+ alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
679
+
680
+ beta_prod_t = 1 - alpha_prod_t
681
+ beta_prod_t_prev = 1 - alpha_prod_t_prev
682
+
683
+ # 3. Get scalings for boundary conditions
684
+ c_skip, c_out = self.get_scalings_for_boundary_condition_discrete(timestep)
685
+
686
+ # 4. Different Parameterization:
687
+ parameterization = self.config.prediction_type
688
+
689
+ if parameterization == "epsilon": # noise-prediction
690
+ pred_x0 = (sample - beta_prod_t.sqrt() * model_output) / alpha_prod_t.sqrt()
691
+
692
+ elif parameterization == "sample": # x-prediction
693
+ pred_x0 = model_output
694
+
695
+ elif parameterization == "v_prediction": # v-prediction
696
+ pred_x0 = alpha_prod_t.sqrt() * sample - beta_prod_t.sqrt() * model_output
697
+
698
+ # 4. Denoise model output using boundary conditions
699
+ denoised = c_out * pred_x0 + c_skip * sample
700
+
701
+ # 5. Sample z ~ N(0, I), For MultiStep Inference
702
+ # Noise is not used for one-step sampling.
703
+ if len(self.timesteps) > 1:
704
+ noise = torch.randn(model_output.shape).to(model_output.device)
705
+ prev_sample = alpha_prod_t_prev.sqrt() * denoised + beta_prod_t_prev.sqrt() * noise
706
+ else:
707
+ prev_sample = denoised
708
+
709
+ if not return_dict:
710
+ return (prev_sample, denoised)
711
+
712
+ return LCMSchedulerOutput(prev_sample=prev_sample, denoised=denoised)
713
+
714
+ # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise
715
+ def add_noise(
716
+ self,
717
+ original_samples: torch.FloatTensor,
718
+ noise: torch.FloatTensor,
719
+ timesteps: torch.IntTensor,
720
+ ) -> torch.FloatTensor:
721
+ # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
722
+ alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype)
723
+ timesteps = timesteps.to(original_samples.device)
724
+
725
+ sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
726
+ sqrt_alpha_prod = sqrt_alpha_prod.flatten()
727
+ while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
728
+ sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
729
+
730
+ sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
731
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
732
+ while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
733
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
734
+
735
+ noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
736
+ return noisy_samples
737
+
738
+ # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
739
+ def get_velocity(
740
+ self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor
741
+ ) -> torch.FloatTensor:
742
+ # Make sure alphas_cumprod and timestep have same device and dtype as sample
743
+ alphas_cumprod = self.alphas_cumprod.to(device=sample.device, dtype=sample.dtype)
744
+ timesteps = timesteps.to(sample.device)
745
+
746
+ sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
747
+ sqrt_alpha_prod = sqrt_alpha_prod.flatten()
748
+ while len(sqrt_alpha_prod.shape) < len(sample.shape):
749
+ sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
750
+
751
+ sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
752
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
753
+ while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape):
754
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
755
+
756
+ velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
757
+ return velocity
758
+
759
+ def __len__(self):
760
+ return self.config.num_train_timesteps
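The scheduler can also be exercised on its own; a minimal sketch of the multi-step loop that `__call__` drives above, using dummy tensors on CPU and assumed hyperparameters:

```python
import torch

scheduler = LCMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
scheduler.set_timesteps(num_inference_steps=4, lcm_origin_steps=50)

sample = torch.randn(1, 4, 64, 64)              # stand-in for a 512x512 image latent
for i, t in enumerate(scheduler.timesteps):
    model_output = torch.randn_like(sample)     # stand-in for the UNet's epsilon prediction
    sample, denoised = scheduler.step(model_output, i, t, sample, return_dict=False)
# After the last step, `denoised` is what the pipeline decodes with the VAE.
```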
requirements.txt ADDED
@@ -0,0 +1,23 @@
+ wheel
+ setuptools
+
+ torch==2.1.0
+ diffusers
+ Pillow
+ typing_extensions
+ sympy
+ mpmath
+ packaging
+ huggingface_hub
+ tqdm
+ flask
+ requests
+ urllib3
+ PyYAML
+ filelock
+ numpy
+ safetensors
+ transformers
+ regex
+ accelerate
+ gradio