Update scheduler and pipeline code to be closer to diffusers version
- Update scheduler config to latest version
- Create and use torch.Generator seeded from np.random.RandomState to fix custom seeds (see the sketch after the file list)
- Properly override __init__ with edited OVModelUnet
- Use official _encode_prompt
- Rename get_w_embedding to get_guidance_scale_embedding and use guidance_scale - 1
- Update README.md

Files changed:
- README.md (+1 -1)
- lcm_ov_pipeline.py (+90 -73)
- lcm_scheduler.py (+140 -90)
- model_index.json (+2 -2)
- scheduler/scheduler_config.json (+4 -4)
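The custom-seed fix can be summarized with a small standalone sketch (illustrative only; the seed value and tensor shape are made up): `LCMScheduler.step()` now draws noise through torch's `randn_tensor`, so the pipeline derives a `torch.Generator` from the user's `np.random.RandomState` to keep a given seed reproducible.

```python
import numpy as np
import torch

seed = 42  # illustrative seed
np_generator = np.random.RandomState(seed)

# Same derivation as in lcm_ov_pipeline.py: seed a torch.Generator from the first
# word of the numpy Mersenne Twister state, which is deterministic for a given seed.
torch_generator = torch.Generator().manual_seed(int(np_generator.get_state()[1][0]))

# The torch generator is forwarded to LCMScheduler.step() via extra_step_kwargs, so
# the noise injected between LCM steps is reproducible as well.
noise = torch.randn((1, 4, 64, 64), generator=torch_generator)
print(noise.flatten()[0])  # identical across runs with the same seed
```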
README.md  CHANGED
@@ -73,5 +73,5 @@ num_inference_steps = 4
 pipe.reshape(batch_size=batch_size, height=height, width=width, num_images_per_prompt=num_images)
 pipe.compile()
 
-images = pipe(prompt=prompt, width=width, height=height, num_inference_steps=num_inference_steps, guidance_scale=8.0,
+images = pipe(prompt=prompt, width=width, height=height, num_inference_steps=num_inference_steps, guidance_scale=8.0, output_type="pil").images
 ```
lcm_ov_pipeline.py  CHANGED
@@ -9,7 +9,15 @@ import openvino
 import torch
 
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
-from optimum.intel.openvino.modeling_diffusion import OVStableDiffusionPipeline, OVModelUnet
+from optimum.intel.openvino.modeling_diffusion import OVStableDiffusionPipeline, OVModelUnet, OVModelVaeDecoder, OVModelTextEncoder, OVModelVaeEncoder, VaeImageProcessor
+from optimum.utils import (
+    DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER,
+    DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER,
+    DIFFUSION_MODEL_UNET_SUBFOLDER,
+    DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER,
+    DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER,
+)
+
 
 from diffusers import logging
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
@@ -43,6 +51,7 @@ class LCMOVModelUnet(OVModelUnet):
         return list(outputs.values())
 
 class OVLatentConsistencyModelPipeline(OVStableDiffusionPipeline):
+
     def __init__(
         self,
         vae_decoder: openvino.runtime.Model,
@@ -62,9 +71,56 @@ class OVLatentConsistencyModelPipeline(OVStableDiffusionPipeline):
         model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
         **kwargs,
     ):
-
+        self._internal_dict = config
+        self._device = device.upper()
+        self.is_dynamic = dynamic_shapes
+        self.ov_config = ov_config if ov_config is not None else {}
+        self._model_save_dir = (
+            Path(model_save_dir.name) if isinstance(model_save_dir, TemporaryDirectory) else model_save_dir
+        )
+        self.vae_decoder = OVModelVaeDecoder(vae_decoder, self)
         self.unet = LCMOVModelUnet(unet, self)
+        self.text_encoder = OVModelTextEncoder(text_encoder, self) if text_encoder is not None else None
+        self.text_encoder_2 = (
+            OVModelTextEncoder(text_encoder_2, self, model_name=DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER)
+            if text_encoder_2 is not None
+            else None
+        )
+        self.vae_encoder = OVModelVaeEncoder(vae_encoder, self) if vae_encoder is not None else None
+
+        if "block_out_channels" in self.vae_decoder.config:
+            self.vae_scale_factor = 2 ** (len(self.vae_decoder.config["block_out_channels"]) - 1)
+        else:
+            self.vae_scale_factor = 8
+
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+        self.tokenizer = tokenizer
+        self.tokenizer_2 = tokenizer_2
+        self.scheduler = scheduler
+        self.feature_extractor = feature_extractor
+        self.safety_checker = None
+        self.preprocessors = []
+
+        if self.is_dynamic:
+            self.reshape(batch_size=-1, height=-1, width=-1, num_images_per_prompt=-1)
+
+        if compile:
+            self.compile()
+
+        sub_models = {
+            DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER: self.text_encoder,
+            DIFFUSION_MODEL_UNET_SUBFOLDER: self.unet,
+            DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER: self.vae_decoder,
+            DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER: self.vae_encoder,
+            DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER: self.text_encoder_2,
+        }
+        for name in sub_models.keys():
+            self._internal_dict[name] = (
+                ("optimum", sub_models[name].__class__.__name__) if sub_models[name] is not None else (None, None)
+            )
+
+        self._internal_dict.pop("vae", None)
 
     def _reshape_unet(
         self,
@@ -110,63 +166,7 @@ class OVLatentConsistencyModelPipeline(OVStableDiffusionPipeline):
         model.reshape(shapes)
         return model
 
-    def _encode_prompt(
-        self,
-        prompt: Union[str, List[str]],
-        num_images_per_prompt: Optional[int],
-        prompt_embeds: Optional[np.ndarray] = None,
-    ):
-        r"""
-        Encodes the prompt into text encoder hidden states.
-
-        Args:
-            prompt (`str` or `List[str]`):
-                prompt to be encoded
-            num_images_per_prompt (`int`):
-                number of images that should be generated per prompt
-            prompt_embeds (`np.ndarray`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-        """
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            batch_size = prompt_embeds.shape[0]
-
-        if prompt_embeds is None:
-            # get prompt text embeddings
-            text_inputs = self.tokenizer(
-                prompt,
-                padding="max_length",
-                max_length=self.tokenizer.model_max_length,
-                truncation=True,
-                return_tensors="np",
-            )
-            text_input_ids = text_inputs.input_ids
-            untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="np").input_ids
-
-            if not np.array_equal(text_input_ids, untruncated_ids):
-                removed_text = self.tokenizer.batch_decode(
-                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
-                )
-                logger.warning(
-                    "The following part of your input was truncated because CLIP can only handle sequences up to"
-                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-                )
-
-            prompt_embeds = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]
-
-        bs_embed, seq_len, _ = prompt_embeds.shape
-
-        prompt_embeds = np.tile(prompt_embeds, [1, num_images_per_prompt, 1])
-        prompt_embeds = np.reshape(prompt_embeds, [bs_embed * num_images_per_prompt, seq_len, -1])
-
-        return prompt_embeds
-
-    def get_w_embedding(self, w, embedding_dim=512, dtype=np.float32):
+    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=np.float32):
         """
         see https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
         Args:
@@ -197,7 +197,7 @@ class OVLatentConsistencyModelPipeline(OVStableDiffusionPipeline):
         height: Optional[int] = None,
         width: Optional[int] = None,
         num_inference_steps: int = 4,
-
+        original_inference_steps: int = None,
         guidance_scale: float = 7.5,
         num_images_per_prompt: int = 1,
         eta: float = 0.0,
@@ -224,8 +224,11 @@ class OVLatentConsistencyModelPipeline(OVStableDiffusionPipeline):
             num_inference_steps (`int`, defaults to 4):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
-
-                The number of
+            original_inference_steps (`int`, *optional*):
+                The original number of inference steps use to generate a linearly-spaced timestep schedule, from which
+                we will draw `num_inference_steps` evenly spaced timesteps from as our final timestep schedule,
+                following the Skipping-Step method in the paper (see Section 4.3). If not set this will default to the
+                scheduler's `original_inference_steps` attribute.
             guidance_scale (`float`, defaults to 7.5):
                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                 `guidance_scale` is defined as `w` of equation 2. of [Imagen
@@ -290,14 +293,25 @@ class OVLatentConsistencyModelPipeline(OVStableDiffusionPipeline):
         if generator is None:
             generator = np.random
 
+        # Create torch.Generator instance with same state as np.random.RandomState
+        torch_generator = torch.Generator().manual_seed(int(generator.get_state()[1][0]))
+
+        #do_classifier_free_guidance = guidance_scale > 1.0
+
+        # NOTE: when a LCM is distilled from an LDM via latent consistency distillation (Algorithm 1) with guided
+        # distillation, the forward pass of the LCM learns to approximate sampling from the LDM using CFG with the
+        # unconditional prompt "" (the empty string). Due to this, LCMs currently do not support negative prompts.
         prompt_embeds = self._encode_prompt(
             prompt,
            num_images_per_prompt,
+            False,
+            negative_prompt=None,
            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=None,
         )
 
         # set timesteps
-        self.scheduler.set_timesteps(num_inference_steps,
+        self.scheduler.set_timesteps(num_inference_steps, "cpu", original_inference_steps=original_inference_steps)
         timesteps = self.scheduler.timesteps
 
         latents = self.prepare_latents(
@@ -310,6 +324,13 @@ class OVLatentConsistencyModelPipeline(OVStableDiffusionPipeline):
             latents,
         )
 
+        # Get Guidance Scale Embedding
+        w = np.tile(guidance_scale - 1, batch_size * num_images_per_prompt)
+        w_embedding = self.get_guidance_scale_embedding(w, embedding_dim=self.unet.config.get("time_cond_proj_dim", 256))
+
+        # Adapted from diffusers to extend it for other runtimes than ORT
+        timestep_dtype = self.unet.input_dtype.get("timestep", np.float32)
+
         # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
         # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
         # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
@@ -319,13 +340,9 @@ class OVLatentConsistencyModelPipeline(OVStableDiffusionPipeline):
         if accepts_eta:
             extra_step_kwargs["eta"] = eta
 
-
-        # Get Guidance Scale Embedding
-        w = np.tile(guidance_scale, batch_size * num_images_per_prompt)
-        w_embedding = self.get_w_embedding(w, embedding_dim=self.unet.config.get("time_cond_proj_dim", 256))
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = torch_generator
 
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
         for i, t in enumerate(self.progress_bar(timesteps)):
@@ -333,11 +350,11 @@ class OVLatentConsistencyModelPipeline(OVStableDiffusionPipeline):
             # predict the noise residual
             timestep = np.array([t], dtype=timestep_dtype)
 
-            noise_pred = self.unet(sample=latents, timestep=timestep,
+            noise_pred = self.unet(sample=latents, timestep=timestep, timestep_cond = w_embedding, encoder_hidden_states=prompt_embeds)[0]
 
             # compute the previous noisy sample x_t -> x_t-1
             latents, denoised = self.scheduler.step(
-                torch.from_numpy(noise_pred),
+                torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs, return_dict = False
             )
 
             latents, denoised = latents.numpy(), denoised.numpy()
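The body of get_guidance_scale_embedding is unchanged context, so the hunk above only shows the rename and the new call site with `guidance_scale - 1`. For orientation, here is a minimal NumPy sketch of the sinusoidal embedding such a function computes, following the diffusers reference implementation; the exact body in this repo may differ slightly, and the values below are illustrative.

```python
import numpy as np

def get_guidance_scale_embedding(w, embedding_dim=512, dtype=np.float32):
    # Sinusoidal embedding of the guidance scale w, shape (batch,) -> (batch, embedding_dim),
    # in the style of https://github.com/google-research/vdm (model_vdm.py#L298).
    w = np.asarray(w, dtype=dtype) * 1000.0
    half_dim = embedding_dim // 2
    emb = np.log(10000.0) / (half_dim - 1)
    emb = np.exp(np.arange(half_dim, dtype=dtype) * -emb)
    emb = w[:, None] * emb[None, :]
    emb = np.concatenate([np.sin(emb), np.cos(emb)], axis=1)
    if embedding_dim % 2 == 1:  # zero-pad if embedding_dim is odd
        emb = np.pad(emb, [(0, 0), (0, 1)])
    return emb.astype(dtype)

# Mirrors the call site in __call__: embed w = guidance_scale - 1 for every image in the batch,
# with embedding_dim taken from the UNet's time_cond_proj_dim (256 by default here).
w = np.tile(8.0 - 1, 1 * 1)
w_embedding = get_guidance_scale_embedding(w, embedding_dim=256)
print(w_embedding.shape)  # (1, 256)
```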
lcm_scheduler.py  CHANGED
@@ -22,13 +22,16 @@ from typing import List, Optional, Tuple, Union
 import numpy as np
 import torch
 
-from diffusers import ConfigMixin,
-from diffusers.
-from diffusers.utils import
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.utils import BaseOutput, logging
+from diffusers.utils.torch_utils import randn_tensor
+from diffusers.schedulers.scheduling_utils import SchedulerMixin
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
 
 @dataclass
-# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM
 class LCMSchedulerOutput(BaseOutput):
     """
     Output class for the scheduler's `step` function output.
@@ -91,7 +94,8 @@ def betas_for_alpha_bar(
     return torch.tensor(betas, dtype=torch.float32)
 
 
-
+# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr
+def rescale_zero_terminal_snr(betas: torch.FloatTensor) -> torch.FloatTensor:
     """
     Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1)
 
@@ -132,8 +136,10 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
     `LCMScheduler` extends the denoising procedure introduced in denoising diffusion probabilistic models (DDPMs) with
     non-Markovian guidance.
 
-    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`].
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. [`~ConfigMixin`] takes care of storing all config
+    attributes that are passed in the scheduler's `__init__` function, such as `num_train_timesteps`. They can be
+    accessed via `scheduler.config.num_train_timesteps`. [`SchedulerMixin`] provides general loading and saving
+    functionality via the [`SchedulerMixin.save_pretrained`] and [`~SchedulerMixin.from_pretrained`] functions.
 
     Args:
         num_train_timesteps (`int`, defaults to 1000):
@@ -147,6 +153,9 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
             `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
         trained_betas (`np.ndarray`, *optional*):
             Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
+        original_inference_steps (`int`, *optional*, defaults to 50):
+            The default number of inference steps used to generate a linearly-spaced timestep schedule, from which we
+            will ultimately take `num_inference_steps` evenly spaced timesteps to form the final timestep schedule.
         clip_sample (`bool`, defaults to `True`):
             Clip the predicted sample for numerical stability.
         clip_sample_range (`float`, defaults to 1.0):
@@ -179,24 +188,24 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
        [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
     """
 
-    # _compatibles = [e.name for e in KarrasDiffusionSchedulers]
     order = 1
 
     @register_to_config
     def __init__(
         self,
         num_train_timesteps: int = 1000,
-        beta_start: float = 0.
-        beta_end: float = 0.
-        beta_schedule: str = "
+        beta_start: float = 0.00085,
+        beta_end: float = 0.012,
+        beta_schedule: str = "scaled_linear",
         trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
-
+        original_inference_steps: int = 50,
+        clip_sample: bool = False,
+        clip_sample_range: float = 1.0,
         set_alpha_to_one: bool = True,
         steps_offset: int = 0,
         prediction_type: str = "epsilon",
         thresholding: bool = False,
         dynamic_thresholding_ratio: float = 0.995,
-        clip_sample_range: float = 1.0,
         sample_max_value: float = 1.0,
         timestep_spacing: str = "leading",
         rescale_betas_zero_snr: bool = False,
@@ -236,6 +245,30 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
         self.num_inference_steps = None
         self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64))
 
+        self._step_index = None
+
+    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
+    def _init_step_index(self, timestep):
+        if isinstance(timestep, torch.Tensor):
+            timestep = timestep.to(self.timesteps.device)
+
+        index_candidates = (self.timesteps == timestep).nonzero()
+
+        # The sigma index that is taken for the **very** first `step`
+        # is always the second index (or the last index if there is only 1)
+        # This way we can ensure we don't accidentally skip a sigma in
+        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
+        if len(index_candidates) > 1:
+            step_index = index_candidates[1]
+        else:
+            step_index = index_candidates[0]
+
+        self._step_index = step_index.item()
+
+    @property
+    def step_index(self):
+        return self._step_index
+
     def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor:
         """
         Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
@@ -246,23 +279,12 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
                 The input sample.
             timestep (`int`, *optional*):
                 The current timestep in the diffusion chain.
         Returns:
             `torch.FloatTensor`:
                 A scaled input sample.
         """
         return sample
 
-    def _get_variance(self, timestep, prev_timestep):
-        alpha_prod_t = self.alphas_cumprod[timestep]
-        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
-        beta_prod_t = 1 - alpha_prod_t
-        beta_prod_t_prev = 1 - alpha_prod_t_prev
-
-        variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
-
-        return variance
-
     # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
     def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor:
         """
@@ -275,13 +297,13 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
         https://arxiv.org/abs/2205.11487
         """
         dtype = sample.dtype
-        batch_size, channels,
+        batch_size, channels, *remaining_dims = sample.shape
 
         if dtype not in (torch.float32, torch.float64):
             sample = sample.float()  # upcast for quantile calculation, and clamp not implemented for cpu half
 
         # Flatten sample for doing quantile calculation along each image
-        sample = sample.reshape(batch_size, channels *
+        sample = sample.reshape(batch_size, channels * np.prod(remaining_dims))
 
         abs_sample = sample.abs()  # "a certain percentile absolute pixel value"
 
@@ -289,22 +311,33 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
         s = torch.clamp(
             s, min=1, max=self.config.sample_max_value
         )  # When clamped to min=1, equivalent to standard clipping to [-1, 1]
         s = s.unsqueeze(1)  # (batch_size, 1) because clamp will broadcast along dim=0
         sample = torch.clamp(sample, -s, s) / s  # "we threshold xt0 to the range [-s, s] and then divide by s"
 
-        sample = sample.reshape(batch_size, channels,
+        sample = sample.reshape(batch_size, channels, *remaining_dims)
         sample = sample.to(dtype)
 
         return sample
 
-    def set_timesteps(
+    def set_timesteps(
+        self,
+        num_inference_steps: int,
+        device: Union[str, torch.device] = None,
+        original_inference_steps: Optional[int] = None,
+    ):
         """
         Sets the discrete timesteps used for the diffusion chain (to be run before inference).
 
         Args:
             num_inference_steps (`int`):
                 The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+            original_inference_steps (`int`, *optional*):
+                The original number of inference steps, which will be used to generate a linearly-spaced timestep
+                schedule (which is different from the standard `diffusers` implementation). We will then take
+                `num_inference_steps` timesteps from this schedule, evenly spaced in terms of indices, and use that as
+                our final timestep schedule. If not set, this will default to the `original_inference_steps` attribute.
         """
 
         if num_inference_steps > self.config.num_train_timesteps:
@@ -315,36 +348,51 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
             )
 
         self.num_inference_steps = num_inference_steps
-
+        original_steps = (
+            original_inference_steps if original_inference_steps is not None else self.original_inference_steps
+        )
+
+        if original_steps > self.config.num_train_timesteps:
+            raise ValueError(
+                f"`original_steps`: {original_steps} cannot be larger than `self.config.train_timesteps`:"
+                f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle"
+                f" maximal {self.config.num_train_timesteps} timesteps."
+            )
+
+        if num_inference_steps > original_steps:
+            raise ValueError(
+                f"`num_inference_steps`: {num_inference_steps} cannot be larger than `original_inference_steps`:"
+                f" {original_steps} because the final timestep schedule will be a subset of the"
+                f" `original_inference_steps`-sized initial timestep schedule."
+            )
+
+        # LCM Timesteps Setting
+        # Currently, only linear spacing is supported.
+        c = self.config.num_train_timesteps // original_steps
+        # LCM Training Steps Schedule
+        lcm_origin_timesteps = np.asarray(list(range(1, original_steps + 1))) * c - 1
         skipping_step = len(lcm_origin_timesteps) // num_inference_steps
-
+        # LCM Inference Steps Schedule
+        timesteps = lcm_origin_timesteps[::-skipping_step][:num_inference_steps]
+
+        self.timesteps = torch.from_numpy(timesteps.copy()).to(device=device, dtype=torch.long)
+
+        self._step_index = None
 
     def get_scalings_for_boundary_condition_discrete(self, t):
-        self.sigma_data = 0.5
-
-        # By dividing 0.1: This is almost a delta function at t=0.
-        c_skip = self.sigma_data**2 / (
-
-        )
-        c_out = (( t / 0.1) / ((t / 0.1) **2 + self.sigma_data**2) ** 0.5)
+        self.sigma_data = 0.5  # Default: 0.5
+
+        # By dividing 0.1: This is almost a delta function at t=0.
+        c_skip = self.sigma_data**2 / ((t / 0.1) ** 2 + self.sigma_data**2)
+        c_out = (t / 0.1) / ((t / 0.1) ** 2 + self.sigma_data**2) ** 0.5
         return c_skip, c_out
 
     def step(
         self,
         model_output: torch.FloatTensor,
-        timeindex: int,
         timestep: int,
         sample: torch.FloatTensor,
-
-        use_clipped_model_output: bool = False,
-        generator=None,
-        variance_noise: Optional[torch.FloatTensor] = None,
+        generator: Optional[torch.Generator] = None,
         return_dict: bool = True,
     ) -> Union[LCMSchedulerOutput, Tuple]:
         """
@@ -358,77 +406,79 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
                 The current discrete timestep in the diffusion chain.
             sample (`torch.FloatTensor`):
                 A current instance of a sample created by the diffusion process.
-            eta (`float`):
-                The weight of noise for added noise in diffusion step.
-            use_clipped_model_output (`bool`, defaults to `False`):
-                If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary
-                because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no
-                clipping has happened, "corrected" `model_output` would coincide with the one provided as input and
-                `use_clipped_model_output` has no effect.
             generator (`torch.Generator`, *optional*):
                 A random number generator.
-            variance_noise (`torch.FloatTensor`):
-                Alternative to generating noise with `generator` by directly providing the noise for the variance
-                itself. Useful for methods such as [`CycleDiffusion`].
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] or `tuple`.
         Returns:
             [`~schedulers.scheduling_utils.LCMSchedulerOutput`] or `tuple`:
                 If return_dict is `True`, [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] is returned, otherwise a
                 tuple is returned where the first element is the sample tensor.
         """
         if self.num_inference_steps is None:
             raise ValueError(
                 "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
             )
+
+        if self.step_index is None:
+            self._init_step_index(timestep)
+
         # 1. get previous step value
-        if
-            prev_timestep = self.timesteps[
+        prev_step_index = self.step_index + 1
+        if prev_step_index < len(self.timesteps):
+            prev_timestep = self.timesteps[prev_step_index]
         else:
             prev_timestep = timestep
+
         # 2. compute alphas, betas
         alpha_prod_t = self.alphas_cumprod[timestep]
         alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
+
         beta_prod_t = 1 - alpha_prod_t
         beta_prod_t_prev = 1 - alpha_prod_t_prev
+
         # 3. Get scalings for boundary conditions
         c_skip, c_out = self.get_scalings_for_boundary_condition_discrete(timestep)
-
-        # 4.
-
+
+        # 4. Compute the predicted original sample x_0 based on the model parameterization
+        if self.config.prediction_type == "epsilon":  # noise-prediction
+            predicted_original_sample = (sample - beta_prod_t.sqrt() * model_output) / alpha_prod_t.sqrt()
+        elif self.config.prediction_type == "sample":  # x-prediction
+            predicted_original_sample = model_output
+        elif self.config.prediction_type == "v_prediction":  # v-prediction
+            predicted_original_sample = alpha_prod_t.sqrt() * sample - beta_prod_t.sqrt() * model_output
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` or"
+                " `v_prediction` for `LCMScheduler`."
+            )
+
+        # 5. Clip or threshold "predicted x_0"
+        if self.config.thresholding:
+            predicted_original_sample = self._threshold_sample(predicted_original_sample)
+        elif self.config.clip_sample:
+            predicted_original_sample = predicted_original_sample.clamp(
+                -self.config.clip_sample_range, self.config.clip_sample_range
+            )
+
+        # 6. Denoise model output using boundary conditions
+        denoised = c_out * predicted_original_sample + c_skip * sample
+
+        # 7. Sample and inject noise z ~ N(0, I) for MultiStep Inference
         # Noise is not used for one-step sampling.
         if len(self.timesteps) > 1:
-            noise =
+            noise = randn_tensor(model_output.shape, generator=generator, device=model_output.device)
             prev_sample = alpha_prod_t_prev.sqrt() * denoised + beta_prod_t_prev.sqrt() * noise
         else:
            prev_sample = denoised
+
+        # upon completion increase step index by one
+        self._step_index += 1
+
         if not return_dict:
             return (prev_sample, denoised)
+
         return LCMSchedulerOutput(prev_sample=prev_sample, denoised=denoised)
 
     # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise
     def add_noise(
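As a quick check of the new schedule, this small sketch reproduces the skipping-step timestep selection from set_timesteps() above, using the config defaults (num_train_timesteps=1000, original_inference_steps=50) and the pipeline default num_inference_steps=4; the printed values follow directly from that arithmetic.

```python
import numpy as np

num_train_timesteps = 1000
original_steps = 50        # original_inference_steps from the scheduler config
num_inference_steps = 4

# Same arithmetic as LCMScheduler.set_timesteps() above.
c = num_train_timesteps // original_steps                                       # 20
lcm_origin_timesteps = np.asarray(list(range(1, original_steps + 1))) * c - 1   # 19, 39, ..., 999
skipping_step = len(lcm_origin_timesteps) // num_inference_steps                # 12
timesteps = lcm_origin_timesteps[::-skipping_step][:num_inference_steps]
print(timesteps)  # [999 759 519 279]
```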
model_index.json  CHANGED
@@ -12,8 +12,8 @@
         "StableDiffusionSafetyChecker"
     ],
     "scheduler": [
-
-
+        null,
+        null
     ],
     "text_encoder": [
         "transformers",
scheduler/scheduler_config.json  CHANGED
@@ -1,6 +1,6 @@
 {
-  "_class_name": "
-  "_diffusers_version": "0.
+  "_class_name": "LCMScheduler",
+  "_diffusers_version": "0.22.0.dev0",
   "beta_end": 0.012,
   "beta_schedule": "scaled_linear",
   "beta_start": 0.00085,
@@ -8,12 +8,12 @@
   "clip_sample_range": 1.0,
   "dynamic_thresholding_ratio": 0.995,
   "num_train_timesteps": 1000,
+  "original_inference_steps": 50,
   "prediction_type": "epsilon",
   "rescale_betas_zero_snr": false,
   "sample_max_value": 1.0,
   "set_alpha_to_one": true,
-  "
-  "steps_offset": 1,
+  "steps_offset": 0,
   "thresholding": false,
   "timestep_spacing": "leading",
   "trained_betas": null