neural-ti committed
Commit 1f53cbd
1 Parent(s): 891bcde

Delete src
src/__init__.py DELETED
File without changes
src/checkpoint_handler.py DELETED
@@ -1,107 +0,0 @@
- from pathlib import Path
- from typing import Tuple
-
- import pyrallis
- import torch
- from accelerate import Accelerator
- from torch import nn
- from transformers import CLIPTokenizer
-
- from src.models.neti_clip_text_encoder import NeTICLIPTextModel
- from src.models.neti_mapper import NeTIMapper
- from src.models.positional_encoding import NeTIPositionalEncoding, BasicEncoder
- from src.config import RunConfig
-
-
- class CheckpointHandler:
-
-     def __init__(self, cfg: RunConfig, placeholder_token_string: str, placeholder_token_id: int, save_root: Path):
-         self.cfg = cfg
-         self.placeholder_token_string = placeholder_token_string
-         self.placeholder_token_id = placeholder_token_id
-         self.save_root = save_root
-
-     def save_model(self, text_encoder: NeTICLIPTextModel,
-                    accelerator: Accelerator,
-                    embeds_save_name: str,
-                    mapper_save_name: str):
-         self.save_learned_embeds(text_encoder, accelerator, embeds_save_name)
-         self.save_mapper(text_encoder, mapper_save_name)
-
-     def save_learned_embeds(self, text_encoder: NeTICLIPTextModel, accelerator: Accelerator, save_name: str):
-         """
-         Save learned embeddings. This embedding isn't really learned, but we'll add it to the tokenizer at inference
-         to take the place of our placeholder token.
-         """
-         learned_embeds = accelerator.unwrap_model(text_encoder).get_input_embeddings().weight[self.placeholder_token_id]
-         learned_embeds = learned_embeds.detach().cpu()
-         learned_embeds_dict = {self.placeholder_token_string: learned_embeds}
-         torch.save(learned_embeds_dict, self.save_root / save_name)
-
-     def save_mapper(self, text_encoder: NeTICLIPTextModel, save_name: str):
-         """ Save the mapper and config to be used at inference. """
-         cfg_ = RunConfig(**self.cfg.__dict__.copy())
-         state_dict = {
-             "state_dict": text_encoder.text_model.embeddings.mapper.state_dict(),
-             "cfg": pyrallis.encode(cfg_),
-             "encoder": text_encoder.text_model.embeddings.mapper.encoder
-         }
-         torch.save(state_dict, self.save_root / save_name)
-
-     @staticmethod
-     def load_mapper(mapper_path: Path) -> Tuple[RunConfig, NeTIMapper]:
-         mapper_ckpt = torch.load(mapper_path, map_location="cpu")
-         cfg = pyrallis.decode(RunConfig, mapper_ckpt['cfg'])
-         neti_mapper = NeTIMapper(output_dim=768,
-                                  use_nested_dropout=cfg.model.use_nested_dropout,
-                                  nested_dropout_prob=cfg.model.nested_dropout_prob,
-                                  norm_scale=cfg.model.target_norm,
-                                  use_positional_encoding=cfg.model.use_positional_encoding,
-                                  num_pe_time_anchors=cfg.model.num_pe_time_anchors,
-                                  pe_sigmas=cfg.model.pe_sigmas,
-                                  output_bypass=cfg.model.output_bypass)
-         neti_mapper.load_state_dict(mapper_ckpt['state_dict'], strict=True)
-         encoder = mapper_ckpt['encoder']
-         if isinstance(encoder, NeTIPositionalEncoding):
-             encoder.w = nn.Parameter(mapper_ckpt['encoder'].w.cuda())
-         elif isinstance(encoder, BasicEncoder):
-             encoder.normalized_timesteps = mapper_ckpt['encoder'].normalized_timesteps.cuda()
-             encoder.normalized_unet_layers = mapper_ckpt['encoder'].normalized_unet_layers.cuda()
-         neti_mapper.encoder = encoder.cuda()
-         neti_mapper.cuda()
-         neti_mapper.eval()
-         return cfg, neti_mapper
-
-     @staticmethod
-     def load_learned_embed_in_clip(learned_embeds_path: Path,
-                                    text_encoder: NeTICLIPTextModel,
-                                    tokenizer: CLIPTokenizer) -> Tuple[str, int]:
-         loaded_learned_embeds = torch.load(learned_embeds_path, map_location="cpu")
-
-         # separate the tokens and the embeds
-         trained_tokens = list(loaded_learned_embeds.keys())
-         embeds = list(loaded_learned_embeds.values())
-
-         # cast to dtype of text_encoder
-         dtype = text_encoder.get_input_embeddings().weight.dtype
-         embeds = [e.to(dtype) for e in embeds]
-
-         # add the tokens to the tokenizer
-         num_added_tokens = tokenizer.add_tokens(trained_tokens)
-         if num_added_tokens == 0:
-             raise ValueError(f"The tokenizer already contains the token {trained_tokens[0]}. "
-                              f"Please pass a different `token` that is not already in the tokenizer.")
-
-         # resize the token embeddings
-         text_encoder.resize_token_embeddings(len(tokenizer))
-
-         # get the id for the token and assign the embeds
-         placeholder_token_ids = [tokenizer.convert_tokens_to_ids(t) for t in trained_tokens]
-
-         for idx, (token, token_id, embed) in enumerate(zip(trained_tokens, placeholder_token_ids, embeds)):
-             text_encoder.get_input_embeddings().weight.data[token_id] = embed
-
-         assert len(trained_tokens) == 1, "Only one placeholder token is supported"
-         placeholder_token = trained_tokens[0]
-         placeholder_token_id = placeholder_token_ids[0]
-         return placeholder_token, placeholder_token_id
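
For context, a minimal sketch of how these deleted helpers were wired together at inference time, mirroring src/scripts/inference.py below. The checkpoint paths are hypothetical (training saves files named mapper-steps-{N}.pt and learned_embeds-steps-{N}.bin), and a CUDA device is assumed since load_mapper moves the mapper to GPU:

    from pathlib import Path
    from transformers import CLIPTokenizer

    from src.checkpoint_handler import CheckpointHandler
    from src.models.neti_clip_text_encoder import NeTICLIPTextModel

    # Hypothetical paths for illustration only
    cfg, mapper = CheckpointHandler.load_mapper(Path("outputs/my_run/mapper-steps-1000.pt"))
    tokenizer = CLIPTokenizer.from_pretrained(cfg.model.pretrained_model_name_or_path, subfolder="tokenizer")
    text_encoder = NeTICLIPTextModel.from_pretrained(cfg.model.pretrained_model_name_or_path, subfolder="text_encoder")
    # Attach the mapper, then register the placeholder token and its embedding in CLIP
    text_encoder.text_model.embeddings.set_mapper(mapper)
    token, token_id = CheckpointHandler.load_learned_embed_in_clip(
        learned_embeds_path=Path("outputs/my_run/learned_embeds-steps-1000.bin"),
        text_encoder=text_encoder,
        tokenizer=tokenizer,
    )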
src/config.py DELETED
@@ -1,146 +0,0 @@
- from dataclasses import dataclass, field
- from pathlib import Path
- from typing import List, Optional, Dict
-
- from src.constants import VALIDATION_PROMPTS
- from src.utils.types import PESigmas
-
-
- @dataclass
- class LogConfig:
-     """ Parameters for logging and saving """
-     # Name of experiment. This will be the name of the output folder
-     exp_name: str
-     # The output directory where the model predictions and checkpoints will be written
-     exp_dir: Path = Path("./outputs")
-     # Save interval
-     save_steps: int = 250
-     # [TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to
-     # `output_dir/runs/**CURRENT_DATETIME_HOSTNAME`
-     logging_dir: Path = Path("logs")
-     # The integration to report the results to. Supported platforms are "tensorboard"
-     # (default), "wandb" and "comet_ml". Use "all" to report to all integrations.
-     report_to: str = "tensorboard"
-     # Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator`
-     checkpoints_total_limit: Optional[int] = None
-
-
- @dataclass
- class DataConfig:
-     """ Parameters for data """
-     # A folder containing the training data
-     train_data_dir: Path
-     # A token to use as a placeholder for the concept
-     placeholder_token: str
-     # Super category token to use for normalizing the mapper output
-     super_category_token: Optional[str] = "object"
-     # Number of subprocesses to use for data loading. 0 means the data will be loaded in the main process
-     dataloader_num_workers: int = 8
-     # Choose between 'object' and 'style' - used for selecting the prompts for training
-     learnable_property: str = "object"
-     # How many times to repeat the training data
-     repeats: int = 100
-     # The resolution for input images; all images in the train/validation dataset will be resized to this resolution
-     resolution: int = 512
-     # Whether to center crop images before resizing to resolution
-     center_crop: bool = False
-
-
- @dataclass
- class ModelConfig:
-     """ Parameters for defining all models """
-     # Path to pretrained model or model identifier from huggingface.co/models
-     pretrained_model_name_or_path: str = "CompVis/stable-diffusion-v1-4"
-     # Whether to use our Nested Dropout technique
-     use_nested_dropout: bool = True
-     # Probability to apply nested dropout during training
-     nested_dropout_prob: float = 0.5
-     # Whether to normalize the norm of the mapper's output vector
-     normalize_mapper_output: bool = True
-     # Target norm for the mapper's output vector
-     target_norm: Optional[float] = None
-     # Whether to use positional encoding over the input to the mapper
-     use_positional_encoding: bool = True
-     # Sigmas used for computing positional encoding
-     pe_sigmas: Dict[str, float] = field(default_factory=lambda: {'sigma_t': 0.03, 'sigma_l': 2.0})
-     # Number of time anchors for computing our positional encodings
-     num_pe_time_anchors: int = 10
-     # Whether to output the textual bypass vector
-     output_bypass: bool = True
-     # Revision of pretrained model identifier from huggingface.co/models
-     revision: Optional[str] = None
-     # Whether training should be resumed from a previous checkpoint
-     mapper_checkpoint_path: Optional[Path] = None
-
-     def __post_init__(self):
-         if self.pe_sigmas is not None:
-             assert len(self.pe_sigmas) == 2, "Should provide exactly two sigma values: one for time and one for layers!"
-             self.pe_sigmas = PESigmas(sigma_t=self.pe_sigmas['sigma_t'], sigma_l=self.pe_sigmas['sigma_l'])
-
-
- @dataclass
- class EvalConfig:
-     """ Parameters for validation """
-     # A list of prompts that will be used during validation to verify that the model is learning
-     validation_prompts: List[str] = field(default_factory=lambda: VALIDATION_PROMPTS)
-     # Number of images that should be generated during validation with `validation_prompt`
-     num_validation_images: int = 4
-     # Seeds to use for generating the validation images
-     validation_seeds: Optional[List[int]] = field(default_factory=lambda: [42, 420, 501, 5456])
-     # Run validation every X steps
-     validation_steps: int = 100
-     # Number of denoising steps
-     num_denoising_steps: int = 50
-
-     def __post_init__(self):
-         if self.validation_seeds is None:
-             self.validation_seeds = list(range(self.num_validation_images))
-         assert len(self.validation_seeds) == self.num_validation_images, \
-             "Length of validation_seeds should equal num_validation_images"
-
-
- @dataclass
- class OptimConfig:
-     """ Parameters for the optimization process """
-     # Total number of training steps to perform
-     max_train_steps: Optional[int] = 1_000
-     # Learning rate
-     learning_rate: float = 1e-3
-     # Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size
-     scale_lr: bool = True
-     # Batch size (per device) for the training dataloader
-     train_batch_size: int = 2
-     # Whether or not to use gradient checkpointing to save memory at the expense of a slower backward pass
-     gradient_checkpointing: bool = False
-     # Number of update steps to accumulate before performing a backward/update pass
-     gradient_accumulation_steps: int = 4
-     # A seed for reproducible training
-     seed: Optional[int] = None
-     # The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",
-     # "constant", "constant_with_warmup"]
-     lr_scheduler: str = "constant"
-     # Number of steps for the warmup in the lr scheduler
-     lr_warmup_steps: int = 0
-     # The beta1 parameter for the Adam optimizer
-     adam_beta1: float = 0.9
-     # The beta2 parameter for the Adam optimizer
-     adam_beta2: float = 0.999
-     # Weight decay to use
-     adam_weight_decay: float = 1e-2
-     # Epsilon value for the Adam optimizer
-     adam_epsilon: float = 1e-08
-     # Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). bf16 requires PyTorch >= 1.10
-     # and an Nvidia Ampere GPU.
-     mixed_precision: str = "no"
-     # Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see
-     # https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
-     allow_tf32: bool = False
-
-
- @dataclass
- class RunConfig:
-     """ The main configuration for the coach trainer """
-     log: LogConfig = field(default_factory=LogConfig)
-     data: DataConfig = field(default_factory=DataConfig)
-     model: ModelConfig = field(default_factory=ModelConfig)
-     eval: EvalConfig = field(default_factory=EvalConfig)
-     optim: OptimConfig = field(default_factory=OptimConfig)
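
Since the scripts in this repo consume their configs through pyrallis (as src/scripts/inference.py below does), here is a minimal sketch of how this RunConfig would typically be parsed from the command line; the script name and flag values are illustrative only:

    import pyrallis

    from src.config import RunConfig

    @pyrallis.wrap()
    def main(cfg: RunConfig):
        # Nested dataclass fields map to dotted CLI flags, e.g.
        #   python train.py --log.exp_name cat_toy --data.train_data_dir ./data --data.placeholder_token "<cat>"
        print(cfg.log.exp_name, cfg.optim.learning_rate)

    if __name__ == '__main__':
        main()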
src/constants.py DELETED
@@ -1,83 +0,0 @@
- UNET_LAYERS = ['IN01', 'IN02', 'IN04', 'IN05', 'IN07', 'IN08', 'MID',
-                'OUT03', 'OUT04', 'OUT05', 'OUT06', 'OUT07', 'OUT08', 'OUT09', 'OUT10', 'OUT11']
-
- SD_INFERENCE_TIMESTEPS = [999, 979, 959, 939, 919, 899, 879, 859, 839, 819, 799, 779, 759, 739, 719, 699, 679, 659,
-                           639, 619, 599, 579, 559, 539, 519, 500, 480, 460, 440, 420, 400, 380, 360, 340, 320, 300,
-                           280, 260, 240, 220, 200, 180, 160, 140, 120, 100, 80, 60, 40, 20]
-
- PROMPTS = [
-     "A photo of a {}",
-     "A photo of {} in the jungle",
-     "A photo of {} on a beach",
-     "A photo of {} in Times Square",
-     "A photo of {} in the moon",
-     "A painting of {} in the style of Monet",
-     "Oil painting of {}",
-     "A Marc Chagall painting of {}",
-     "A manga drawing of {}",
-     "A watercolor painting of {}",
-     "A statue of {}",
-     "App icon of {}",
-     "A sand sculpture of {}",
-     "Colorful graffiti of {}",
-     "A photograph of two {} on a table",
- ]
-
- VALIDATION_PROMPTS = [
-     "A photo of a {}",
-     "A photo of a {} on a beach",
-     "App icon of {}",
-     "A painting of {} in the style of Monet",
- ]
-
- IMAGENET_TEMPLATES_SMALL = [
-     "a photo of a {}",
-     "a rendering of a {}",
-     "a cropped photo of the {}",
-     "the photo of a {}",
-     "a photo of a clean {}",
-     "a photo of a dirty {}",
-     "a dark photo of the {}",
-     "a photo of my {}",
-     "a photo of the cool {}",
-     "a close-up photo of a {}",
-     "a bright photo of the {}",
-     "a cropped photo of a {}",
-     "a photo of the {}",
-     "a good photo of the {}",
-     "a photo of one {}",
-     "a close-up photo of the {}",
-     "a rendition of the {}",
-     "a photo of the clean {}",
-     "a rendition of a {}",
-     "a photo of a nice {}",
-     "a good photo of a {}",
-     "a photo of the nice {}",
-     "a photo of the small {}",
-     "a photo of the weird {}",
-     "a photo of the large {}",
-     "a photo of a cool {}",
-     "a photo of a small {}",
- ]
-
- IMAGENET_STYLE_TEMPLATES_SMALL = [
-     "a painting in the style of {}",
-     "a rendering in the style of {}",
-     "a cropped painting in the style of {}",
-     "the painting in the style of {}",
-     "a clean painting in the style of {}",
-     "a dirty painting in the style of {}",
-     "a dark painting in the style of {}",
-     "a picture in the style of {}",
-     "a cool painting in the style of {}",
-     "a close-up painting in the style of {}",
-     "a bright painting in the style of {}",
-     "a cropped painting in the style of {}",
-     "a good painting in the style of {}",
-     "a close-up painting in the style of {}",
-     "a rendition in the style of {}",
-     "a nice painting in the style of {}",
-     "a small painting in the style of {}",
-     "a weird painting in the style of {}",
-     "a large painting in the style of {}",
- ]
src/models/__init__.py DELETED
File without changes
src/models/net_clip_text_embedding.py DELETED
@@ -1,60 +0,0 @@
- from typing import Optional, Tuple
-
- import torch
- from torch import nn
- from transformers import CLIPTextConfig
-
- from src.models.neti_mapper import NeTIMapper
- from src.utils.types import NeTIBatch
-
-
- class NeTICLIPTextEmbeddings(nn.Module):
-     """ Modification of CLIPTextEmbeddings to allow for the use of a NeTIMapper to overwrite the concept token. """
-
-     def __init__(self, config: CLIPTextConfig):
-         super().__init__()
-         embed_dim = config.hidden_size
-         self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
-         self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)
-         self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
-
-     def set_mapper(self, mapper: NeTIMapper):
-         self.mapper = mapper
-
-     def forward(self, input_ids: Optional[torch.LongTensor] = None,
-                 position_ids: Optional[torch.LongTensor] = None,
-                 inputs_embeds: Optional[torch.FloatTensor] = None,
-                 batch: Optional[NeTIBatch] = None) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
-
-         if batch is not None:
-             input_ids = batch.input_ids
-
-         seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
-
-         if position_ids is None:
-             position_ids = self.position_ids[:, :seq_length]
-
-         if inputs_embeds is None:
-             inputs_embeds = self.token_embedding(input_ids)
-
-         ####################################################################
-         # NeTI logic - Use mapper to overwrite the learnable token embedding
-         ####################################################################
-         bypass_outputs = None
-         if batch is not None:
-             mapper_outputs = self.mapper(timestep=batch.timesteps.float(),
-                                          unet_layer=batch.unet_layers.float(),
-                                          truncation_idx=batch.truncation_idx)
-             mapper_outputs = mapper_outputs.to(dtype=inputs_embeds.dtype, device=inputs_embeds.device)
-             if self.mapper.output_bypass:
-                 # The mapper emits two concatenated vectors: the word embedding and the bypass vector
-                 bypass_outputs = mapper_outputs[:, mapper_outputs.shape[1] // 2:]
-                 mapper_outputs = mapper_outputs[:, :mapper_outputs.shape[1] // 2]
-
-             # Overwrite the index of the placeholder token with the mapper output for each entry in the batch
-             learnable_idxs = (input_ids == batch.placeholder_token_id).nonzero(as_tuple=True)[1]
-             inputs_embeds[torch.arange(input_ids.shape[0]), learnable_idxs] = mapper_outputs
-
-         position_embeddings = self.position_embedding(position_ids)
-         embeddings = inputs_embeds + position_embeddings
-
-         return embeddings, bypass_outputs
src/models/neti_clip_text_encoder.py DELETED
@@ -1,160 +0,0 @@
- from typing import Optional, Tuple, Union
-
- import torch
- import torch.utils.checkpoint
- from torch import nn
- from transformers.modeling_outputs import BaseModelOutputWithPooling
- from transformers.models.clip.modeling_clip import CLIPTextConfig, CLIPTextModel, CLIPEncoder
- from transformers.models.clip.modeling_clip import CLIPTextTransformer, _expand_mask
-
- from src.models.net_clip_text_embedding import NeTICLIPTextEmbeddings
- from src.utils.types import NeTIBatch
-
-
- class NeTICLIPTextModel(CLIPTextModel):
-     """ Modification of CLIPTextModel to use our NeTI mapper for computing the embeddings of the concept. """
-
-     def __init__(self, config: CLIPTextConfig):
-         super().__init__(config)
-         self.text_model = NeTICLIPTextTransformer(config)
-         self.post_init()
-
-     def forward(self, input_ids: Optional[torch.Tensor] = None,
-                 attention_mask: Optional[torch.Tensor] = None,
-                 position_ids: Optional[torch.Tensor] = None,
-                 output_attentions: Optional[bool] = None,
-                 output_hidden_states: Optional[bool] = None,
-                 return_dict: Optional[bool] = None,
-                 batch: Optional[NeTIBatch] = None) -> Union[Tuple, BaseModelOutputWithPooling]:
-         return self.text_model.forward(
-             batch=batch,
-             input_ids=input_ids,
-             attention_mask=attention_mask,
-             position_ids=position_ids,
-             output_attentions=output_attentions,
-             output_hidden_states=output_hidden_states,
-             return_dict=return_dict,
-         )
-
-
- class NeTICLIPTextTransformer(CLIPTextTransformer):
-     """ Modification of CLIPTextTransformer to use our NeTI mapper for computing the embeddings of the concept. """
-
-     def __init__(self, config: CLIPTextConfig):
-         super().__init__(config=config)
-         self.config = config
-         embed_dim = config.hidden_size
-         self.embeddings = NeTICLIPTextEmbeddings(config)
-         self.encoder = CLIPEncoder(config)
-         self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
-
-     def forward(self, input_ids: Optional[torch.Tensor] = None,
-                 attention_mask: Optional[torch.Tensor] = None,
-                 position_ids: Optional[torch.Tensor] = None,
-                 output_attentions: Optional[bool] = None,
-                 output_hidden_states: Optional[bool] = None,
-                 return_dict: Optional[bool] = None,
-                 batch: Optional[NeTIBatch] = None) -> Union[Tuple, BaseModelOutputWithPooling]:
-
-         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-         output_hidden_states = (
-             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-         )
-         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-         bypass_output = None
-
-         if input_ids is not None:  # Regular embedding logic
-             input_shape = input_ids.size()
-             input_ids = input_ids.view(-1, input_shape[-1])
-             hidden_states, _ = self.embeddings(input_ids=input_ids, position_ids=position_ids)
-
-         ###########################
-         # NeTI logic
-         ###########################
-         elif batch is not None:
-             input_shape = batch.input_ids.size()
-             batch.input_ids = batch.input_ids.view(-1, input_shape[-1])
-             hidden_states, bypass_output = self.embeddings(batch=batch, position_ids=position_ids)
-
-         else:
-             raise ValueError("You have to specify either batch or input_ids!")
-
-         bsz, seq_len = input_shape
-         # CLIP's text model uses causal mask, prepare it here.
-         # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
-         causal_attention_mask = self._build_causal_attention_mask(bsz, seq_len, hidden_states.dtype).to(
-             hidden_states.device
-         )
-
-         # expand attention_mask
-         if attention_mask is not None:
-             # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-             attention_mask = _expand_mask(attention_mask, hidden_states.dtype)
-
-         encoder_outputs = self.encoder(
-             inputs_embeds=hidden_states,
-             attention_mask=attention_mask,
-             causal_attention_mask=causal_attention_mask,
-             output_attentions=output_attentions,
-             output_hidden_states=output_hidden_states,
-             return_dict=return_dict,
-         )
-
-         last_hidden_state = encoder_outputs[0]
-         last_hidden_state_with_bypass = last_hidden_state.clone()
-
-         ###############################################
-         # NeTI logic - compute the scaled bypass output
-         ###############################################
-         if bypass_output is not None:
-             learnable_idxs = (batch.input_ids == batch.placeholder_token_id).nonzero(as_tuple=True)[1]
-             existing_state = last_hidden_state_with_bypass[torch.arange(last_hidden_state.shape[0]), learnable_idxs]
-             bypass_output = bypass_output / bypass_output.norm(dim=1, keepdim=True) \
-                             * existing_state.norm(dim=1, keepdim=True)
-             new_state = existing_state + 0.2 * bypass_output
-             new_state = new_state.to(dtype=hidden_states.dtype)
-             last_hidden_state_with_bypass[torch.arange(last_hidden_state.shape[0]), learnable_idxs] = new_state
-
-         last_hidden_state = self.final_layer_norm(last_hidden_state)
-         last_hidden_state_with_bypass = self.final_layer_norm(last_hidden_state_with_bypass)
-
-         # text_embeds.shape = [batch_size, sequence_length, transformer.width]
-         # take features from the eot embedding (eot_token is the highest number in each sequence)
-         # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
-         if input_ids is not None:
-             pooled_output = last_hidden_state[
-                 torch.arange(last_hidden_state.shape[0]), input_ids.to(torch.int).argmax(dim=-1)
-             ]
-             pooled_output_with_bypass = last_hidden_state_with_bypass[
-                 torch.arange(last_hidden_state_with_bypass.shape[0]), input_ids.to(torch.int).argmax(dim=-1)
-             ]
-         elif batch is not None:
-             pooled_output = last_hidden_state[
-                 torch.arange(last_hidden_state.shape[0]), batch.input_ids.to(torch.int).argmax(dim=-1)
-             ]
-             pooled_output_with_bypass = last_hidden_state_with_bypass[
-                 torch.arange(last_hidden_state_with_bypass.shape[0]), batch.input_ids.to(torch.int).argmax(dim=-1)
-             ]
-         else:
-             raise ValueError("You have to specify either batch or input_ids!")
-
-         if bypass_output is not None:
-             return BaseModelOutputWithPooling(
-                 last_hidden_state=last_hidden_state,
-                 pooler_output=pooled_output,
-                 hidden_states=encoder_outputs.hidden_states,
-                 attentions=encoder_outputs.attentions,
-             ), BaseModelOutputWithPooling(
-                 last_hidden_state=last_hidden_state_with_bypass,
-                 pooler_output=pooled_output_with_bypass,
-                 hidden_states=encoder_outputs.hidden_states,
-                 attentions=encoder_outputs.attentions,
-             )
-         else:
-             return BaseModelOutputWithPooling(
-                 last_hidden_state=last_hidden_state,
-                 pooler_output=pooled_output,
-                 hidden_states=encoder_outputs.hidden_states,
-                 attentions=encoder_outputs.attentions,
-             ), None
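
Restated as a formula: for the placeholder token's hidden state h and the bypass vector b computed above, the code applies

    h_new = h + 0.2 * ||h|| * (b / ||b||)

i.e. the bypass output is rescaled to the norm of the existing hidden state and mixed in with a fixed 0.2 coefficient before the final layer norm, so it adjusts the token's representation without dominating it.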
src/models/neti_mapper.py DELETED
@@ -1,90 +0,0 @@
- import random
- from typing import Optional, List
-
- import torch
- import torch.nn.functional as F
- from torch import nn
-
- from src.constants import UNET_LAYERS
- from src.models.positional_encoding import NeTIPositionalEncoding, BasicEncoder
- from src.utils.types import PESigmas
-
-
- class NeTIMapper(nn.Module):
-     """ Main logic of our NeTI mapper. """
-
-     def __init__(self, output_dim: int = 768,
-                  unet_layers: List[str] = UNET_LAYERS,
-                  use_nested_dropout: bool = True,
-                  nested_dropout_prob: float = 0.5,
-                  norm_scale: Optional[torch.Tensor] = None,
-                  use_positional_encoding: bool = True,
-                  num_pe_time_anchors: int = 10,
-                  pe_sigmas: PESigmas = PESigmas(sigma_t=0.03, sigma_l=2.0),
-                  output_bypass: bool = True):
-         super().__init__()
-         self.use_nested_dropout = use_nested_dropout
-         self.nested_dropout_prob = nested_dropout_prob
-         self.norm_scale = norm_scale
-         self.output_bypass = output_bypass
-         if self.output_bypass:
-             output_dim *= 2  # Output two vectors
-
-         self.use_positional_encoding = use_positional_encoding
-         if self.use_positional_encoding:
-             self.encoder = NeTIPositionalEncoding(sigma_t=pe_sigmas.sigma_t, sigma_l=pe_sigmas.sigma_l).cuda()
-             self.input_dim = num_pe_time_anchors * len(unet_layers)
-         else:
-             self.encoder = BasicEncoder().cuda()
-             self.input_dim = 2
-
-         self.set_net(num_unet_layers=len(unet_layers),
-                      num_time_anchors=num_pe_time_anchors,
-                      output_dim=output_dim)
-
-     def set_net(self, num_unet_layers: int, num_time_anchors: int, output_dim: int = 768):
-         self.input_layer = self.set_input_layer(num_unet_layers, num_time_anchors)
-         self.net = nn.Sequential(self.input_layer,
-                                  nn.Linear(self.input_dim, 128), nn.LayerNorm(128), nn.LeakyReLU(),
-                                  nn.Linear(128, 128), nn.LayerNorm(128), nn.LeakyReLU())
-         self.output_layer = nn.Sequential(nn.Linear(128, output_dim))
-
-     def set_input_layer(self, num_unet_layers: int, num_time_anchors: int) -> nn.Module:
-         if self.use_positional_encoding:
-             input_layer = nn.Linear(self.encoder.num_w * 2, self.input_dim)
-             input_layer.weight.data = self.encoder.init_layer(num_time_anchors, num_unet_layers)
-         else:
-             input_layer = nn.Identity()
-         return input_layer
-
-     def forward(self, timestep: torch.Tensor, unet_layer: torch.Tensor,
-                 truncation_idx: Optional[int] = None) -> torch.Tensor:
-         embedding = self.extract_hidden_representation(timestep, unet_layer)
-         if self.use_nested_dropout:
-             embedding = self.apply_nested_dropout(embedding, truncation_idx=truncation_idx)
-         embedding = self.get_output(embedding)
-         return embedding
-
-     def get_encoded_input(self, timestep: torch.Tensor, unet_layer: torch.Tensor) -> torch.Tensor:
-         return self.encoder.encode(timestep, unet_layer)
-
-     def extract_hidden_representation(self, timestep: torch.Tensor, unet_layer: torch.Tensor) -> torch.Tensor:
-         encoded_input = self.get_encoded_input(timestep, unet_layer)
-         embedding = self.net(encoded_input)
-         return embedding
-
-     def apply_nested_dropout(self, embedding: torch.Tensor, truncation_idx: Optional[int] = None) -> torch.Tensor:
-         if self.training:
-             if random.random() < self.nested_dropout_prob:
-                 dropout_idxs = torch.randint(low=0, high=embedding.shape[1], size=(embedding.shape[0],))
-                 for idx in torch.arange(embedding.shape[0]):
-                     embedding[idx][dropout_idxs[idx]:] = 0
-         if not self.training and truncation_idx is not None:
-             for idx in torch.arange(embedding.shape[0]):
-                 embedding[idx][truncation_idx:] = 0
-         return embedding
-
-     def get_output(self, embedding: torch.Tensor) -> torch.Tensor:
-         embedding = self.output_layer(embedding)
-         if self.norm_scale is not None:
-             embedding = F.normalize(embedding, dim=-1) * self.norm_scale
-         return embedding
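
A minimal sketch of querying the mapper directly (a CUDA device is assumed, since the encoders above are created with .cuda()); the timestep and layer values are arbitrary examples:

    import torch

    from src.models.neti_mapper import NeTIMapper

    mapper = NeTIMapper(output_dim=768).cuda().eval()
    timesteps = torch.tensor([999.0, 500.0])  # denoising timesteps
    unet_layers = torch.tensor([0.0, 7.0])    # indices into constants.UNET_LAYERS
    with torch.no_grad():
        out = mapper(timestep=timesteps, unet_layer=unet_layers)
    # (2, 1536): a 768-d word embedding plus a 768-d bypass vector per entry, since output_bypass=True
    print(out.shape)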
src/models/positional_encoding.py DELETED
@@ -1,57 +0,0 @@
- from typing import Union
-
- import torch
- from torch import nn
-
-
- class NeTIPositionalEncoding(nn.Module):
-
-     def __init__(self, sigma_t: float, sigma_l: float, num_w: int = 1024):
-         super().__init__()
-         self.sigma_t = sigma_t
-         self.sigma_l = sigma_l
-         self.num_w = num_w
-         self.w = torch.randn((num_w, 2))
-         self.w[:, 0] *= sigma_t
-         self.w[:, 1] *= sigma_l
-         self.w = nn.Parameter(self.w).cuda()
-
-     def encode(self, t: Union[int, torch.Tensor], l: Union[int, torch.Tensor]):
-         """ Maps the given time and layer input into a 2048-dimensional vector. """
-         if type(t) == int or t.ndim == 0:
-             x = torch.tensor([t, l]).float()
-         else:
-             x = torch.stack([t, l], dim=1).T
-         x = x.cuda()
-         v = torch.cat([torch.sin(self.w.detach() @ x), torch.cos(self.w.detach() @ x)])
-         if type(t) == int:
-             v_norm = v / v.norm()
-         else:
-             v_norm = v / v.norm(dim=0)
-             v_norm = v_norm.T
-         return v_norm
-
-     def init_layer(self, num_time_anchors: int, num_layers: int) -> torch.Tensor:
-         """ Computes the weights for the positional encoding layer of size 160x2048. """
-         anchor_vectors = []
-         for t_anchor in range(0, 1000, 1000 // num_time_anchors):
-             for l_anchor in range(0, num_layers):
-                 anchor_vectors.append(self.encode(t_anchor, l_anchor).float())
-         A = torch.stack(anchor_vectors)
-         return A
-
-
- class BasicEncoder(nn.Module):
-     """ Simply normalizes the given timestep and unet layer to be between -1 and 1. """
-
-     def __init__(self, num_denoising_timesteps: int = 1000, num_unet_layers: int = 16):
-         super().__init__()
-         self.normalized_timesteps = (torch.arange(num_denoising_timesteps) / (num_denoising_timesteps - 1)) * 2 - 1
-         self.normalized_unet_layers = (torch.arange(num_unet_layers) / (num_unet_layers - 1)) * 2 - 1
-         self.normalized_timesteps = nn.Parameter(self.normalized_timesteps).cuda()
-         self.normalized_unet_layers = nn.Parameter(self.normalized_unet_layers).cuda()
-
-     def encode(self, timestep: torch.Tensor, unet_layer: torch.Tensor) -> torch.Tensor:
-         normalized_input = torch.stack([self.normalized_timesteps[timestep.long()],
-                                         self.normalized_unet_layers[unet_layer.long()]]).T
-         return normalized_input
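
In other words, NeTIPositionalEncoding is a random Fourier-features map: with W in R^(1024 x 2), whose two columns are Gaussian with scales sigma_t and sigma_l, an input x = (t, l) is encoded as

    v(t, l) = [sin(Wx); cos(Wx)] / ||[sin(Wx); cos(Wx)]||

a unit-norm vector of dimension 2 * num_w = 2048. init_layer then evaluates this map on a grid of 10 timestep anchors x 16 U-Net layers to produce the 160x2048 matrix used to initialize the mapper's input projection.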
src/models/xti_attention_processor.py DELETED
@@ -1,57 +0,0 @@
- from typing import Dict, Optional
-
- import torch
- from diffusers.models.cross_attention import CrossAttention
-
-
- class XTIAttenProc:
-
-     def __call__(self, attn: CrossAttention,
-                  hidden_states: torch.Tensor,
-                  encoder_hidden_states: Optional[Dict[str, torch.Tensor]] = None,
-                  attention_mask: Optional[torch.Tensor] = None):
-
-         _ehs_bypass = None
-         if encoder_hidden_states is not None:
-             if isinstance(encoder_hidden_states, dict):
-                 # Each cross-attention call reads its own per-layer context tensor,
-                 # cycling through the 16 layers defined in constants.UNET_LAYERS
-                 this_idx = encoder_hidden_states["this_idx"]
-                 _ehs = encoder_hidden_states[f"CONTEXT_TENSOR_{this_idx}"]
-                 if f"CONTEXT_TENSOR_BYPASS_{this_idx}" in encoder_hidden_states:
-                     _ehs_bypass = encoder_hidden_states[f"CONTEXT_TENSOR_BYPASS_{this_idx}"]
-                 encoder_hidden_states["this_idx"] += 1
-                 encoder_hidden_states["this_idx"] %= 16
-             else:
-                 _ehs = encoder_hidden_states
-         else:
-             _ehs = None
-
-         batch_size, sequence_length, _ = (hidden_states.shape if _ehs is None else _ehs.shape)
-         attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
-         query = attn.to_q(hidden_states)
-
-         if _ehs is None:
-             _ehs = hidden_states
-         elif attn.cross_attention_norm:
-             _ehs = attn.norm_cross(_ehs)
-             _ehs_bypass = attn.norm_cross(_ehs_bypass)
-
-         # Keys come from the regular context; values come from the bypass context when one is provided
-         key = attn.to_k(_ehs)
-         if _ehs_bypass is not None:
-             value = attn.to_v(_ehs_bypass)
-         else:
-             value = attn.to_v(_ehs)
-
-         query = attn.head_to_batch_dim(query)
-         key = attn.head_to_batch_dim(key)
-         value = attn.head_to_batch_dim(value)
-
-         attention_probs = attn.get_attention_scores(query, key, attention_mask)
-         hidden_states = torch.bmm(attention_probs, value)
-         hidden_states = attn.batch_to_head_dim(hidden_states)
-
-         # linear proj
-         hidden_states = attn.to_out[0](hidden_states)
-         # dropout
-         hidden_states = attn.to_out[1](hidden_states)
-
-         return hidden_states
src/prompt_manager.py DELETED
@@ -1,63 +0,0 @@
- from typing import Optional, List, Dict, Any
-
- import torch
- from tqdm import tqdm
- from transformers import CLIPTokenizer
-
- from src import constants
- from src.models.neti_clip_text_encoder import NeTICLIPTextModel
- from src.utils.types import NeTIBatch
-
-
- class PromptManager:
-     """ Class for computing all time and space embeddings for a given prompt. """
-
-     def __init__(self, tokenizer: CLIPTokenizer,
-                  text_encoder: NeTICLIPTextModel,
-                  timesteps: List[int] = constants.SD_INFERENCE_TIMESTEPS,
-                  unet_layers: List[str] = constants.UNET_LAYERS,
-                  placeholder_token_id: Optional[List] = None,
-                  placeholder_token: Optional[List] = None,
-                  torch_dtype: torch.dtype = torch.float32):
-         self.tokenizer = tokenizer
-         self.text_encoder = text_encoder
-         self.timesteps = timesteps
-         self.unet_layers = unet_layers
-         self.placeholder_token = placeholder_token
-         self.placeholder_token_id = placeholder_token_id
-         self.dtype = torch_dtype
-
-     def embed_prompt(self, text: str,
-                      truncation_idx: Optional[int] = None,
-                      num_images_per_prompt: int = 1) -> List[Dict[str, Any]]:
-         """
-         Compute the conditioning vectors for the given prompt. We assume that the prompt is defined using `{}`
-         for indicating where to place the placeholder token string. See constants.VALIDATION_PROMPTS for examples.
-         """
-         text = text.format(self.placeholder_token)
-         ids = self.tokenizer(
-             text,
-             padding="max_length",
-             max_length=self.tokenizer.model_max_length,
-             return_tensors="pt",
-         ).input_ids
-
-         # Compute embeddings for each timestep and each U-Net layer
-         print(f"Computing embeddings over {len(self.timesteps)} timesteps and {len(self.unet_layers)} U-Net layers.")
-         hidden_states_per_timestep = []
-         for timestep in tqdm(self.timesteps):
-             _hs = {"this_idx": 0}
-             for layer_idx, unet_layer in enumerate(self.unet_layers):
-                 batch = NeTIBatch(input_ids=ids.to(device=self.text_encoder.device),
-                                   timesteps=timestep.unsqueeze(0).to(device=self.text_encoder.device),
-                                   unet_layers=torch.tensor(layer_idx, device=self.text_encoder.device).unsqueeze(0),
-                                   placeholder_token_id=self.placeholder_token_id,
-                                   truncation_idx=truncation_idx)
-                 layer_hs, layer_hs_bypass = self.text_encoder(batch=batch)
-                 layer_hs = layer_hs[0].to(dtype=self.dtype)
-                 _hs[f"CONTEXT_TENSOR_{layer_idx}"] = layer_hs.repeat(num_images_per_prompt, 1, 1)
-                 if layer_hs_bypass is not None:
-                     layer_hs_bypass = layer_hs_bypass[0].to(dtype=self.dtype)
-                     _hs[f"CONTEXT_TENSOR_BYPASS_{layer_idx}"] = layer_hs_bypass.repeat(num_images_per_prompt, 1, 1)
-             hidden_states_per_timestep.append(_hs)
-         print("Done.")
-         return hidden_states_per_timestep
src/scripts/__init__.py DELETED
File without changes
src/scripts/inference.py DELETED
@@ -1,170 +0,0 @@
- import sys
- from dataclasses import dataclass, field
- from pathlib import Path
- from typing import Optional, List, Tuple, Union
-
- import numpy as np
- import pyrallis
- import torch
- from PIL import Image
- from diffusers import DPMSolverMultistepScheduler, StableDiffusionPipeline
- from transformers import CLIPTokenizer
-
- sys.path.append(".")
- sys.path.append("..")
-
- from src import constants
- from src.models.neti_clip_text_encoder import NeTICLIPTextModel
- from src.models.neti_mapper import NeTIMapper
- from src.prompt_manager import PromptManager
- from src.sd_pipeline_call import sd_pipeline_call
- from src.models.xti_attention_processor import XTIAttenProc
- from src.checkpoint_handler import CheckpointHandler
- from src.utils import vis_utils
-
-
- @dataclass
- class InferenceConfig:
-     # Specifies which checkpoint iteration we want to load
-     iteration: Optional[int] = None
-     # The input directory containing the saved models and embeddings
-     input_dir: Optional[Path] = None
-     # Where to save the inference results
-     inference_dir: Optional[Path] = None
-     # Specific path to the mapper you want to load, overrides `input_dir`
-     mapper_checkpoint_path: Optional[Path] = None
-     # Specific path to the embeddings you want to load, overrides `input_dir`
-     learned_embeds_path: Optional[Path] = None
-     # List of prompts to run inference on
-     prompts: Optional[List[str]] = None
-     # Text file containing prompts to run inference on (one prompt per line), overrides `prompts`
-     prompts_file_path: Optional[Path] = None
-     # List of random seeds to run on
-     seeds: List[int] = field(default_factory=lambda: [42])
-     # If you want to run with dropout at inference time, this specifies the truncation indices for applying dropout.
-     # None indicates that no dropout will be performed. If a list of indices is provided, will run all indices.
-     truncation_idxs: Optional[Union[int, List[int]]] = None
-     # Whether to run with torch.float16 or torch.float32
-     torch_dtype: str = "fp16"
-
-     def __post_init__(self):
-         assert bool(self.prompts) != bool(self.prompts_file_path), \
-             "You must provide either prompts or prompts_file_path, but not both!"
-         self._set_prompts()
-         self._set_input_paths()
-         self.inference_dir.mkdir(exist_ok=True, parents=True)
-         if type(self.truncation_idxs) == int:
-             self.truncation_idxs = [self.truncation_idxs]
-         if self.truncation_idxs is None:
-             self.truncation_idxs = [None]
-         self.torch_dtype = torch.float16 if self.torch_dtype == "fp16" else torch.float32
-
-     def _set_input_paths(self):
-         if self.inference_dir is None:
-             assert self.input_dir is not None, "You must pass an input_dir if you do not specify inference_dir"
-             self.inference_dir = self.input_dir / f"inference_{self.iteration}"
-         if self.mapper_checkpoint_path is None:
-             assert self.input_dir is not None, "You must pass an input_dir if you do not specify mapper_checkpoint_path"
-             self.mapper_checkpoint_path = self.input_dir / f"mapper-steps-{self.iteration}.pt"
-         if self.learned_embeds_path is None:
-             assert self.input_dir is not None, "You must pass an input_dir if you do not specify learned_embeds_path"
-             self.learned_embeds_path = self.input_dir / f"learned_embeds-steps-{self.iteration}.bin"
-
-     def _set_prompts(self):
-         if self.prompts_file_path is not None:
-             assert self.prompts_file_path.exists(), f"Prompts file {self.prompts_file_path} does not exist!"
-             self.prompts = self.prompts_file_path.read_text().splitlines()
-
-
- @pyrallis.wrap()
- def main(infer_cfg: InferenceConfig):
-     train_cfg, mapper = CheckpointHandler.load_mapper(infer_cfg.mapper_checkpoint_path)
-     pipeline, placeholder_token, placeholder_token_id = load_stable_diffusion_model(
-         pretrained_model_name_or_path=train_cfg.model.pretrained_model_name_or_path,
-         mapper=mapper,
-         learned_embeds_path=infer_cfg.learned_embeds_path,
-         torch_dtype=infer_cfg.torch_dtype
-     )
-     prompt_manager = PromptManager(tokenizer=pipeline.tokenizer,
-                                    text_encoder=pipeline.text_encoder,
-                                    timesteps=pipeline.scheduler.timesteps,
-                                    unet_layers=constants.UNET_LAYERS,
-                                    placeholder_token=placeholder_token,
-                                    placeholder_token_id=placeholder_token_id,
-                                    torch_dtype=infer_cfg.torch_dtype)
-     for prompt in infer_cfg.prompts:
-         output_path = infer_cfg.inference_dir / prompt.format(placeholder_token)
-         output_path.mkdir(exist_ok=True, parents=True)
-         for truncation_idx in infer_cfg.truncation_idxs:
-             print(f"Running with truncation index: {truncation_idx}")
-             prompt_image = run_inference(prompt=prompt,
-                                          pipeline=pipeline,
-                                          prompt_manager=prompt_manager,
-                                          seeds=infer_cfg.seeds,
-                                          output_path=output_path,
-                                          num_images_per_prompt=1,
-                                          truncation_idx=truncation_idx)
-             if truncation_idx is not None:
-                 save_name = f"{prompt.format(placeholder_token)}_truncation_{truncation_idx}.png"
-             else:
-                 save_name = f"{prompt.format(placeholder_token)}.png"
-             prompt_image.save(infer_cfg.inference_dir / save_name)
-
-
- def run_inference(prompt: str,
-                   pipeline: StableDiffusionPipeline,
-                   prompt_manager: PromptManager,
-                   seeds: List[int],
-                   output_path: Optional[Path] = None,
-                   num_images_per_prompt: int = 1,
-                   truncation_idx: Optional[int] = None) -> Image.Image:
-     with torch.autocast("cuda"):
-         with torch.no_grad():
-             prompt_embeds = prompt_manager.embed_prompt(prompt,
-                                                         num_images_per_prompt=num_images_per_prompt,
-                                                         truncation_idx=truncation_idx)
-     joined_images = []
-     for seed in seeds:
-         generator = torch.Generator(device='cuda').manual_seed(seed)
-         images = sd_pipeline_call(pipeline,
-                                   prompt_embeds=prompt_embeds,
-                                   generator=generator,
-                                   num_images_per_prompt=num_images_per_prompt).images
-         seed_image = Image.fromarray(np.concatenate(images, axis=1)).convert("RGB")
-         if output_path is not None:
-             save_name = f'{seed}_truncation_{truncation_idx}.png' if truncation_idx is not None else f'{seed}.png'
-             seed_image.save(output_path / save_name)
-         joined_images.append(seed_image)
-     joined_image = vis_utils.get_image_grid(joined_images)
-     return joined_image
-
-
- def load_stable_diffusion_model(pretrained_model_name_or_path: str,
-                                 learned_embeds_path: Path,
-                                 mapper: Optional[NeTIMapper] = None,
-                                 num_denoising_steps: int = 50,
-                                 torch_dtype: torch.dtype = torch.float16) -> Tuple[StableDiffusionPipeline, str, int]:
-     tokenizer = CLIPTokenizer.from_pretrained(
-         pretrained_model_name_or_path, subfolder="tokenizer")
-     text_encoder = NeTICLIPTextModel.from_pretrained(
-         pretrained_model_name_or_path, subfolder="text_encoder", torch_dtype=torch_dtype,
-     )
-     if mapper is not None:
-         text_encoder.text_model.embeddings.set_mapper(mapper)
-     placeholder_token, placeholder_token_id = CheckpointHandler.load_learned_embed_in_clip(
-         learned_embeds_path=learned_embeds_path,
-         text_encoder=text_encoder,
-         tokenizer=tokenizer
-     )
-     pipeline = StableDiffusionPipeline.from_pretrained(
-         pretrained_model_name_or_path,
-         torch_dtype=torch_dtype,
-         text_encoder=text_encoder,
-         tokenizer=tokenizer
-     ).to("cuda")
-     pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
-     pipeline.scheduler.set_timesteps(num_denoising_steps, device=pipeline.device)
-     pipeline.unet.set_attn_processor(XTIAttenProc())
-     return pipeline, placeholder_token, placeholder_token_id
-
-
- if __name__ == '__main__':
-     main()
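
An illustrative invocation of this script (directory and file names hypothetical; pyrallis maps the config fields to CLI flags):

    python src/scripts/inference.py \
        --input_dir outputs/my_run \
        --iteration 1000 \
        --prompts_file_path prompts.txt \
        --torch_dtype fp16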
src/sd_pipeline_call.py DELETED
@@ -1,146 +0,0 @@
- from typing import Any, Callable, Dict, List, Optional, Union
-
- import torch
- from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionPipeline
-
-
- @torch.no_grad()
- def sd_pipeline_call(
-         pipeline: StableDiffusionPipeline,
-         prompt_embeds: torch.FloatTensor,
-         height: Optional[int] = None,
-         width: Optional[int] = None,
-         num_inference_steps: int = 50,
-         guidance_scale: float = 7.5,
-         negative_prompt: Optional[Union[str, List[str]]] = None,
-         num_images_per_prompt: Optional[int] = 1,
-         eta: float = 0.0,
-         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-         latents: Optional[torch.FloatTensor] = None,
-         output_type: Optional[str] = "pil",
-         return_dict: bool = True,
-         callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
-         callback_steps: int = 1,
-         cross_attention_kwargs: Optional[Dict[str, Any]] = None):
-     """ Modification of the standard SD pipeline call to support NeTI embeddings passed with the prompt_embeds argument. """
-
-     # 0. Default height and width to unet
-     height = height or pipeline.unet.config.sample_size * pipeline.vae_scale_factor
-     width = width or pipeline.unet.config.sample_size * pipeline.vae_scale_factor
-
-     # 2. Define call parameters
-     batch_size = 1
-     device = pipeline._execution_device
-
-     neg_prompt = get_neg_prompt_input_ids(pipeline, negative_prompt)
-     negative_prompt_embeds, _ = pipeline.text_encoder(
-         input_ids=neg_prompt.input_ids.to(device),
-         attention_mask=None,
-     )
-     negative_prompt_embeds = negative_prompt_embeds[0]
-
-     # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
-     # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
-     # corresponds to doing no classifier free guidance.
-     do_classifier_free_guidance = guidance_scale > 1.0
-
-     # 4. Prepare timesteps
-     pipeline.scheduler.set_timesteps(num_inference_steps, device=device)
-     timesteps = pipeline.scheduler.timesteps
-
-     # 5. Prepare latent variables
-     num_channels_latents = pipeline.unet.in_channels
-     latents = pipeline.prepare_latents(
-         batch_size * num_images_per_prompt,
-         num_channels_latents,
-         height,
-         width,
-         pipeline.text_encoder.dtype,
-         device,
-         generator,
-         latents,
-     )
-
-     # 6. Prepare extra step kwargs.
-     extra_step_kwargs = pipeline.prepare_extra_step_kwargs(generator, eta)
-
-     # 7. Denoising loop
-     num_warmup_steps = len(timesteps) - num_inference_steps * pipeline.scheduler.order
-     with pipeline.progress_bar(total=num_inference_steps) as progress_bar:
-         for i, t in enumerate(timesteps):
-             latent_model_input = pipeline.scheduler.scale_model_input(latents, t)
-
-             if do_classifier_free_guidance:
-                 # predict the unconditional noise residual
-                 noise_pred_uncond = pipeline.unet(
-                     latent_model_input,
-                     t,
-                     encoder_hidden_states=negative_prompt_embeds.repeat(num_images_per_prompt, 1, 1),
-                     cross_attention_kwargs=cross_attention_kwargs,
-                 ).sample
-
-             ###############################################################
-             # NeTI logic: use the prompt embedding for the current timestep
-             ###############################################################
-             embed = prompt_embeds[i] if type(prompt_embeds) == list else prompt_embeds
-             noise_pred_text = pipeline.unet(
-                 latent_model_input,
-                 t,
-                 encoder_hidden_states=embed,
-                 cross_attention_kwargs=cross_attention_kwargs,
-             ).sample
-
-             # perform guidance
-             if do_classifier_free_guidance:
-                 noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-             else:
-                 noise_pred = noise_pred_text
-
-             # compute the previous noisy sample x_t -> x_t-1
-             latents = pipeline.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
-
-             # call the callback, if provided
-             if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % pipeline.scheduler.order == 0):
-                 progress_bar.update()
-                 if callback is not None and i % callback_steps == 0:
-                     callback(i, t, latents)
-
-     if output_type == "latent":
-         image = latents
-         has_nsfw_concept = None
-     elif output_type == "pil":
-         # 8. Post-processing
-         image = pipeline.decode_latents(latents)
-         # 9. Run safety checker
-         image, has_nsfw_concept = pipeline.run_safety_checker(image, device, pipeline.text_encoder.dtype)
-         # 10. Convert to PIL
-         image = pipeline.numpy_to_pil(image)
-     else:
-         # 8. Post-processing
-         image = pipeline.decode_latents(latents)
-         # 9. Run safety checker
-         image, has_nsfw_concept = pipeline.run_safety_checker(image, device, pipeline.text_encoder.dtype)
-
-     # Offload last model to CPU
-     if hasattr(pipeline, "final_offload_hook") and pipeline.final_offload_hook is not None:
-         pipeline.final_offload_hook.offload()
-
-     if not return_dict:
-         return image, has_nsfw_concept
-
-     return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
-
-
- def get_neg_prompt_input_ids(pipeline: StableDiffusionPipeline,
-                              negative_prompt: Optional[Union[str, List[str]]] = None):
-     if negative_prompt is None:
-         negative_prompt = ""
-     uncond_tokens = [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
-     uncond_input = pipeline.tokenizer(
-         uncond_tokens,
-         padding="max_length",
-         max_length=pipeline.tokenizer.model_max_length,
-         truncation=True,
-         return_tensors="pt",
-     )
-     return uncond_input
src/utils/__init__.py DELETED
File without changes
src/utils/types.py DELETED
@@ -1,20 +0,0 @@
- import enum
- from dataclasses import dataclass
- from typing import Optional
-
- import torch
-
-
- @dataclass
- class NeTIBatch:
-     input_ids: torch.Tensor
-     placeholder_token_id: int
-     timesteps: torch.Tensor
-     unet_layers: torch.Tensor
-     truncation_idx: Optional[int] = None
-
-
- @dataclass
- class PESigmas:
-     sigma_t: float
-     sigma_l: float
src/utils/vis_utils.py DELETED
@@ -1,17 +0,0 @@
- import math
- from typing import List
-
- from PIL import Image
-
-
- def get_image_grid(images: List[Image.Image]) -> Image.Image:
-     num_images = len(images)
-     cols = int(math.ceil(math.sqrt(num_images)))
-     rows = int(math.ceil(num_images / cols))
-     width, height = images[0].size
-     grid_image = Image.new('RGB', (cols * width, rows * height))
-     for i, img in enumerate(images):
-         x = i % cols
-         y = i // cols
-         grid_image.paste(img, (x * width, y * height))
-     return grid_image