Add demo
- README.md +3 -3
- gradio_app.py +186 -0
- requirements.txt +10 -0
- src/__init__.py +0 -0
- src/checkpoint_handler.py +107 -0
- src/config.py +146 -0
- src/constants.py +83 -0
- src/models/__init__.py +0 -0
- src/models/net_clip_text_embedding.py +60 -0
- src/models/neti_clip_text_encoder.py +160 -0
- src/models/neti_mapper.py +90 -0
- src/models/positional_encoding.py +57 -0
- src/models/xti_attention_processor.py +57 -0
- src/prompt_manager.py +63 -0
- src/scripts/__init__.py +0 -0
- src/scripts/inference.py +170 -0
- src/sd_pipeline_call.py +146 -0
- src/utils/__init__.py +0 -0
- src/utils/types.py +20 -0
- src/utils/vis_utils.py +17 -0
- style.css +3 -0
README.md
CHANGED
@@ -1,11 +1,11 @@
 ---
 title: NeTI
 emoji: π
-colorFrom:
-colorTo:
+colorFrom: indigo
+colorTo: yellow
 sdk: gradio
 sdk_version: 3.32.0
-app_file:
+app_file: gradio_app.py
 pinned: false
 license: mit
 ---
gradio_app.py
ADDED
@@ -0,0 +1,186 @@
import sys
from pathlib import Path
from typing import List, Optional

import gradio as gr
import torch
from PIL import Image
from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
from huggingface_hub import snapshot_download
from transformers import CLIPTokenizer

from src import constants
from src.checkpoint_handler import CheckpointHandler
from src.models.neti_clip_text_encoder import NeTICLIPTextModel
from src.models.xti_attention_processor import XTIAttenProc
from src.prompt_manager import PromptManager
from src.scripts.inference import run_inference

sys.path.append(".")
sys.path.append("..")

DESCRIPTION = '''
# A Neural Space-Time Representation for Text-to-Image Personalization
<p style="text-align: center;">
This is a demo for our <a href="https://arxiv.org/abs/2305.15391">paper</a>: ''A Neural Space-Time Representation
for Text-to-Image Personalization''.
<br>
Project page and code is available <a href="https://neuraltextualinversion.github.io/NeTI/">here</a>.
<br>
We introduce a new text-conditioning latent space P* that is dependent on both the denoising process timestep and
the U-Net layers.
This space-time representation is learned implicitly via a small mapping network.
<br>
Here, you can generate images using one of the concepts trained in our paper. Simply select your concept and
random seed.
<br>
You can also choose different truncation values to play with the reconstruction vs. editability of the concept.
</p>
'''

CONCEPT_TO_PLACEHOLDER = {
    'barn': '<barn>',
    'cat': '<cat>',
    'clock': '<clock>',
    'colorful_teapot': '<colorful-teapot>',
    'dangling_child': '<dangling-child>',
    'dog': '<dog>',
    'elephant': '<elephant>',
    'fat_stone_bird': '<stone-bird>',
    'headless_statue': '<headless-statue>',
    'lecun': '<lecun>',
    'maeve': '<maeve-dog>',
    'metal_bird': '<metal-bird>',
    'mugs_skulls': '<mug-skulls>',
    'rainbow_cat': '<rainbow-cat>',
    'red_bowl': '<red-bowl>',
    'teddybear': '<teddybear>',
    'tortoise_plushy': '<tortoise-plushy>',
    'wooden_pot': '<wooden-pot>'
}

MODELS_PATH = Path('./trained_models')
MODELS_PATH.mkdir(parents=True, exist_ok=True)


def load_stable_diffusion_model(pretrained_model_name_or_path: str,
                                num_denoising_steps: int = 50,
                                torch_dtype: torch.dtype = torch.float16) -> StableDiffusionPipeline:
    tokenizer = CLIPTokenizer.from_pretrained(
        pretrained_model_name_or_path, subfolder="tokenizer")
    text_encoder = NeTICLIPTextModel.from_pretrained(
        pretrained_model_name_or_path, subfolder="text_encoder", torch_dtype=torch_dtype,
    )
    pipeline = StableDiffusionPipeline.from_pretrained(
        pretrained_model_name_or_path,
        torch_dtype=torch_dtype,
        text_encoder=text_encoder,
        tokenizer=tokenizer
    ).to("cuda")
    pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
    pipeline.scheduler.set_timesteps(num_denoising_steps, device=pipeline.device)
    pipeline.unet.set_attn_processor(XTIAttenProc())
    return pipeline


def get_possible_concepts() -> List[str]:
    objects = [x for x in MODELS_PATH.iterdir() if x.is_dir()]
    return [x.name for x in objects]


def load_sd_and_all_tokens():
    mappers = {}
    pipeline = load_stable_diffusion_model(pretrained_model_name_or_path="CompVis/stable-diffusion-v1-4")
    print("Downloading all models from HF Hub...")
    snapshot_download(repo_id="neural-ti/NeTI", local_dir='./trained_models')
    print("Done.")
    concepts = get_possible_concepts()
    for concept in concepts:
        print(f"Loading model for concept: {concept}")
        learned_embeds_path = MODELS_PATH / concept / f"{concept}-learned_embeds.bin"
        mapper_path = MODELS_PATH / concept / f"{concept}-mapper.pt"
        train_cfg, mapper = CheckpointHandler.load_mapper(mapper_path=mapper_path)
        placeholder_token, placeholder_token_id = CheckpointHandler.load_learned_embed_in_clip(
            learned_embeds_path=learned_embeds_path,
            text_encoder=pipeline.text_encoder,
            tokenizer=pipeline.tokenizer
        )
        mappers[concept] = {
            "mapper": mapper,
            "placeholder_token": placeholder_token,
            "placeholder_token_id": placeholder_token_id
        }
    return mappers, pipeline


mappers, pipeline = load_sd_and_all_tokens()


def main_pipeline(concept_name: str,
                  prompt_input: str,
                  seed: int,
                  use_truncation: bool = False,
                  truncation_idx: Optional[int] = None) -> Image.Image:
    pipeline.text_encoder.text_model.embeddings.set_mapper(mappers[concept_name]["mapper"])
    placeholder_token = mappers[concept_name]["placeholder_token"]
    placeholder_token_id = mappers[concept_name]["placeholder_token_id"]
    prompt_manager = PromptManager(tokenizer=pipeline.tokenizer,
                                   text_encoder=pipeline.text_encoder,
                                   timesteps=pipeline.scheduler.timesteps,
                                   unet_layers=constants.UNET_LAYERS,
                                   placeholder_token=placeholder_token,
                                   placeholder_token_id=placeholder_token_id,
                                   torch_dtype=torch.float16)
    image = run_inference(prompt=prompt_input.replace("*", CONCEPT_TO_PLACEHOLDER[concept_name]),
                          pipeline=pipeline,
                          prompt_manager=prompt_manager,
                          seeds=[int(seed)],
                          num_images_per_prompt=1,
                          truncation_idx=truncation_idx if use_truncation else None)
    return [image]


with gr.Blocks(css='style.css') as demo:
    gr.Markdown(DESCRIPTION)

    gr.HTML('''<a href="https://huggingface.co/spaces/neural-ti/NeTI?duplicate=true"><img src="https://bit.ly/3gLdBN6"
    alt="Duplicate Space"></a>''')

    with gr.Row():
        with gr.Column():
            concept = gr.Dropdown(get_possible_concepts(), multiselect=False, label="Concept",
                                  info="Choose your concept")
            prompt = gr.Textbox(label="Input prompt", info="Input prompt with placeholder for concept. "
                                                           "Please use * to specify the concept.")
            random_seed = gr.Number(value=42, label="Random seed", precision=0)
            use_truncation = gr.Checkbox(label="Use inference-time dropout",
                                         info="Whether to use our dropout technique when computing the concept "
                                              "embeddings.")
            truncation_idx = gr.Slider(8, 128, label="Truncation index",
                                       info="If using truncation, which index to truncate from. Lower numbers tend to "
                                            "result in more editable images, but at the cost of reconstruction.")
            run_button = gr.Button('Generate')

        with gr.Column():
            result = gr.Gallery(label='Result')
            inputs = [concept, prompt, random_seed, use_truncation, truncation_idx]
            outputs = [result]
            run_button.click(fn=main_pipeline, inputs=inputs, outputs=outputs)

    with gr.Row():
        examples = [
            ["maeve", "A photo of * swimming in the ocean", 5196, True, 16],
            ["dangling_child", "A photo of * in Times Square", 3552126062741487430, False, 8],
            ["teddybear", "A photo of * at his graduation ceremony after finishing his PhD", 263, True, 32],
            ["red_bowl", "A * vase filled with flowers", 13491504810502930872, False, 8],
            ["metal_bird", "* in a comic book", 1028, True, 24],
            ["fat_stone_bird", "A movie poster of The Rock, featuring * about on Godzilla", 7393181316156044422, True,
             64],
        ]
        gr.Examples(examples=examples,
                    inputs=[concept, prompt, random_seed, use_truncation, truncation_idx],
                    outputs=[result],
                    fn=main_pipeline,
                    cache_examples=True)

demo.queue(max_size=50).launch(share=False)
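For context, the Blocks UI above is a thin wrapper around main_pipeline. A click on Generate effectively runs the following sketch (it assumes the module-level mappers and pipeline globals have been populated by load_sd_and_all_tokens, that the downloaded snapshot contains a "cat" concept folder, and that a CUDA GPU is available):

# Equivalent of pressing "Generate" with the "cat" concept selected:
images = main_pipeline(concept_name="cat",
                       prompt_input="A photo of * on a beach",
                       seed=42,
                       use_truncation=True,
                       truncation_idx=16)
# main_pipeline swaps in the concept's mapper, rebuilds the PromptManager,
# and returns a single-element list of PIL images.
images[0].save("cat_on_beach.png")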
requirements.txt
ADDED
@@ -0,0 +1,10 @@
opencv-python==4.7.0.72
matplotlib
pyrallis==0.3.1
loguru==0.7.0
torch==1.13.1
torchvision==0.14.1
diffusers==0.14.0
transformers==4.27.4
accelerate==0.18.0
gradio
src/__init__.py
ADDED
File without changes
src/checkpoint_handler.py
ADDED
@@ -0,0 +1,107 @@
from pathlib import Path
from typing import Tuple

import pyrallis
import torch
from accelerate import Accelerator
from torch import nn
from transformers import CLIPTokenizer

from src.models.neti_clip_text_encoder import NeTICLIPTextModel
from src.models.neti_mapper import NeTIMapper
from src.models.positional_encoding import NeTIPositionalEncoding, BasicEncoder
from src.config import RunConfig


class CheckpointHandler:

    def __init__(self, cfg: RunConfig, placeholder_token_string: str, placeholder_token_id: int, save_root: Path):
        self.cfg = cfg
        self.placeholder_token_string = placeholder_token_string
        self.placeholder_token_id = placeholder_token_id
        self.save_root = save_root

    def save_model(self, text_encoder: NeTICLIPTextModel,
                   accelerator: Accelerator,
                   embeds_save_name: str,
                   mapper_save_name: str):
        self.save_learned_embeds(text_encoder, accelerator, embeds_save_name)
        self.save_mapper(text_encoder, mapper_save_name)

    def save_learned_embeds(self, text_encoder: NeTICLIPTextModel, accelerator: Accelerator, save_name: str):
        """
        Save learned embeddings. This embedding isn't really learned, but we'll add it to the tokenizer at inference
        to take the place of our placeholder token.
        """
        learned_embeds = accelerator.unwrap_model(text_encoder).get_input_embeddings().weight[self.placeholder_token_id]
        learned_embeds = learned_embeds.detach().cpu()
        learned_embeds_dict = {self.placeholder_token_string: learned_embeds}
        torch.save(learned_embeds_dict, self.save_root / save_name)

    def save_mapper(self, text_encoder: NeTICLIPTextModel, save_name: str):
        """ Save the mapper and config to be used at inference. """
        cfg_ = RunConfig(**self.cfg.__dict__.copy())
        state_dict = {
            "state_dict": text_encoder.text_model.embeddings.mapper.state_dict(),
            "cfg": pyrallis.encode(cfg_),
            "encoder": text_encoder.text_model.embeddings.mapper.encoder
        }
        torch.save(state_dict, self.save_root / save_name)

    @staticmethod
    def load_mapper(mapper_path: Path) -> Tuple[RunConfig, NeTIMapper]:
        mapper_ckpt = torch.load(mapper_path, map_location="cpu")
        cfg = pyrallis.decode(RunConfig, mapper_ckpt['cfg'])
        neti_mapper = NeTIMapper(output_dim=768,
                                 use_nested_dropout=cfg.model.use_nested_dropout,
                                 nested_dropout_prob=cfg.model.nested_dropout_prob,
                                 norm_scale=cfg.model.target_norm,
                                 use_positional_encoding=cfg.model.use_positional_encoding,
                                 num_pe_time_anchors=cfg.model.num_pe_time_anchors,
                                 pe_sigmas=cfg.model.pe_sigmas,
                                 output_bypass=cfg.model.output_bypass)
        neti_mapper.load_state_dict(mapper_ckpt['state_dict'], strict=True)
        encoder = mapper_ckpt['encoder']
        if isinstance(encoder, NeTIPositionalEncoding):
            encoder.w = nn.Parameter(mapper_ckpt['encoder'].w.cuda())
        elif isinstance(encoder, BasicEncoder):
            encoder.normalized_timesteps = mapper_ckpt['encoder'].normalized_timesteps.cuda()
            encoder.normalized_unet_layers = mapper_ckpt['encoder'].normalized_unet_layers.cuda()
        neti_mapper.encoder = encoder.cuda()
        neti_mapper.cuda()
        neti_mapper.eval()
        return cfg, neti_mapper

    @staticmethod
    def load_learned_embed_in_clip(learned_embeds_path: Path,
                                   text_encoder: NeTICLIPTextModel,
                                   tokenizer: CLIPTokenizer) -> Tuple[str, int]:
        loaded_learned_embeds = torch.load(learned_embeds_path, map_location="cpu")

        # separate token and the embeds
        trained_tokens = list(loaded_learned_embeds.keys())
        embeds = list(loaded_learned_embeds.values())

        # cast to dtype of text_encoder
        dtype = text_encoder.get_input_embeddings().weight.dtype
        embeds = [e.to(dtype) for e in embeds]

        # add the tokens to the tokenizer
        num_added_tokens = tokenizer.add_tokens(trained_tokens)
        if num_added_tokens == 0:
            raise ValueError(f"The tokenizer already contains the token {trained_tokens[0]}. "
                             f"Please pass a different `token` that is not already in the tokenizer.")

        # resize the token embeddings
        text_encoder.resize_token_embeddings(len(tokenizer))

        # get the id for the token and assign the embeds
        placeholder_token_ids = [tokenizer.convert_tokens_to_ids(t) for t in trained_tokens]

        for idx, (token, token_id, embed) in enumerate(zip(trained_tokens, placeholder_token_ids, embeds)):
            text_encoder.get_input_embeddings().weight.data[token_id] = embed

        assert len(trained_tokens) == 1, "Only one placeholder token is supported"
        placeholder_token = trained_tokens[0]
        placeholder_token_id = placeholder_token_ids[0]
        return placeholder_token, placeholder_token_id
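For reference, a mapper checkpoint produced by save_mapper is a plain dictionary with three entries, so it can be inspected directly. A minimal sketch (it assumes the neural-ti/NeTI snapshot has been downloaded into trained_models/ and follows the cat/cat-mapper.pt naming used by the demo above):

import torch

ckpt = torch.load("trained_models/cat/cat-mapper.pt", map_location="cpu")
# "state_dict": the NeTIMapper weights
# "cfg":        the training RunConfig, flattened into plain dicts by pyrallis.encode
# "encoder":    the pickled NeTIPositionalEncoding / BasicEncoder module
print(ckpt.keys())  # dict_keys(['state_dict', 'cfg', 'encoder'])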
src/config.py
ADDED
@@ -0,0 +1,146 @@
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Optional, Dict

from src.constants import VALIDATION_PROMPTS
from src.utils.types import PESigmas


@dataclass
class LogConfig:
    """ Parameters for logging and saving """
    # Name of experiment. This will be the name of the output folder
    exp_name: str
    # The output directory where the model predictions and checkpoints will be written
    exp_dir: Path = Path("./outputs")
    # Save interval
    save_steps: int = 250
    # [TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to
    # `output_dir/runs/**CURRENT_DATETIME_HOSTNAME`
    logging_dir: Path = Path("logs")
    # The integration to report the results to. Supported platforms are "tensorboard"
    # (default), "wandb" and "comet_ml". Use "all" to report to all integrations.
    report_to: str = "tensorboard"
    # Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator`
    checkpoints_total_limit: Optional[int] = None


@dataclass
class DataConfig:
    """ Parameters for data """
    # A folder containing the training data
    train_data_dir: Path
    # A token to use as a placeholder for the concept
    placeholder_token: str
    # Super category token to use for normalizing the mapper output
    super_category_token: Optional[str] = "object"
    # Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process
    dataloader_num_workers: int = 8
    # Choose between 'object' and 'style' - used for selecting the prompts for training
    learnable_property: str = "object"
    # How many times to repeat the training data
    repeats: int = 100
    # The resolution for input images, all the images in the train/validation dataset will be resized to this resolution
    resolution: int = 512
    # Whether to center crop images before resizing to resolution
    center_crop: bool = False


@dataclass
class ModelConfig:
    """ Parameters for defining all models """
    # Path to pretrained model or model identifier from huggingface.co/models
    pretrained_model_name_or_path: str = "CompVis/stable-diffusion-v1-4"
    # Whether to use our Nested Dropout technique
    use_nested_dropout: bool = True
    # Probability to apply nested dropout during training
    nested_dropout_prob: float = 0.5
    # Whether to normalize the norm of the mapper's output vector
    normalize_mapper_output: bool = True
    # Target norm for the mapper's output vector
    target_norm: Optional[float] = None
    # Whether to use positional encoding over the input to the mapper
    use_positional_encoding: bool = True
    # Sigmas used for computing positional encoding
    pe_sigmas: Dict[str, float] = field(default_factory=lambda: {'sigma_t': 0.03, 'sigma_l': 2.0})
    # Number of time anchors for computing our positional encodings
    num_pe_time_anchors: int = 10
    # Whether to output the textual bypass vector
    output_bypass: bool = True
    # Revision of pretrained model identifier from huggingface.co/models
    revision: Optional[str] = None
    # Whether training should be resumed from a previous checkpoint
    mapper_checkpoint_path: Optional[Path] = None

    def __post_init__(self):
        if self.pe_sigmas is not None:
            assert len(self.pe_sigmas) == 2, "Should provide exactly two sigma values: one for time and one for layers!"
            self.pe_sigmas = PESigmas(sigma_t=self.pe_sigmas['sigma_t'], sigma_l=self.pe_sigmas['sigma_l'])


@dataclass
class EvalConfig:
    """ Parameters for validation """
    # A list of prompts that will be used during validation to verify that the model is learning
    validation_prompts: List[str] = field(default_factory=lambda: VALIDATION_PROMPTS)
    # Number of images that should be generated during validation with `validation_prompt`
    num_validation_images: int = 4
    # Seeds to use for generating the validation images
    validation_seeds: Optional[List[int]] = field(default_factory=lambda: [42, 420, 501, 5456])
    # Run validation every X steps
    validation_steps: int = 100
    # Number of denoising steps
    num_denoising_steps: int = 50

    def __post_init__(self):
        if self.validation_seeds is None:
            self.validation_seeds = list(range(self.num_validation_images))
        assert len(self.validation_seeds) == self.num_validation_images, \
            "Length of validation_seeds should equal num_validation_images"


@dataclass
class OptimConfig:
    """ Parameters for the optimization process """
    # Total number of training steps to perform
    max_train_steps: Optional[int] = 1_000
    # Learning rate
    learning_rate: float = 1e-3
    # Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size
    scale_lr: bool = True
    # Batch size (per device) for the training dataloader
    train_batch_size: int = 2
    # Whether or not to use gradient checkpointing to save memory at the expense of a slower backward pass
    gradient_checkpointing: bool = False
    # Number of update steps to accumulate before performing a backward/update pass
    gradient_accumulation_steps: int = 4
    # A seed for reproducible training
    seed: Optional[int] = None
    # The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",
    # "constant", "constant_with_warmup"]
    lr_scheduler: str = "constant"
    # Number of steps for the warmup in the lr scheduler
    lr_warmup_steps: int = 0
    # The beta1 parameter for the Adam optimizer
    adam_beta1: float = 0.9
    # The beta2 parameter for the Adam optimizer
    adam_beta2: float = 0.999
    # Weight decay to use
    adam_weight_decay: float = 1e-2
    # Epsilon value for the Adam optimizer
    adam_epsilon: float = 1e-08
    # Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10
    # and an Nvidia Ampere GPU.
    mixed_precision: str = "no"
    # Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see
    # https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
    allow_tf32: bool = False


@dataclass
class RunConfig:
    """ The main configuration for the coach trainer """
    log: LogConfig = field(default_factory=LogConfig)
    data: DataConfig = field(default_factory=DataConfig)
    model: ModelConfig = field(default_factory=ModelConfig)
    eval: EvalConfig = field(default_factory=EvalConfig)
    optim: OptimConfig = field(default_factory=OptimConfig)
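These dataclasses are consumed through pyrallis: CheckpointHandler.save_mapper stores the training configuration with pyrallis.encode, and load_mapper rebuilds the exact RunConfig with pyrallis.decode. A small round-trip sketch under those same assumptions (the experiment name, data directory, and placeholder token below are illustrative only):

import pyrallis
from pathlib import Path
from src.config import RunConfig, LogConfig, DataConfig

cfg = RunConfig(log=LogConfig(exp_name="cat_run"),
                data=DataConfig(train_data_dir=Path("data/cat"), placeholder_token="<cat>"))
encoded = pyrallis.encode(cfg)              # plain nested dict, safe to store inside a torch checkpoint
restored = pyrallis.decode(RunConfig, encoded)
print(restored.model.pretrained_model_name_or_path)  # "CompVis/stable-diffusion-v1-4"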
src/constants.py
ADDED
@@ -0,0 +1,83 @@
UNET_LAYERS = ['IN01', 'IN02', 'IN04', 'IN05', 'IN07', 'IN08', 'MID',
               'OUT03', 'OUT04', 'OUT05', 'OUT06', 'OUT07', 'OUT08', 'OUT09', 'OUT10', 'OUT11']

SD_INFERENCE_TIMESTEPS = [999, 979, 959, 939, 919, 899, 879, 859, 839, 819, 799, 779, 759, 739, 719, 699, 679, 659,
                          639, 619, 599, 579, 559, 539, 519, 500, 480, 460, 440, 420, 400, 380, 360, 340, 320, 300,
                          280, 260, 240, 220, 200, 180, 160, 140, 120, 100, 80, 60, 40, 20]

PROMPTS = [
    "A photo of a {}",
    "A photo of {} in the jungle",
    "A photo of {} on a beach",
    "A photo of {} in Times Square",
    "A photo of {} in the moon",
    "A painting of {} in the style of Monet",
    "Oil painting of {}",
    "A Marc Chagall painting of {}",
    "A manga drawing of {}",
    "A watercolor painting of {}",
    "A statue of {}",
    "App icon of {}",
    "A sand sculpture of {}",
    "Colorful graffiti of {}",
    "A photograph of two {} on a table",
]

VALIDATION_PROMPTS = [
    "A photo of a {}",
    "A photo of a {} on a beach",
    "App icon of {}",
    "A painting of {} in the style of Monet",
]

IMAGENET_TEMPLATES_SMALL = [
    "a photo of a {}",
    "a rendering of a {}",
    "a cropped photo of the {}",
    "the photo of a {}",
    "a photo of a clean {}",
    "a photo of a dirty {}",
    "a dark photo of the {}",
    "a photo of my {}",
    "a photo of the cool {}",
    "a close-up photo of a {}",
    "a bright photo of the {}",
    "a cropped photo of a {}",
    "a photo of the {}",
    "a good photo of the {}",
    "a photo of one {}",
    "a close-up photo of the {}",
    "a rendition of the {}",
    "a photo of the clean {}",
    "a rendition of a {}",
    "a photo of a nice {}",
    "a good photo of a {}",
    "a photo of the nice {}",
    "a photo of the small {}",
    "a photo of the weird {}",
    "a photo of the large {}",
    "a photo of a cool {}",
    "a photo of a small {}",
]

IMAGENET_STYLE_TEMPLATES_SMALL = [
    "a painting in the style of {}",
    "a rendering in the style of {}",
    "a cropped painting in the style of {}",
    "the painting in the style of {}",
    "a clean painting in the style of {}",
    "a dirty painting in the style of {}",
    "a dark painting in the style of {}",
    "a picture in the style of {}",
    "a cool painting in the style of {}",
    "a close-up painting in the style of {}",
    "a bright painting in the style of {}",
    "a cropped painting in the style of {}",
    "a good painting in the style of {}",
    "a close-up painting in the style of {}",
    "a rendition in the style of {}",
    "a nice painting in the style of {}",
    "a small painting in the style of {}",
    "a weird painting in the style of {}",
    "a large painting in the style of {}",
]
src/models/__init__.py
ADDED
File without changes
src/models/net_clip_text_embedding.py
ADDED
@@ -0,0 +1,60 @@
from typing import Optional, Tuple

import torch
from torch import nn
from transformers import CLIPTextConfig

from src.models.neti_mapper import NeTIMapper
from src.utils.types import NeTIBatch


class NeTICLIPTextEmbeddings(nn.Module):
    """ Modification of CLIPTextEmbedding to allow for the use of a NeTIMapper to overwrite the concept token. """

    def __init__(self, config: CLIPTextConfig):
        super().__init__()
        embed_dim = config.hidden_size
        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)
        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))

    def set_mapper(self, mapper: NeTIMapper):
        self.mapper = mapper

    def forward(self, input_ids: Optional[torch.LongTensor] = None,
                position_ids: Optional[torch.LongTensor] = None,
                inputs_embeds: Optional[torch.FloatTensor] = None,
                batch: Optional[NeTIBatch] = None) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:

        if batch is not None:
            input_ids = batch.input_ids

        seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        ####################################################################
        # NeTI logic - Use mapper to overwrite the learnable token embedding
        ####################################################################
        bypass_outputs = None
        if batch is not None:
            mapper_outputs = self.mapper(timestep=batch.timesteps.float(),
                                         unet_layer=batch.unet_layers.float(),
                                         truncation_idx=batch.truncation_idx)
            mapper_outputs = mapper_outputs.to(dtype=inputs_embeds.dtype, device=inputs_embeds.device)
            if self.mapper.output_bypass:
                bypass_outputs = mapper_outputs[:, mapper_outputs.shape[1] // 2:]
                mapper_outputs = mapper_outputs[:, :mapper_outputs.shape[1] // 2]

            # Overwrite the index of the placeholder token with the mapper output for each entry in the batch
            learnable_idxs = (input_ids == batch.placeholder_token_id).nonzero(as_tuple=True)[1]
            inputs_embeds[torch.arange(input_ids.shape[0]), learnable_idxs] = mapper_outputs

        position_embeddings = self.position_embedding(position_ids)
        embeddings = inputs_embeds + position_embeddings

        return embeddings, bypass_outputs
src/models/neti_clip_text_encoder.py
ADDED
@@ -0,0 +1,160 @@
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from transformers.modeling_outputs import BaseModelOutputWithPooling
from transformers.models.clip.modeling_clip import CLIPTextConfig, CLIPTextModel, CLIPEncoder
from transformers.models.clip.modeling_clip import CLIPTextTransformer, _expand_mask

from src.models.net_clip_text_embedding import NeTICLIPTextEmbeddings
from src.utils.types import NeTIBatch


class NeTICLIPTextModel(CLIPTextModel):
    """ Modification of CLIPTextModel to use our NeTI mapper for computing the embeddings of the concept. """

    def __init__(self, config: CLIPTextConfig):
        super().__init__(config)
        self.text_model = NeTICLIPTextTransformer(config)
        self.post_init()

    def forward(self, input_ids: Optional[torch.Tensor] = None,
                attention_mask: Optional[torch.Tensor] = None,
                position_ids: Optional[torch.Tensor] = None,
                output_attentions: Optional[bool] = None,
                output_hidden_states: Optional[bool] = None,
                return_dict: Optional[bool] = None,
                batch: Optional[NeTIBatch] = None) -> Union[Tuple, BaseModelOutputWithPooling]:
        return self.text_model.forward(
            batch=batch,
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


class NeTICLIPTextTransformer(CLIPTextTransformer):
    """ Modification of CLIPTextTransformer to use our NeTI mapper for computing the embeddings of the concept. """

    def __init__(self, config: CLIPTextConfig):
        super().__init__(config=config)
        self.config = config
        embed_dim = config.hidden_size
        self.embeddings = NeTICLIPTextEmbeddings(config)
        self.encoder = CLIPEncoder(config)
        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    def forward(self, input_ids: Optional[torch.Tensor] = None,
                attention_mask: Optional[torch.Tensor] = None,
                position_ids: Optional[torch.Tensor] = None,
                output_attentions: Optional[bool] = None,
                output_hidden_states: Optional[bool] = None,
                return_dict: Optional[bool] = None,
                batch: Optional[NeTIBatch] = None) -> Union[Tuple, BaseModelOutputWithPooling]:

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        bypass_output = None

        if input_ids is not None:  # Regular embedding logic
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
            hidden_states, _ = self.embeddings(input_ids=input_ids, position_ids=position_ids)

        ###########################
        # NeTI logic
        ###########################
        elif batch is not None:
            input_shape = batch.input_ids.size()
            batch.input_ids = batch.input_ids.view(-1, input_shape[-1])
            hidden_states, bypass_output = self.embeddings(batch=batch, position_ids=position_ids)

        else:
            raise ValueError("You have to specify either batch or input_ids!")

        bsz, seq_len = input_shape
        # CLIP's text model uses causal mask, prepare it here.
        # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
        causal_attention_mask = self._build_causal_attention_mask(bsz, seq_len, hidden_states.dtype).to(
            hidden_states.device
        )

        # expand attention_mask
        if attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            attention_mask = _expand_mask(attention_mask, hidden_states.dtype)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        last_hidden_state_with_bypass = last_hidden_state.clone()

        ###############################################
        # NeTI logic - compute the scaled bypass output
        ###############################################
        if bypass_output is not None:
            learnable_idxs = (batch.input_ids == batch.placeholder_token_id).nonzero(as_tuple=True)[1]
            existing_state = last_hidden_state_with_bypass[torch.arange(last_hidden_state.shape[0]), learnable_idxs]
            bypass_output = bypass_output / bypass_output.norm(dim=1, keepdim=True) \
                            * existing_state.norm(dim=1, keepdim=True)
            new_state = existing_state + 0.2 * bypass_output
            new_state = new_state.to(dtype=hidden_states.dtype)
            last_hidden_state_with_bypass[torch.arange(last_hidden_state.shape[0]), learnable_idxs] = new_state

        last_hidden_state = self.final_layer_norm(last_hidden_state)
        last_hidden_state_with_bypass = self.final_layer_norm(last_hidden_state_with_bypass)

        # text_embeds.shape = [batch_size, sequence_length, transformer.width]
        # take features from the eot embedding (eot_token is the highest number in each sequence)
        # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
        if input_ids is not None:
            pooled_output = last_hidden_state[
                torch.arange(last_hidden_state.shape[0]), input_ids.to(torch.int).argmax(dim=-1)
            ]
            pooled_output_with_bypass = last_hidden_state_with_bypass[
                torch.arange(last_hidden_state_with_bypass.shape[0]), input_ids.to(torch.int).argmax(dim=-1)
            ]
        elif batch is not None:
            pooled_output = last_hidden_state[
                torch.arange(last_hidden_state.shape[0]), batch.input_ids.to(torch.int).argmax(dim=-1)
            ]
            pooled_output_with_bypass = last_hidden_state_with_bypass[
                torch.arange(last_hidden_state_with_bypass.shape[0]), batch.input_ids.to(torch.int).argmax(dim=-1)
            ]
        else:
            raise ValueError("You have to specify either batch or input_ids!")

        if bypass_output is not None:
            return BaseModelOutputWithPooling(
                last_hidden_state=last_hidden_state,
                pooler_output=pooled_output,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            ), BaseModelOutputWithPooling(
                last_hidden_state=last_hidden_state_with_bypass,
                pooler_output=pooled_output_with_bypass,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )
        else:
            return BaseModelOutputWithPooling(
                last_hidden_state=last_hidden_state,
                pooler_output=pooled_output,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            ), None
src/models/neti_mapper.py
ADDED
@@ -0,0 +1,90 @@
import random
from typing import Optional, List

import torch
import torch.nn.functional as F
from torch import nn

from src.constants import UNET_LAYERS
from src.models.positional_encoding import NeTIPositionalEncoding, BasicEncoder
from src.utils.types import PESigmas


class NeTIMapper(nn.Module):
    """ Main logic of our NeTI mapper. """

    def __init__(self, output_dim: int = 768,
                 unet_layers: List[str] = UNET_LAYERS,
                 use_nested_dropout: bool = True,
                 nested_dropout_prob: float = 0.5,
                 norm_scale: Optional[torch.Tensor] = None,
                 use_positional_encoding: bool = True,
                 num_pe_time_anchors: int = 10,
                 pe_sigmas: PESigmas = PESigmas(sigma_t=0.03, sigma_l=2.0),
                 output_bypass: bool = True):
        super().__init__()
        self.use_nested_dropout = use_nested_dropout
        self.nested_dropout_prob = nested_dropout_prob
        self.norm_scale = norm_scale
        self.output_bypass = output_bypass
        if self.output_bypass:
            output_dim *= 2  # Output two vectors

        self.use_positional_encoding = use_positional_encoding
        if self.use_positional_encoding:
            self.encoder = NeTIPositionalEncoding(sigma_t=pe_sigmas.sigma_t, sigma_l=pe_sigmas.sigma_l).cuda()
            self.input_dim = num_pe_time_anchors * len(unet_layers)
        else:
            self.encoder = BasicEncoder().cuda()
            self.input_dim = 2

        self.set_net(num_unet_layers=len(unet_layers),
                     num_time_anchors=num_pe_time_anchors,
                     output_dim=output_dim)

    def set_net(self, num_unet_layers: int, num_time_anchors: int, output_dim: int = 768):
        self.input_layer = self.set_input_layer(num_unet_layers, num_time_anchors)
        self.net = nn.Sequential(self.input_layer,
                                 nn.Linear(self.input_dim, 128), nn.LayerNorm(128), nn.LeakyReLU(),
                                 nn.Linear(128, 128), nn.LayerNorm(128), nn.LeakyReLU())
        self.output_layer = nn.Sequential(nn.Linear(128, output_dim))

    def set_input_layer(self, num_unet_layers: int, num_time_anchors: int) -> nn.Module:
        if self.use_positional_encoding:
            input_layer = nn.Linear(self.encoder.num_w * 2, self.input_dim)
            input_layer.weight.data = self.encoder.init_layer(num_time_anchors, num_unet_layers)
        else:
            input_layer = nn.Identity()
        return input_layer

    def forward(self, timestep: torch.Tensor, unet_layer: torch.Tensor, truncation_idx: int = None) -> torch.Tensor:
        embedding = self.extract_hidden_representation(timestep, unet_layer)
        if self.use_nested_dropout:
            embedding = self.apply_nested_dropout(embedding, truncation_idx=truncation_idx)
        embedding = self.get_output(embedding)
        return embedding

    def get_encoded_input(self, timestep: torch.Tensor, unet_layer: torch.Tensor) -> torch.Tensor:
        return self.encoder.encode(timestep, unet_layer)

    def extract_hidden_representation(self, timestep: torch.Tensor, unet_layer: torch.Tensor) -> torch.Tensor:
        encoded_input = self.get_encoded_input(timestep, unet_layer)
        embedding = self.net(encoded_input)
        return embedding

    def apply_nested_dropout(self, embedding: torch.Tensor, truncation_idx: int = None) -> torch.Tensor:
        if self.training:
            if random.random() < self.nested_dropout_prob:
                dropout_idxs = torch.randint(low=0, high=embedding.shape[1], size=(embedding.shape[0],))
                for idx in torch.arange(embedding.shape[0]):
                    embedding[idx][dropout_idxs[idx]:] = 0
        if not self.training and truncation_idx is not None:
            for idx in torch.arange(embedding.shape[0]):
                embedding[idx][truncation_idx:] = 0
        return embedding

    def get_output(self, embedding: torch.Tensor) -> torch.Tensor:
        embedding = self.output_layer(embedding)
        if self.norm_scale is not None:
            embedding = F.normalize(embedding, dim=-1) * self.norm_scale
        return embedding
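The mapper turns a (timestep, U-Net layer) pair into a token embedding. A small sketch of how it is queried at inference, mirroring what NeTICLIPTextEmbeddings.forward does above (a freshly initialized, untrained mapper is used here only to show the interface and shapes; a CUDA device is required because the encoder is moved to the GPU on construction):

import torch
from src.models.neti_mapper import NeTIMapper

mapper = NeTIMapper(output_dim=768, output_bypass=True).cuda().eval()
timestep = torch.tensor([999.0]).cuda()    # denoising timestep t
unet_layer = torch.tensor([0.0]).cuda()    # index into UNET_LAYERS ('IN01')
with torch.no_grad():
    out = mapper(timestep=timestep, unet_layer=unet_layer, truncation_idx=16)
# With output_bypass=True the output dimension is doubled to 1536:
# the first 768 entries replace the word embedding, the last 768 are the textual bypass.
print(out.shape)  # torch.Size([1, 1536])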
src/models/positional_encoding.py
ADDED
@@ -0,0 +1,57 @@
from typing import Union

import torch
from torch import nn


class NeTIPositionalEncoding(nn.Module):

    def __init__(self, sigma_t: float, sigma_l: float, num_w: int = 1024):
        super().__init__()
        self.sigma_t = sigma_t
        self.sigma_l = sigma_l
        self.num_w = num_w
        self.w = torch.randn((num_w, 2))
        self.w[:, 0] *= sigma_t
        self.w[:, 1] *= sigma_l
        self.w = nn.Parameter(self.w).cuda()

    def encode(self, t: Union[int, torch.Tensor], l: Union[int, torch.Tensor]):
        """ Maps the given time and layer input into a 2048-dimensional vector. """
        if type(t) == int or t.ndim == 0:
            x = torch.tensor([t, l]).float()
        else:
            x = torch.stack([t, l], dim=1).T
        x = x.cuda()
        v = torch.cat([torch.sin(self.w.detach() @ x), torch.cos(self.w.detach() @ x)])
        if type(t) == int:
            v_norm = v / v.norm()
        else:
            v_norm = v / v.norm(dim=0)
            v_norm = v_norm.T
        return v_norm

    def init_layer(self, num_time_anchors: int, num_layers: int) -> torch.Tensor:
        """ Computes the weights for the positional encoding layer of size 160x2048. """
        anchor_vectors = []
        for t_anchor in range(0, 1000, 1000 // num_time_anchors):
            for l_anchor in range(0, num_layers):
                anchor_vectors.append(self.encode(t_anchor, l_anchor).float())
        A = torch.stack(anchor_vectors)
        return A


class BasicEncoder(nn.Module):
    """ Simply normalizes the given timestep and unet layer to be between -1 and 1. """

    def __init__(self, num_denoising_timesteps: int = 1000, num_unet_layers: int = 16):
        super().__init__()
        self.normalized_timesteps = (torch.arange(num_denoising_timesteps) / (num_denoising_timesteps - 1)) * 2 - 1
        self.normalized_unet_layers = (torch.arange(num_unet_layers) / (num_unet_layers - 1)) * 2 - 1
        self.normalized_timesteps = nn.Parameter(self.normalized_timesteps).cuda()
        self.normalized_unet_layers = nn.Parameter(self.normalized_unet_layers).cuda()

    def encode(self, timestep: torch.Tensor, unet_layer: torch.Tensor) -> torch.Tensor:
        normalized_input = torch.stack([self.normalized_timesteps[timestep.long()],
                                        self.normalized_unet_layers[unet_layer.long()]]).T
        return normalized_input
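In other words, the encoding is a random Fourier feature of the pair x = (t, l): with the rows of W drawn from N(0, diag(sigma_t^2, sigma_l^2)), encode returns the unit-normalized concatenation [sin(Wx); cos(Wx)] of length 2 * num_w = 2048, and init_layer evaluates it at 10 time anchors for each of the 16 U-Net layers to seed the 160x2048 input projection of the mapper. A quick sketch of the shapes (CUDA required, since the module moves its weights to the GPU):

import torch
from src.models.positional_encoding import NeTIPositionalEncoding

pe = NeTIPositionalEncoding(sigma_t=0.03, sigma_l=2.0)
v = pe.encode(t=999, l=0)                         # a single (timestep, layer) pair
print(v.shape)                                    # torch.Size([2048]), unit norm
A = pe.init_layer(num_time_anchors=10, num_layers=16)
print(A.shape)                                    # torch.Size([160, 2048])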
src/models/xti_attention_processor.py
ADDED
@@ -0,0 +1,57 @@
from typing import Dict, Optional

import torch
from diffusers.models.cross_attention import CrossAttention


class XTIAttenProc:

    def __call__(self, attn: CrossAttention,
                 hidden_states: torch.Tensor,
                 encoder_hidden_states: Optional[Dict[str, torch.Tensor]] = None,
                 attention_mask: Optional[torch.Tensor] = None):

        _ehs_bypass = None
        if encoder_hidden_states is not None:
            if isinstance(encoder_hidden_states, dict):
                this_idx = encoder_hidden_states["this_idx"]
                _ehs = encoder_hidden_states[f"CONTEXT_TENSOR_{this_idx}"]
                if f"CONTEXT_TENSOR_BYPASS_{this_idx}" in encoder_hidden_states:
                    _ehs_bypass = encoder_hidden_states[f"CONTEXT_TENSOR_BYPASS_{this_idx}"]
                encoder_hidden_states["this_idx"] += 1
                encoder_hidden_states["this_idx"] %= 16
            else:
                _ehs = encoder_hidden_states
        else:
            _ehs = None

        batch_size, sequence_length, _ = (hidden_states.shape if _ehs is None else _ehs.shape)
        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
        query = attn.to_q(hidden_states)

        if _ehs is None:
            _ehs = hidden_states
        elif attn.cross_attention_norm:
            _ehs = attn.norm_cross(_ehs)
            _ehs_bypass = attn.norm_cross(_ehs_bypass)

        key = attn.to_k(_ehs)
        if _ehs_bypass is not None:
            value = attn.to_v(_ehs_bypass)
        else:
            value = attn.to_v(_ehs)

        query = attn.head_to_batch_dim(query)
        key = attn.head_to_batch_dim(key)
        value = attn.head_to_batch_dim(value)

        attention_probs = attn.get_attention_scores(query, key, attention_mask)
        hidden_states = torch.bmm(attention_probs, value)
        hidden_states = attn.batch_to_head_dim(hidden_states)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        return hidden_states
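The processor expects encoder_hidden_states to be the per-layer dictionary built by PromptManager.embed_prompt below: a running counter plus one context tensor (and optionally a bypass tensor) per cross-attention layer. A sketch of the expected structure for one denoising step (the shapes assume Stable Diffusion v1.4, i.e. 77 tokens and a 768-dimensional text encoder):

# encoder_hidden_states passed to the U-Net for a single denoising step:
# {
#     "this_idx": 0,                    # incremented (mod 16) as the 16 cross-attention layers are visited
#     "CONTEXT_TENSOR_0": ...,          # conditioning for UNET_LAYERS[0] ('IN01'), shape (B, 77, 768)
#     "CONTEXT_TENSOR_BYPASS_0": ...,   # same layer, with the scaled textual-bypass vector added
#     ...
#     "CONTEXT_TENSOR_15": ...,
#     "CONTEXT_TENSOR_BYPASS_15": ...,
# }
# The plain tensor feeds the key projection; when present, the bypass tensor feeds the value projection.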
src/prompt_manager.py
ADDED
@@ -0,0 +1,63 @@
from typing import Optional, List, Dict, Any

import torch
from tqdm import tqdm
from transformers import CLIPTokenizer

from src import constants
from src.models.neti_clip_text_encoder import NeTICLIPTextModel
from src.utils.types import NeTIBatch


class PromptManager:
    """ Class for computing all time and space embeddings for a given prompt. """

    def __init__(self, tokenizer: CLIPTokenizer,
                 text_encoder: NeTICLIPTextModel,
                 timesteps: List[int] = constants.SD_INFERENCE_TIMESTEPS,
                 unet_layers: List[str] = constants.UNET_LAYERS,
                 placeholder_token_id: Optional[List] = None,
                 placeholder_token: Optional[List] = None,
                 torch_dtype: torch.dtype = torch.float32):
        self.tokenizer = tokenizer
        self.text_encoder = text_encoder
        self.timesteps = timesteps
        self.unet_layers = unet_layers
        self.placeholder_token = placeholder_token
        self.placeholder_token_id = placeholder_token_id
        self.dtype = torch_dtype

    def embed_prompt(self, text: str,
                     truncation_idx: Optional[int] = None,
                     num_images_per_prompt: int = 1) -> List[Dict[str, Any]]:
        """
        Compute the conditioning vectors for the given prompt. We assume that the prompt is defined using `{}`
        for indicating where to place the placeholder token string. See constants.VALIDATION_PROMPTS for examples.
        """
        text = text.format(self.placeholder_token)
        ids = self.tokenizer(
            text,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            return_tensors="pt",
        ).input_ids

        # Compute embeddings for each timestep and each U-Net layer
        print(f"Computing embeddings over {len(self.timesteps)} timesteps and {len(self.unet_layers)} U-Net layers.")
        hidden_states_per_timestep = []
        for timestep in tqdm(self.timesteps):
            _hs = {"this_idx": 0}.copy()
            for layer_idx, unet_layer in enumerate(self.unet_layers):
                batch = NeTIBatch(input_ids=ids.to(device=self.text_encoder.device),
                                  timesteps=timestep.unsqueeze(0).to(device=self.text_encoder.device),
                                  unet_layers=torch.tensor(layer_idx, device=self.text_encoder.device).unsqueeze(0),
                                  placeholder_token_id=self.placeholder_token_id,
                                  truncation_idx=truncation_idx)
                layer_hs, layer_hs_bypass = self.text_encoder(batch=batch)
                layer_hs = layer_hs[0].to(dtype=self.dtype)
                _hs[f"CONTEXT_TENSOR_{layer_idx}"] = layer_hs.repeat(num_images_per_prompt, 1, 1)
                if layer_hs_bypass is not None:
                    layer_hs_bypass = layer_hs_bypass[0].to(dtype=self.dtype)
                    _hs[f"CONTEXT_TENSOR_BYPASS_{layer_idx}"] = layer_hs_bypass.repeat(num_images_per_prompt, 1, 1)
            hidden_states_per_timestep.append(_hs)
        print("Done.")
        return hidden_states_per_timestep
src/scripts/__init__.py
ADDED
File without changes
src/scripts/inference.py
ADDED
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import sys
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional, List, Tuple, Union

import numpy as np
import pyrallis
import torch
from PIL import Image
from diffusers import DPMSolverMultistepScheduler, StableDiffusionPipeline
from transformers import CLIPTokenizer

sys.path.append(".")
sys.path.append("..")

from src import constants
from src.models.neti_clip_text_encoder import NeTICLIPTextModel
from src.models.neti_mapper import NeTIMapper
from src.prompt_manager import PromptManager
from src.sd_pipeline_call import sd_pipeline_call
from src.models.xti_attention_processor import XTIAttenProc
from src.checkpoint_handler import CheckpointHandler
from src.utils import vis_utils


@dataclass
class InferenceConfig:
    # Specifies which checkpoint iteration we want to load
    iteration: Optional[int] = None
    # The input directory containing the saved models and embeddings
    input_dir: Optional[Path] = None
    # Where to save the inference results to
    inference_dir: Optional[Path] = None
    # Specific path to the mapper you want to load, overrides `input_dir`
    mapper_checkpoint_path: Optional[Path] = None
    # Specific path to the embeddings you want to load, overrides `input_dir`
    learned_embeds_path: Optional[Path] = None
    # List of prompts to run inference on
    prompts: Optional[List[str]] = None
    # Text file containing prompts to run inference on (one prompt per line), overrides `prompts`
    prompts_file_path: Optional[Path] = None
    # List of random seeds to run on
    seeds: List[int] = field(default_factory=lambda: [42])
    # If you want to run with dropout at inference time, this specifies the truncation indices for applying dropout.
    # None indicates that no dropout will be performed. If a list of indices is provided, will run all indices.
    truncation_idxs: Optional[Union[int, List[int]]] = None
    # Whether to run with torch.float16 or torch.float32
    torch_dtype: str = "fp16"

    def __post_init__(self):
        assert bool(self.prompts) != bool(self.prompts_file_path), \
            "You must provide either prompts or prompts_file_path, but not both!"
        self._set_prompts()
        self._set_input_paths()
        self.inference_dir.mkdir(exist_ok=True, parents=True)
        # Normalize truncation_idxs so the inference loop can always iterate over it
        if self.truncation_idxs is None:
            self.truncation_idxs = [None]
        elif isinstance(self.truncation_idxs, int):
            self.truncation_idxs = [self.truncation_idxs]
        self.torch_dtype = torch.float16 if self.torch_dtype == "fp16" else torch.float32

    def _set_input_paths(self):
        if self.inference_dir is None:
            assert self.input_dir is not None, "You must pass an input_dir if you do not specify inference_dir"
            self.inference_dir = self.input_dir / f"inference_{self.iteration}"
        if self.mapper_checkpoint_path is None:
            assert self.input_dir is not None, "You must pass an input_dir if you do not specify mapper_checkpoint_path"
            self.mapper_checkpoint_path = self.input_dir / f"mapper-steps-{self.iteration}.pt"
        if self.learned_embeds_path is None:
            assert self.input_dir is not None, "You must pass an input_dir if you do not specify learned_embeds_path"
            self.learned_embeds_path = self.input_dir / f"learned_embeds-steps-{self.iteration}.bin"

    def _set_prompts(self):
        if self.prompts_file_path is not None:
            assert self.prompts_file_path.exists(), f"Prompts file {self.prompts_file_path} does not exist!"
            self.prompts = self.prompts_file_path.read_text().splitlines()


@pyrallis.wrap()
def main(infer_cfg: InferenceConfig):
    train_cfg, mapper = CheckpointHandler.load_mapper(infer_cfg.mapper_checkpoint_path)
    pipeline, placeholder_token, placeholder_token_id = load_stable_diffusion_model(
        pretrained_model_name_or_path=train_cfg.model.pretrained_model_name_or_path,
        mapper=mapper,
        learned_embeds_path=infer_cfg.learned_embeds_path,
        torch_dtype=infer_cfg.torch_dtype
    )
    prompt_manager = PromptManager(tokenizer=pipeline.tokenizer,
                                   text_encoder=pipeline.text_encoder,
                                   timesteps=pipeline.scheduler.timesteps,
                                   unet_layers=constants.UNET_LAYERS,
                                   placeholder_token=placeholder_token,
                                   placeholder_token_id=placeholder_token_id,
                                   torch_dtype=infer_cfg.torch_dtype)
    for prompt in infer_cfg.prompts:
        output_path = infer_cfg.inference_dir / prompt.format(placeholder_token)
        output_path.mkdir(exist_ok=True, parents=True)
        for truncation_idx in infer_cfg.truncation_idxs:
            print(f"Running with truncation index: {truncation_idx}")
            prompt_image = run_inference(prompt=prompt,
                                         pipeline=pipeline,
                                         prompt_manager=prompt_manager,
                                         seeds=infer_cfg.seeds,
                                         output_path=output_path,
                                         num_images_per_prompt=1,
                                         truncation_idx=truncation_idx)
            if truncation_idx is not None:
                save_name = f"{prompt.format(placeholder_token)}_truncation_{truncation_idx}.png"
            else:
                save_name = f"{prompt.format(placeholder_token)}.png"
            prompt_image.save(infer_cfg.inference_dir / save_name)


def run_inference(prompt: str,
                  pipeline: StableDiffusionPipeline,
                  prompt_manager: PromptManager,
                  seeds: List[int],
                  output_path: Optional[Path] = None,
                  num_images_per_prompt: int = 1,
                  truncation_idx: Optional[int] = None) -> Image.Image:
    with torch.autocast("cuda"):
        with torch.no_grad():
            prompt_embeds = prompt_manager.embed_prompt(prompt,
                                                        num_images_per_prompt=num_images_per_prompt,
                                                        truncation_idx=truncation_idx)
    joined_images = []
    for seed in seeds:
        generator = torch.Generator(device='cuda').manual_seed(seed)
        images = sd_pipeline_call(pipeline,
                                  prompt_embeds=prompt_embeds,
                                  generator=generator,
                                  num_images_per_prompt=num_images_per_prompt).images
        seed_image = Image.fromarray(np.concatenate(images, axis=1)).convert("RGB")
        if output_path is not None:
            save_name = f'{seed}_truncation_{truncation_idx}.png' if truncation_idx is not None else f'{seed}.png'
            seed_image.save(output_path / save_name)
        joined_images.append(seed_image)
    joined_image = vis_utils.get_image_grid(joined_images)
    return joined_image


def load_stable_diffusion_model(pretrained_model_name_or_path: str,
                                learned_embeds_path: Path,
                                mapper: Optional[NeTIMapper] = None,
                                num_denoising_steps: int = 50,
                                torch_dtype: torch.dtype = torch.float16) -> Tuple[StableDiffusionPipeline, str, int]:
    tokenizer = CLIPTokenizer.from_pretrained(
        pretrained_model_name_or_path, subfolder="tokenizer")
    text_encoder = NeTICLIPTextModel.from_pretrained(
        pretrained_model_name_or_path, subfolder="text_encoder", torch_dtype=torch_dtype,
    )
    if mapper is not None:
        text_encoder.text_model.embeddings.set_mapper(mapper)
    placeholder_token, placeholder_token_id = CheckpointHandler.load_learned_embed_in_clip(
        learned_embeds_path=learned_embeds_path,
        text_encoder=text_encoder,
        tokenizer=tokenizer
    )
    pipeline = StableDiffusionPipeline.from_pretrained(
        pretrained_model_name_or_path,
        torch_dtype=torch_dtype,
        text_encoder=text_encoder,
        tokenizer=tokenizer
    ).to("cuda")
    pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
    pipeline.scheduler.set_timesteps(num_denoising_steps, device=pipeline.device)
    pipeline.unet.set_attn_processor(XTIAttenProc())
    return pipeline, placeholder_token, placeholder_token_id


if __name__ == '__main__':
    main()
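For orientation, here is a minimal sketch of driving the functions above directly from Python instead of through the `pyrallis` CLI, assuming the imports and definitions of this file. The checkpoint paths and the prompt are placeholders for illustration, not files shipped with this commit:

from pathlib import Path

# Hypothetical checkpoint paths -- substitute the outputs of an actual training run.
train_cfg, mapper = CheckpointHandler.load_mapper(Path("outputs/mapper-steps-1000.pt"))
pipeline, token, token_id = load_stable_diffusion_model(
    pretrained_model_name_or_path=train_cfg.model.pretrained_model_name_or_path,
    learned_embeds_path=Path("outputs/learned_embeds-steps-1000.bin"),
    mapper=mapper,
)
prompt_manager = PromptManager(tokenizer=pipeline.tokenizer,
                               text_encoder=pipeline.text_encoder,
                               timesteps=pipeline.scheduler.timesteps,
                               unet_layers=constants.UNET_LAYERS,
                               placeholder_token=token,
                               placeholder_token_id=token_id,
                               torch_dtype=torch.float16)
# run_inference returns a grid over the requested seeds; per-seed images are only
# written to disk when an existing output_path is passed.
grid = run_inference(prompt="A photo of a {} on the beach",
                     pipeline=pipeline,
                     prompt_manager=prompt_manager,
                     seeds=[42, 2023])
grid.save("example_grid.png")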
src/sd_pipeline_call.py
ADDED
@@ -0,0 +1,146 @@
from typing import Any, Callable, Dict, List, Optional, Union

import torch
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionPipeline


@torch.no_grad()
def sd_pipeline_call(
        pipeline: StableDiffusionPipeline,
        prompt_embeds: torch.FloatTensor,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: int = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None):
    """ Modification of the standard SD pipeline call to support NeTI embeddings passed via the `prompt_embeds` argument. """

    # 0. Default height and width to unet
    height = height or pipeline.unet.config.sample_size * pipeline.vae_scale_factor
    width = width or pipeline.unet.config.sample_size * pipeline.vae_scale_factor

    # 2. Define call parameters
    batch_size = 1
    device = pipeline._execution_device

    neg_prompt = get_neg_prompt_input_ids(pipeline, negative_prompt)
    negative_prompt_embeds, _ = pipeline.text_encoder(
        input_ids=neg_prompt.input_ids.to(device),
        attention_mask=None,
    )
    negative_prompt_embeds = negative_prompt_embeds[0]

    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
    # corresponds to doing no classifier free guidance.
    do_classifier_free_guidance = guidance_scale > 1.0

    # 4. Prepare timesteps
    pipeline.scheduler.set_timesteps(num_inference_steps, device=device)
    timesteps = pipeline.scheduler.timesteps

    # 5. Prepare latent variables
    num_channels_latents = pipeline.unet.in_channels
    latents = pipeline.prepare_latents(
        batch_size * num_images_per_prompt,
        num_channels_latents,
        height,
        width,
        pipeline.text_encoder.dtype,
        device,
        generator,
        latents,
    )

    # 6. Prepare extra step kwargs.
    extra_step_kwargs = pipeline.prepare_extra_step_kwargs(generator, eta)

    # 7. Denoising loop
    num_warmup_steps = len(timesteps) - num_inference_steps * pipeline.scheduler.order
    with pipeline.progress_bar(total=num_inference_steps) as progress_bar:
        for i, t in enumerate(timesteps):

            # scale the latents for the current timestep (used by both branches)
            latent_model_input = pipeline.scheduler.scale_model_input(latents, t)

            if do_classifier_free_guidance:
                # predict the unconditional noise residual
                noise_pred_uncond = pipeline.unet(
                    latent_model_input,
                    t,
                    encoder_hidden_states=negative_prompt_embeds.repeat(num_images_per_prompt, 1, 1),
                    cross_attention_kwargs=cross_attention_kwargs,
                ).sample

            ###############################################################
            # NeTI logic: use the prompt embedding for the current timestep
            ###############################################################
            embed = prompt_embeds[i] if type(prompt_embeds) == list else prompt_embeds
            noise_pred_text = pipeline.unet(
                latent_model_input,
                t,
                encoder_hidden_states=embed,
                cross_attention_kwargs=cross_attention_kwargs,
            ).sample

            # perform guidance
            if do_classifier_free_guidance:
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
            else:
                noise_pred = noise_pred_text

            # compute the previous noisy sample x_t -> x_t-1
            latents = pipeline.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

            # call the callback, if provided
            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % pipeline.scheduler.order == 0):
                progress_bar.update()
                if callback is not None and i % callback_steps == 0:
                    callback(i, t, latents)

    if output_type == "latent":
        image = latents
        has_nsfw_concept = None
    elif output_type == "pil":
        # 8. Post-processing
        image = pipeline.decode_latents(latents)
        # 9. Run safety checker
        image, has_nsfw_concept = pipeline.run_safety_checker(image, device, pipeline.text_encoder.dtype)
        # 10. Convert to PIL
        image = pipeline.numpy_to_pil(image)
    else:
        # 8. Post-processing
        image = pipeline.decode_latents(latents)
        # 9. Run safety checker
        image, has_nsfw_concept = pipeline.run_safety_checker(image, device, pipeline.text_encoder.dtype)

    # Offload last model to CPU
    if hasattr(pipeline, "final_offload_hook") and pipeline.final_offload_hook is not None:
        pipeline.final_offload_hook.offload()

    if not return_dict:
        return image, has_nsfw_concept

    return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)


def get_neg_prompt_input_ids(pipeline: StableDiffusionPipeline,
                             negative_prompt: Optional[Union[str, List[str]]] = None):
    if negative_prompt is None:
        negative_prompt = ""
    uncond_tokens = [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
    uncond_input = pipeline.tokenizer(
        uncond_tokens,
        padding="max_length",
        max_length=pipeline.tokenizer.model_max_length,
        truncation=True,
        return_tensors="pt",
    )
    return uncond_input
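A brief usage note, assuming the setup from src/scripts/inference.py above: the conditioning passed as `prompt_embeds` comes from `PromptManager.embed_prompt`, and when it is a list the denoising loop indexes it with the current step, i.e. one entry per timestep. Sketch only; `pipeline` and `prompt_manager` are assumed to already exist:

embeds = prompt_manager.embed_prompt("A photo of a {}", num_images_per_prompt=1)
generator = torch.Generator(device="cuda").manual_seed(42)
output = sd_pipeline_call(pipeline,
                          prompt_embeds=embeds,  # per-timestep NeTI conditioning
                          generator=generator,
                          num_inference_steps=50)
output.images[0].save("sample.png")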
src/utils/__init__.py
ADDED
File without changes
src/utils/types.py
ADDED
@@ -0,0 +1,20 @@
import enum
from dataclasses import dataclass
from typing import Optional

import torch


@dataclass
class NeTIBatch:
    input_ids: torch.Tensor
    placeholder_token_id: int
    timesteps: torch.Tensor
    unet_layers: torch.Tensor
    truncation_idx: Optional[int] = None


@dataclass
class PESigmas:
    sigma_t: float
    sigma_l: float
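`NeTIBatch` is the small container that pairs a tokenized prompt with the space-time conditioning inputs (denoising timestep and U-Net layer indices), while `PESigmas` groups the two positional-encoding scales (`sigma_t`, `sigma_l`). A purely illustrative construction with dummy values; real batches are presumably assembled by the prompt manager and text encoder from an actual tokenizer output and scheduler timestep:

import torch

from src.utils.types import NeTIBatch

batch = NeTIBatch(
    input_ids=torch.randint(0, 49408, (1, 77)),  # stand-in for CLIP token ids
    placeholder_token_id=49408,                  # hypothetical id of the learned concept token
    timesteps=torch.tensor([981]),               # current diffusion timestep
    unet_layers=torch.arange(16),                # hypothetical indices of the conditioned U-Net layers
)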
src/utils/vis_utils.py
ADDED
@@ -0,0 +1,17 @@
import math
from typing import List

from PIL import Image


def get_image_grid(images: List[Image.Image]) -> Image.Image:
    num_images = len(images)
    cols = int(math.ceil(math.sqrt(num_images)))
    rows = int(math.ceil(num_images / cols))
    width, height = images[0].size
    grid_image = Image.new('RGB', (cols * width, rows * height))
    for i, img in enumerate(images):
        x = i % cols
        y = i // cols
        grid_image.paste(img, (x * width, y * height))
    return grid_image
style.css
ADDED
@@ -0,0 +1,3 @@
h1 {
    text-align: center;
}