Upload 30 files
- .gitattributes +2 -0
- README.md +2 -9
- app_ead_instuct.py +620 -0
- images/214000000000.jpg +0 -0
- images/311000000002.jpg +0 -0
- images/Doom_Slayer.jpg +0 -0
- images/Elon_Musk.webp +0 -0
- images/InfEdit.jpg +3 -0
- images/angry.jpg +0 -0
- images/bear.jpg +0 -0
- images/computer.png +0 -0
- images/corgi.jpg +0 -0
- images/dragon.jpg +0 -0
- images/droplet.png +0 -0
- images/frieren.jpg +0 -0
- images/genshin.png +0 -0
- images/groundhog.png +0 -0
- images/james.jpg +0 -0
- images/miku.png +0 -0
- images/moyu.png +0 -0
- images/muffin.png +0 -0
- images/osu.jfif +0 -0
- images/sam.png +3 -0
- images/summer.jpg +0 -0
- nsfw.png +0 -0
- pipeline_ead.py +707 -0
- ptp_utils.py +180 -0
- requirements.txt +8 -0
- seq_aligner.py +314 -0
- utils.py +6 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+images/InfEdit.jpg filter=lfs diff=lfs merge=lfs -text
+images/sam.png filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,13 +1,6 @@
 ---
 title: InfEdit
-
-colorFrom: purple
-colorTo: purple
+app_file: app_ead_instuct.py
 sdk: gradio
-sdk_version: 4.
-app_file: app.py
-pinned: false
-license: cc-by-nc-sa-4.0
+sdk_version: 4.7.1
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app_ead_instuct.py
ADDED
@@ -0,0 +1,620 @@
from diffusers import LCMScheduler
from pipeline_ead import EditPipeline
import os
import gradio as gr
import torch
from PIL import Image
import torch.nn.functional as nnf
from typing import Optional, Union, Tuple, List, Callable, Dict
import abc
import ptp_utils
import utils
import numpy as np
import seq_aligner
import math

LOW_RESOURCE = False
MAX_NUM_WORDS = 77

is_colab = utils.is_google_colab()
colab_instruction = "" if is_colab else """
Colab Instuction"""

torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model_id_or_path = "SimianLuo/LCM_Dreamshaper_v7"
device_print = "GPU 🔥" if torch.cuda.is_available() else "CPU 🥶"
device = "cuda" if torch.cuda.is_available() else "cpu"

if is_colab:
    scheduler = LCMScheduler.from_config(model_id_or_path, subfolder="scheduler")
    pipe = EditPipeline.from_pretrained(model_id_or_path, scheduler=scheduler, torch_dtype=torch_dtype)
else:
    # import streamlit as st
    # scheduler = DDIMScheduler.from_config(model_id_or_path, use_auth_token=st.secrets["USER_TOKEN"], subfolder="scheduler")
    # pipe = CycleDiffusionPipeline.from_pretrained(model_id_or_path, use_auth_token=st.secrets["USER_TOKEN"], scheduler=scheduler, torch_dtype=torch_dtype)
    scheduler = LCMScheduler.from_config(model_id_or_path, use_auth_token=os.environ.get("USER_TOKEN"), subfolder="scheduler")
    pipe = EditPipeline.from_pretrained(model_id_or_path, use_auth_token=os.environ.get("USER_TOKEN"), scheduler=scheduler, torch_dtype=torch_dtype)

tokenizer = pipe.tokenizer
encoder = pipe.text_encoder

if torch.cuda.is_available():
    pipe = pipe.to("cuda")


class LocalBlend:

    def get_mask(self,x_t,maps,word_idx, thresh, i):
        # print(word_idx)
        # print(maps.shape)
        # for i in range(0,self.len):
        #     self.save_image(maps[:,:,:,:,i].mean(0,keepdim=True),i,"map")
        maps = maps * word_idx.reshape(1,1,1,1,-1)
        maps = (maps[:,:,:,:,1:self.len-1]).mean(0,keepdim=True)
        # maps = maps.mean(0,keepdim=True)
        maps = (maps).max(-1)[0]
        # self.save_image(maps,i,"map")
        maps = nnf.interpolate(maps, size=(x_t.shape[2:]))
        # maps = maps.mean(1,keepdim=True)\
        maps = maps / maps.max(2, keepdim=True)[0].max(3, keepdim=True)[0]
        mask = maps > thresh
        return mask


    def save_image(self,mask,i, caption):
        image = mask[0, 0, :, :]
        image = 255 * image / image.max()
        # print(image.shape)
        image = image.unsqueeze(-1).expand(*image.shape, 3)
        # print(image.shape)
        image = image.cpu().numpy().astype(np.uint8)
        image = np.array(Image.fromarray(image).resize((256, 256)))
        if not os.path.exists(f"inter/{caption}"):
            os.mkdir(f"inter/{caption}")
        ptp_utils.save_images(image, f"inter/{caption}/{i}.jpg")


    def __call__(self, i, x_s, x_t, x_m, attention_store, alpha_prod, temperature=0.15, use_xm=False):
        maps = attention_store["down_cross"][2:4] + attention_store["up_cross"][:3]
        h,w = x_t.shape[2],x_t.shape[3]
        h , w = ((h+1)//2+1)//2, ((w+1)//2+1)//2
        # print(h,w)
        # print(maps[0].shape)
        maps = [item.reshape(2, -1, 1, h // int((h*w/item.shape[-2])**0.5), w // int((h*w/item.shape[-2])**0.5), MAX_NUM_WORDS) for item in maps]
        maps = torch.cat(maps, dim=1)
        maps_s = maps[0,:]
        maps_m = maps[1,:]
        thresh_e = temperature / alpha_prod ** (0.5)
        if thresh_e < self.thresh_e:
            thresh_e = self.thresh_e
        thresh_m = self.thresh_m
        mask_e = self.get_mask(x_t, maps_m, self.alpha_e, thresh_e, i)
        mask_m = self.get_mask(x_t, maps_s, (self.alpha_m-self.alpha_me), thresh_m, i)
        mask_me = self.get_mask(x_t, maps_m, self.alpha_me, self.thresh_e, i)
        if self.save_inter:
            self.save_image(mask_e,i,"mask_e")
            self.save_image(mask_m,i,"mask_m")
            self.save_image(mask_me,i,"mask_me")

        if self.alpha_e.sum() == 0:
            x_t_out = x_t
        else:
            x_t_out = torch.where(mask_e, x_t, x_m)
        x_t_out = torch.where(mask_m, x_s, x_t_out)
        if use_xm:
            x_t_out = torch.where(mask_me, x_m, x_t_out)

        return x_m, x_t_out

    def __init__(self,thresh_e=0.3, thresh_m=0.3, save_inter = False):
        self.thresh_e = thresh_e
        self.thresh_m = thresh_m
        self.save_inter = save_inter

    def set_map(self, ms, alpha, alpha_e, alpha_m,len):
        self.m = ms
        self.alpha = alpha
        self.alpha_e = alpha_e
        self.alpha_m = alpha_m
        alpha_me = alpha_e.to(torch.bool) & alpha_m.to(torch.bool)
        self.alpha_me = alpha_me.to(torch.float)
        self.len = len


class AttentionControl(abc.ABC):

    def step_callback(self, x_t):
        return x_t

    def between_steps(self):
        return

    @property
    def num_uncond_att_layers(self):
        return self.num_att_layers if LOW_RESOURCE else 0

    @abc.abstractmethod
    def forward(self, attn, is_cross: bool, place_in_unet: str):
        raise NotImplementedError

    def __call__(self, attn, is_cross: bool, place_in_unet: str):
        if self.cur_att_layer >= self.num_uncond_att_layers:
            if LOW_RESOURCE:
                attn = self.forward(attn, is_cross, place_in_unet)
            else:
                h = attn.shape[0]
                attn[h // 2:] = self.forward(attn[h // 2:], is_cross, place_in_unet)
        self.cur_att_layer += 1
        if self.cur_att_layer == self.num_att_layers // 2 + self.num_uncond_att_layers:
            self.cur_att_layer = 0
            self.cur_step += 1
            self.between_steps()
        return attn

    def reset(self):
        self.cur_step = 0
        self.cur_att_layer = 0

    def __init__(self):
        self.cur_step = 0
        self.num_att_layers = -1
        self.cur_att_layer = 0


class EmptyControl(AttentionControl):

    def forward(self, attn, is_cross: bool, place_in_unet: str):
        return attn
    def self_attn_forward(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs):
        b = q.shape[0] // num_heads
        out = torch.einsum("h i j, h j d -> h i d", attn, v)
        return out


class AttentionStore(AttentionControl):

    @staticmethod
    def get_empty_store():
        return {"down_cross": [], "mid_cross": [], "up_cross": [],
                "down_self": [], "mid_self": [], "up_self": []}

    def forward(self, attn, is_cross: bool, place_in_unet: str):
        key = f"{place_in_unet}_{'cross' if is_cross else 'self'}"
        if attn.shape[1] <= 32 ** 2:  # avoid memory overhead
            self.step_store[key].append(attn)
        return attn

    def between_steps(self):
        if len(self.attention_store) == 0:
            self.attention_store = self.step_store
        else:
            for key in self.attention_store:
                for i in range(len(self.attention_store[key])):
                    self.attention_store[key][i] += self.step_store[key][i]
        self.step_store = self.get_empty_store()

    def get_average_attention(self):
        average_attention = {key: [item / self.cur_step for item in self.attention_store[key]] for key in self.attention_store}
        return average_attention

    def reset(self):
        super(AttentionStore, self).reset()
        self.step_store = self.get_empty_store()
        self.attention_store = {}

    def __init__(self):
        super(AttentionStore, self).__init__()
        self.step_store = self.get_empty_store()
        self.attention_store = {}


class AttentionControlEdit(AttentionStore, abc.ABC):

    def step_callback(self,i, t, x_s, x_t, x_m, alpha_prod):
        if (self.local_blend is not None) and (i>0):
            use_xm = (self.cur_step+self.start_steps+1 == self.num_steps)
            x_m, x_t = self.local_blend(i, x_s, x_t, x_m, self.attention_store, alpha_prod, use_xm=use_xm)
        return x_m, x_t

    def replace_self_attention(self, attn_base, att_replace):
        if att_replace.shape[2] <= 16 ** 2:
            return attn_base.unsqueeze(0).expand(att_replace.shape[0], *attn_base.shape)
        else:
            return att_replace

    @abc.abstractmethod
    def replace_cross_attention(self, attn_base, att_replace):
        raise NotImplementedError

    def attn_batch(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs):
        b = q.shape[0] // num_heads

        sim = torch.einsum("h i d, h j d -> h i j", q, k) * kwargs.get("scale")
        attn = sim.softmax(-1)
        out = torch.einsum("h i j, h j d -> h i d", attn, v)
        return out

    def self_attn_forward(self, q, k, v, num_heads):
        if q.shape[0]//num_heads == 3:
            if (self.self_replace_steps <= ((self.cur_step+self.start_steps+1)*1.0 / self.num_steps) ):
                q=torch.cat([q[:num_heads*2],q[num_heads:num_heads*2]])
                k=torch.cat([k[:num_heads*2],k[:num_heads]])
                v=torch.cat([v[:num_heads*2],v[:num_heads]])
            else:
                q=torch.cat([q[:num_heads],q[:num_heads],q[:num_heads]])
                k=torch.cat([k[:num_heads],k[:num_heads],k[:num_heads]])
                v=torch.cat([v[:num_heads*2],v[:num_heads]])
            return q,k,v
        else:
            qu, qc = q.chunk(2)
            ku, kc = k.chunk(2)
            vu, vc = v.chunk(2)
            if (self.self_replace_steps <= ((self.cur_step+self.start_steps+1)*1.0 / self.num_steps) ):
                qu=torch.cat([qu[:num_heads*2],qu[num_heads:num_heads*2]])
                qc=torch.cat([qc[:num_heads*2],qc[num_heads:num_heads*2]])
                ku=torch.cat([ku[:num_heads*2],ku[:num_heads]])
                kc=torch.cat([kc[:num_heads*2],kc[:num_heads]])
                vu=torch.cat([vu[:num_heads*2],vu[:num_heads]])
                vc=torch.cat([vc[:num_heads*2],vc[:num_heads]])
            else:
                qu=torch.cat([qu[:num_heads],qu[:num_heads],qu[:num_heads]])
                qc=torch.cat([qc[:num_heads],qc[:num_heads],qc[:num_heads]])
                ku=torch.cat([ku[:num_heads],ku[:num_heads],ku[:num_heads]])
                kc=torch.cat([kc[:num_heads],kc[:num_heads],kc[:num_heads]])
                vu=torch.cat([vu[:num_heads*2],vu[:num_heads]])
                vc=torch.cat([vc[:num_heads*2],vc[:num_heads]])

            return torch.cat([qu, qc], dim=0) ,torch.cat([ku, kc], dim=0), torch.cat([vu, vc], dim=0)

    def forward(self, attn, is_cross: bool, place_in_unet: str):
        if is_cross :
            h = attn.shape[0] // self.batch_size
            attn = attn.reshape(self.batch_size,h, *attn.shape[1:])
            attn_base, attn_repalce,attn_masa = attn[0], attn[1], attn[2]
            attn_replace_new = self.replace_cross_attention(attn_masa, attn_repalce)
            attn_base_store = self.replace_cross_attention(attn_base, attn_repalce)
            if (self.cross_replace_steps >= ((self.cur_step+self.start_steps+1)*1.0 / self.num_steps) ):
                attn[1] = attn_base_store
            attn_store=torch.cat([attn_base_store,attn_replace_new])
            attn = attn.reshape(self.batch_size * h, *attn.shape[2:])
            attn_store = attn_store.reshape(2 *h, *attn_store.shape[2:])
            super(AttentionControlEdit, self).forward(attn_store, is_cross, place_in_unet)
        return attn

    def __init__(self, prompts, num_steps: int,start_steps: int,
                 cross_replace_steps: Union[float, Tuple[float, float], Dict[str, Tuple[float, float]]],
                 self_replace_steps: Union[float, Tuple[float, float]],
                 local_blend: Optional[LocalBlend]):
        super(AttentionControlEdit, self).__init__()
        self.batch_size = len(prompts)+1
        self.self_replace_steps = self_replace_steps
        self.cross_replace_steps = cross_replace_steps
        self.num_steps=num_steps
        self.start_steps=start_steps
        self.local_blend = local_blend


class AttentionReplace(AttentionControlEdit):

    def replace_cross_attention(self, attn_base, att_replace):
        return torch.einsum('hpw,bwn->bhpn', attn_base, self.mapper)

    def __init__(self, prompts, num_steps: int, cross_replace_steps: float, self_replace_steps: float,
                 local_blend: Optional[LocalBlend] = None):
        super(AttentionReplace, self).__init__(prompts, num_steps, cross_replace_steps, self_replace_steps, local_blend)
        self.mapper = seq_aligner.get_replacement_mapper(prompts, tokenizer).to(device).to(torch_dtype)


class AttentionRefine(AttentionControlEdit):

    def replace_cross_attention(self, attn_masa, att_replace):
        attn_masa_replace = attn_masa[:, :, self.mapper].squeeze()
        attn_replace = attn_masa_replace * self.alphas + \
            att_replace * (1 - self.alphas)
        return attn_replace

    def __init__(self, prompts, prompt_specifiers, num_steps: int,start_steps: int, cross_replace_steps: float, self_replace_steps: float,
                 local_blend: Optional[LocalBlend] = None):
        super(AttentionRefine, self).__init__(prompts, num_steps,start_steps, cross_replace_steps, self_replace_steps, local_blend)
        self.mapper, alphas, ms, alpha_e, alpha_m = seq_aligner.get_refinement_mapper(prompts, prompt_specifiers, tokenizer, encoder, device)
        self.mapper, alphas, ms = self.mapper.to(device), alphas.to(device).to(torch_dtype), ms.to(device).to(torch_dtype)
        self.alphas = alphas.reshape(alphas.shape[0], 1, 1, alphas.shape[1])
        self.ms = ms.reshape(ms.shape[0], 1, 1, ms.shape[1])
        ms = ms.to(device)
        alpha_e = alpha_e.to(device)
        alpha_m = alpha_m.to(device)
        t_len = len(tokenizer(prompts[1])["input_ids"])
        self.local_blend.set_map(ms,alphas,alpha_e,alpha_m,t_len)


def get_equalizer(text: str, word_select: Union[int, Tuple[int, ...]], values: Union[List[float], Tuple[float, ...]]):
    if type(word_select) is int or type(word_select) is str:
        word_select = (word_select,)
    equalizer = torch.ones(len(values), 77)
    values = torch.tensor(values, dtype=torch_dtype)
    for word in word_select:
        inds = ptp_utils.get_word_inds(text, word, tokenizer)
        equalizer[:, inds] = values
    return equalizer


def inference(img, source_prompt, target_prompt,
              local, mutual,
              positive_prompt, negative_prompt,
              guidance_s, guidance_t,
              num_inference_steps,
              width, height, seed, strength,
              cross_replace_steps, self_replace_steps,
              thresh_e, thresh_m, denoise, user_instruct="", api_key=""):
    print(img)
    if user_instruct != "" and api_key != "":
        source_prompt, target_prompt, local, mutual, replace_steps, num_inference_steps = get_params(api_key, user_instruct)
        cross_replace_steps = replace_steps
        self_replace_steps = replace_steps

    torch.manual_seed(seed)
    ratio = min(height / img.height, width / img.width)
    img = img.resize((int(img.width * ratio), int(img.height * ratio)))
    if denoise is False:
        strength = 1
    num_denoise_num = math.trunc(num_inference_steps*strength)
    num_start = num_inference_steps-num_denoise_num
    # create the CAC controller.
    local_blend = LocalBlend(thresh_e=thresh_e, thresh_m=thresh_m, save_inter=False)
    controller = AttentionRefine([source_prompt, target_prompt],[[local, mutual]],
                                 num_inference_steps,
                                 num_start,
                                 cross_replace_steps=cross_replace_steps,
                                 self_replace_steps=self_replace_steps,
                                 local_blend=local_blend
                                 )
    ptp_utils.register_attention_control(pipe, controller)

    results = pipe(prompt=target_prompt,
                   source_prompt=source_prompt,
                   positive_prompt=positive_prompt,
                   negative_prompt=negative_prompt,
                   image=img,
                   num_inference_steps=num_inference_steps,
                   eta=1,
                   strength=strength,
                   guidance_scale=guidance_t,
                   source_guidance_scale=guidance_s,
                   denoise_model=denoise,
                   callback = controller.step_callback
                   )

    return replace_nsfw_images(results)


def replace_nsfw_images(results):
    for i in range(len(results.images)):
        if results.nsfw_content_detected[i]:
            results.images[i] = Image.open("nsfw.png")
    return results.images[0]


css = """.cycle-diffusion-div div{display:inline-flex;align-items:center;gap:.8rem;font-size:1.75rem}.cycle-diffusion-div div h1{font-weight:900;margin-bottom:7px}.cycle-diffusion-div p{margin-bottom:10px;font-size:94%}.cycle-diffusion-div p a{text-decoration:underline}.tabs{margin-top:0;margin-bottom:0}#gallery{min-height:20rem}
"""
intro = """
<div style="display: flex;align-items: center;justify-content: center">
    <img src="https://sled-group.github.io/InfEdit/image_assets/InfEdit.png" width="80" style="display: inline-block">
    <h1 style="margin-left: 12px;text-align: center;margin-bottom: 7px;display: inline-block">InfEdit</h1>
    <h3 style="display: inline-block;margin-left: 10px;margin-top: 6px;font-weight: 500">Inversion-Free Image Editing
with Natural Language</h3>
</div>
"""

param_bot_prompt = """
You are a helpful assistant named InfEdit that provides input parameters to the image editing model based on user instructions. You should respond in valid json format.

User:
```
{image descrption and editing commands | example: 'The image shows an apple on the table and I want to change the apple to a banana.'}
```

After receiving this, you will need to generate the appropriate params as input to the image editing models.

Assistant:
```
{
“source_prompt”: “{a string describes the input image, it needs to includes the thing user want to change | example: 'an apple on the table'}”,
“target_prompt”: “{a string that matches the source prompt, but it needs to includes the thing user want to change | example: 'a banana on the table'}”,
“target_sub”: “{a special substring from the target prompt}”,
“mutual_sub”: “{a special mutual substring from source/target prompt}”
“attention_control”: {a number between 0 and 1}
“steps”: {a number between 8 and 50}
}
```

You need to fill in the "target_sub" and "mutual_sub" by the guideline below.

If the editing instruction is not about changing style or background:
- The "target_sub" should be a special substring from the target prompt that highlights what you want to edit, it should be as short as possible and should only be noun ("banana" instead of "a banana").
- The "mutual_sub" should be kept as an empty string.
P.S. When you want to remove something, it's always better to use "empty", "nothing" or some appropriate words to replace it. Like remove an apple on the table, you can use "an apple on the table" and "nothing on the table" as your prompts, and use "nothing" as your target_sub.
P.S. You should think carefully about what you want to modify, like "short hair" to "long hair", your target_sub should be "hair" instead of "long".
P.S. When you are adding something, the target_sub should be the thing you want to add.

If it's about style editing:
- The "target_sub" should be kept as an empty string.
- The "mutual_sub" should be kept as an empty string.

If it's about background editing:
- The "target_sub" should be kept as an empty string.
- The "mutual_sub" should be a common substring from source/target prompt, and is the main object/character (noun) in the image. It should be as short as possible and only be noun ("banana" instead of "a banana", "man" instead of "running man").

A specific case, if it's about change an object's abstract information, like pose, view or shape and want to keep the semantic feature same, like a dog to a running dog,
- The "target_sub" should be a special substring from the target prompt that highlights what you want to edit, it should be as short as possible and should only be noun ("dog" instead of "a running dog").
- The "mutual_sub" should be as same as target_sub because we want to "edit the dog but also keep the dog as same".


You need to choose a specific value of “attention_control” by the guideline below.
A larger value of “attention_control” means more consistency between the source image and the output.

- the editing is on the feature level, like color, material and so on, and want to ensure the characteristics of the original object as much as possible, you should choose a large value. (Example: for color editing, you can choose 1, and for material you can choose 0.9)
- the editing is on the object level, like edit a "cat" to a "dog", or a "horse" to a "zebra", and want to make them to be similar, you need to choose a relatively large value, we say 0.7 for example.
- the editing is changing the style but want to keep the spatial features, you need to choose a relatively large value, we say 0.7 for example.
- the editing need to change something's shape, like edit an "apple" to a "banana", a "flower" to a "knife", "short" hair to "long" hair, "round" to "square", which have very different shapes, you need to choose a relatively small value, we say 0.3 for example.
- the editing is tring to change the spatial information, like change the pose and so on, you need to choose a relatively small value, we say 0.3 for example.
- the editing should not consider the consistency with the input image, like add something new, remove something, or change the background, you can directly use 0.


You need to choose a specific value of “steps” by the guideline below.
More steps mean that the edit effect is more pronounced.
- If the editing is super easy, like changing something to something with very similar features, you can choose 8 steps.
- In most cases, you can choose 15 steps.
- For style editing and remove tasks, you can choose a larger value, like 25 steps.
- If you feel the task is extremely difficult (like some kinds of styles or removing very tiny stuffs), you can directly use 50 steps.
"""
def get_params(api_key, user_instruct):
    from openai import OpenAI
    client = OpenAI(api_key=api_key)
    print("user_instruct", user_instruct)
    response = client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=[
            {"role": "system", "content": param_bot_prompt},
            {"role": "user", "content": user_instruct}
        ],
        response_format={ "type": "json_object" },
    )
    param_dict = response.choices[0].message.content
    print("param_dict", param_dict)
    import json
    param_dict = json.loads(param_dict)
    return param_dict['source_prompt'], param_dict['target_prompt'], param_dict['target_sub'], param_dict['mutual_sub'], param_dict['attention_control'], param_dict['steps']
with gr.Blocks(css=css) as demo:
    gr.HTML(intro)
    with gr.Accordion("README", open=False):
        gr.HTML(
            """
            <p style="font-size: 0.95rem;margin: 0rem;line-height: 1.2em;margin-top:1em;display: inline-block">
                <a href="https://sled-group.github.io/InfEdit/" target="_blank">project page</a> | <a href="https://arxiv.org" target="_blank">paper</a>| <a href="https://github.com/sled-group/InfEdit/tree/website" target="_blank">handbook</a>
            </p>

            We are now hosting on a A4000 GPU with 16 GiB memory.
            """
        )
    with gr.Row():

        with gr.Column(scale=55):
            with gr.Group():

                img = gr.Image(label="Input image", height=512, type="pil")

                image_out = gr.Image(label="Output image", height=512)
                # gallery = gr.Gallery(
                #     label="Generated images", show_label=False, elem_id="gallery"
                # ).style(grid=[1], height="auto")

        with gr.Column(scale=45):

            with gr.Tab("UAC options"):
                with gr.Group():
                    with gr.Row():
                        source_prompt = gr.Textbox(label="Source prompt", placeholder="Source prompt describes the input image")
                    with gr.Row():
                        guidance_s = gr.Slider(label="Source guidance scale", value=1, minimum=1, maximum=10)
                        positive_prompt = gr.Textbox(label="Positive prompt", placeholder="")
                    with gr.Row():
                        target_prompt = gr.Textbox(label="Target prompt", placeholder="Target prompt describes the output image")
                    with gr.Row():
                        guidance_t = gr.Slider(label="Target guidance scale", value=2, minimum=1, maximum=10)
                        negative_prompt = gr.Textbox(label="Negative prompt", placeholder="")
                    with gr.Row():
                        local = gr.Textbox(label="Target blend", placeholder="")
                        thresh_e = gr.Slider(label="Target blend thresh", value=0.6, minimum=0, maximum=1)
                    with gr.Row():
                        mutual = gr.Textbox(label="Source blend", placeholder="")
                        thresh_m = gr.Slider(label="Source blend thresh", value=0.6, minimum=0, maximum=1)
                    with gr.Row():
                        cross_replace_steps = gr.Slider(label="Cross attn control schedule", value=0.7, minimum=0.0, maximum=1, step=0.01)
                        self_replace_steps = gr.Slider(label="Self attn control schedule", value=0.3, minimum=0.0, maximum=1, step=0.01)
                    with gr.Row():
                        denoise = gr.Checkbox(label='Denoising Mode', value=False)
                        strength = gr.Slider(label="Strength", value=0.7, minimum=0, maximum=1, step=0.01, visible=False)
                        denoise.change(fn=lambda value: gr.update(visible=value), inputs=denoise, outputs=strength)
                    with gr.Row():
                        generate1 = gr.Button(value="Run")

            with gr.Tab("Advanced options"):
                with gr.Group():
                    with gr.Row():
                        num_inference_steps = gr.Slider(label="Inference steps", value=15, minimum=1, maximum=50, step=1)
                        width = gr.Slider(label="Width", value=512, minimum=512, maximum=1024, step=8)
                        height = gr.Slider(label="Height", value=512, minimum=512, maximum=1024, step=8)
                    with gr.Row():
                        seed = gr.Slider(0, 2147483647, label='Seed', value=0, step=1)
                    with gr.Row():
                        generate3 = gr.Button(value="Run")

            with gr.Tab("Instruction following (+GPT4)"):
                guide_str = """Describe the image you uploaded and tell me how you want to edit it."""
                with gr.Group():
                    api_key = gr.Textbox(label="YOUR OPENAI API KEY", placeholder="sk-xxx", lines = 1, type="password")
                    user_instruct = gr.Textbox(label=guide_str, placeholder="The image shows an apple on the table and I want to change the apple to a banana.", lines = 3)
                    # source_prompt, target_prompt, local, mutual = get_params(api_key, user_instruct)
                    with gr.Row():
                        generate4 = gr.Button(value="Run")

    inputs1 = [img, source_prompt, target_prompt,
               local, mutual,
               positive_prompt, negative_prompt,
               guidance_s, guidance_t,
               num_inference_steps,
               width, height, seed, strength,
               cross_replace_steps, self_replace_steps,
               thresh_e, thresh_m, denoise]
    inputs4 =[img, source_prompt, target_prompt,
              local, mutual,
              positive_prompt, negative_prompt,
              guidance_s, guidance_t,
              num_inference_steps,
              width, height, seed, strength,
              cross_replace_steps, self_replace_steps,
              thresh_e, thresh_m, denoise, user_instruct, api_key]
    generate1.click(inference, inputs=inputs1, outputs=image_out)
    generate3.click(inference, inputs=inputs1, outputs=image_out)
    generate4.click(inference, inputs=inputs4, outputs=image_out)

    ex = gr.Examples(
        [
            ["images/corgi.jpg","corgi","cat","cat","","","",1,2,15,512,512,0,1,0.7,0.7,0.6,0.6,False],
            ["images/muffin.png","muffin","chihuahua","chihuahua","","","",1,2,15,512,512,0,1,0.65,0.6,0.4,0.7,False],
            ["images/InfEdit.jpg","an anime girl holding a pad","an anime girl holding a book","book","girl ","","",1,2,15,512,512,0,1,0.8,0.8,0.6,0.6,False],
            ["images/summer.jpg","a photo of summer scene","A photo of winter scene","","","","",1,2,15,512,512,0,1,1,1,0.6,0.7,False],
            ["images/bear.jpg","A bear sitting on the ground","A bear standing on the ground","bear","","","",1,1.5,15,512,512,0,1,0.3,0.3,0.5,0.7,False],
            ["images/james.jpg","a man playing basketball","a man playing soccer","soccer","man ","","",1,2,15,512,512,0,1,0,0,0.5,0.4,False],
            ["images/osu.jfif","A football with OSU logo","A football with Umich logo","logo","","","",1,2,15,512,512,0,1,0.5,0,0.6,0.7,False],
            ["images/groundhog.png","A anime groundhog head","A anime ferret head","head","","","",1,2,15,512,512,0,1,0.5,0.5,0.6,0.7,False],
            ["images/miku.png","A anime girl with green hair and green eyes and shirt","A anime girl with red hair and red eyes and shirt","red hair and red eyes","shirt","","",1,2,15,512,512,0,1,1,1,0.2,0.8,False],
            ["images/droplet.png","a blue droplet emoji with a smiling face with yellow dot","a red fire emoji with an angry face with yellow dot","","yellow dot","","",1,2,15,512,512,0,1,0.7,0.7,0.6,0.7,False],
            ["images/moyu.png","an emoji holding a sign and a fish","an emoji holding a sign and a shark","shark","sign","","",1,2,15,512,512,0,1,0.7,0.7,0.5,0.7,False],
            ["images/214000000000.jpg","a painting of a waterfall in the mountains","a painting of a waterfall and angels in the mountains","angels","","","",1,2,15,512,512,0,1,0,0,0.5,0.5,False],
            ["images/311000000002.jpg","a lion in a suit sitting at a table with a laptop","a lion in a suit sitting at a table with nothing","nothing","","","",1,2,15,512,512,0,1,0,0,0.5,0.5,False],
            ["images/genshin.png","anime girl, with blue logo","anime boy with golden hair named Link, from The Legend of Zelda, with legend of zelda logo","anime boy","","","",1,2,50,512,512,0,1,0.65,0.65,0.5,0.5,False],
            ["images/angry.jpg","a man with bounding boxes at the door","a man with angry birds at the door","angry birds","a man","","",1,2,15,512,512,0,1,0.3,0.1,0.45,0.4,False],
            ["images/Doom_Slayer.jpg","doom slayer from game doom","master chief from game halo","","","","",1,2,15,512,512,0,1,0.6,0.8,0.7,0.7,False],
            ["images/Elon_Musk.webp","Elon Musk in front of a car","Mark Iv iron man suit in front of a car","Mark Iv iron man suit","car","","",1,2,15,512,512,0,1,0.5,0.3,0.6,0.7,False],
            ["images/dragon.jpg","a mascot dragon","pixel art, a mascot dragon","","","","",1,2,25,512,512,0,1,0.7,0.7,0.6,0.6,False],
            ["images/frieren.jpg","a anime girl with long white hair holding a bottle","a anime girl with long white hair holding a smartphone","smartphone","","","",1,2,15,512,512,0,1,0.7,0.7,0.7,0.7,False],
            ["images/sam.png","a man with an openai logo","a man with a twitter logo","a twitter logo","a man","","",1,2,15,512,512,0,0.8,0,0,0.3,0.6,True],


        ],
        [img, source_prompt, target_prompt,
         local, mutual,
         positive_prompt, negative_prompt,
         guidance_s, guidance_t,
         num_inference_steps,
         width, height, seed, strength,
         cross_replace_steps, self_replace_steps,
         thresh_e, thresh_m, denoise],
        image_out, inference, cache_examples=True,examples_per_page=20)
    # if not is_colab:
    #     demo.queue(concurrency_count=1)

    # demo.launch(debug=False, share=False,server_name="0.0.0.0",server_port = 80)
    demo.launch(debug=False, share=False)
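
For readers who want to exercise the pipeline outside the Gradio UI, the sketch below repeats only the model-loading block near the top of app_ead_instuct.py. It is illustrative and not part of this commit; it assumes the files added here are on PYTHONPATH, the packages from requirements.txt are installed, and (optionally) a CUDA GPU is available, since CPU falls back to float32.

```python
# Minimal sketch -- mirrors the calls app_ead_instuct.py makes at import time,
# without building or launching the Gradio demo.
import torch
from diffusers import LCMScheduler
from pipeline_ead import EditPipeline  # added in this commit

model_id_or_path = "SimianLuo/LCM_Dreamshaper_v7"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Same scheduler/pipeline construction the app performs.
scheduler = LCMScheduler.from_config(model_id_or_path, subfolder="scheduler")
pipe = EditPipeline.from_pretrained(model_id_or_path, scheduler=scheduler, torch_dtype=torch_dtype)
if torch.cuda.is_available():
    pipe = pipe.to("cuda")

# The app then builds a LocalBlend + AttentionRefine controller, registers it via
# ptp_utils.register_attention_control(pipe, controller), and calls
# pipe(prompt=target_prompt, source_prompt=source_prompt, image=img, ...)
# exactly as inference() does above.
```
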
images/214000000000.jpg
ADDED
images/311000000002.jpg
ADDED
images/Doom_Slayer.jpg
ADDED
images/Elon_Musk.webp
ADDED
images/InfEdit.jpg
ADDED
images/angry.jpg
ADDED
images/bear.jpg
ADDED
images/computer.png
ADDED
images/corgi.jpg
ADDED
images/dragon.jpg
ADDED
images/droplet.png
ADDED
images/frieren.jpg
ADDED
images/genshin.png
ADDED
images/groundhog.png
ADDED
images/james.jpg
ADDED
images/miku.png
ADDED
images/moyu.png
ADDED
images/muffin.png
ADDED
images/osu.jfif
ADDED
images/sam.png
ADDED
images/summer.jpg
ADDED
nsfw.png
ADDED
pipeline_ead.py
ADDED
@@ -0,0 +1,707 @@
import inspect
from typing import Any, Callable, Dict, List, Optional, Union

import numpy as np
import PIL
import torch
from packaging import version
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer

from diffusers.configuration_utils import FrozenDict
from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
from diffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin
from diffusers.models import AutoencoderKL, UNet2DConditionModel
from diffusers.models.lora import adjust_lora_scale_text_encoder
from diffusers.schedulers import LCMScheduler
from diffusers.utils import PIL_INTERPOLATION, deprecate, logging
from diffusers.utils.torch_utils import randn_tensor
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker


logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess
def preprocess(image):
    deprecation_message = "The preprocess method is deprecated and will be removed in diffusers 1.0.0. Please use VaeImageProcessor.preprocess(...) instead"
    deprecate("preprocess", "1.0.0", deprecation_message, standard_warn=False)
    if isinstance(image, torch.Tensor):
        return image
    elif isinstance(image, PIL.Image.Image):
        image = [image]

    if isinstance(image[0], PIL.Image.Image):
        w, h = image[0].size
        w, h = (x - x % 8 for x in (w, h))  # resize to integer multiple of 8

        image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
        image = np.concatenate(image, axis=0)
        image = np.array(image).astype(np.float32) / 255.0
        image = image.transpose(0, 3, 1, 2)
        image = 2.0 * image - 1.0
        image = torch.from_numpy(image)
    elif isinstance(image[0], torch.Tensor):
        image = torch.cat(image, dim=0)
    return image


def ddcm_sampler(scheduler, x_s, x_t, timestep, e_s, e_t, x_0, noise, eta, to_next=True):
    if scheduler.num_inference_steps is None:
        raise ValueError(
            "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
        )

    if scheduler.step_index is None:
        scheduler._init_step_index(timestep)

    prev_step_index = scheduler.step_index + 1
    if prev_step_index < len(scheduler.timesteps):
        prev_timestep = scheduler.timesteps[prev_step_index]
    else:
        prev_timestep = timestep

    alpha_prod_t = scheduler.alphas_cumprod[timestep]
    alpha_prod_t_prev = (
        scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else scheduler.final_alpha_cumprod
    )
    beta_prod_t = 1 - alpha_prod_t
    beta_prod_t_prev = 1 - alpha_prod_t_prev
    variance = beta_prod_t_prev
    std_dev_t = eta * variance
    noise = std_dev_t ** (0.5) * noise

    e_c = (x_s - alpha_prod_t ** (0.5) * x_0) / (1 - alpha_prod_t) ** (0.5)

    pred_x0 = x_0 + ((x_t - x_s) - beta_prod_t ** (0.5) * (e_t - e_s)) / alpha_prod_t ** (0.5)
    eps = (e_t - e_s) + e_c
    dir_xt = (beta_prod_t_prev - std_dev_t) ** (0.5) * eps

    # Noise is not used for one-step sampling.
    if len(scheduler.timesteps) > 1:
        prev_xt = alpha_prod_t_prev ** (0.5) * pred_x0 + dir_xt + noise
        prev_xs = alpha_prod_t_prev ** (0.5) * x_0 + dir_xt + noise
    else:
        prev_xt = pred_x0
        prev_xs = x_0

    if to_next:
        scheduler._step_index += 1
    return prev_xs, prev_xt, pred_x0


class EditPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin):
    model_cpu_offload_seq = "text_encoder->unet->vae"
    _optional_components = ["safety_checker", "feature_extractor"]

    def __init__(
        self,
        vae: AutoencoderKL,
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
        unet: UNet2DConditionModel,
        scheduler: LCMScheduler,
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPImageProcessor,
        requires_safety_checker: bool = True,
    ):
        super().__init__()

        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
            deprecation_message = (
                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
                " file"
            )
            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
            new_config = dict(scheduler.config)
            new_config["steps_offset"] = 1
            scheduler._internal_dict = FrozenDict(new_config)

        if safety_checker is None and requires_safety_checker:
            logger.warning(
                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
            )

        if safety_checker is not None and feature_extractor is None:
            raise ValueError(
                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
            )
        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
            version.parse(unet.config._diffusers_version).base_version
        ) < version.parse("0.9.0.dev0")
        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
            deprecation_message = (
                "The configuration file of the unet has set the default `sample_size` to smaller than"
                " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
                " in the config might lead to incorrect results in future versions. If you have downloaded this"
                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
                " the `unet/config.json` file"
            )
            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
            new_config = dict(unet.config)
            new_config["sample_size"] = 64
            unet._internal_dict = FrozenDict(new_config)

        self.register_modules(
            vae=vae,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            unet=unet,
            scheduler=scheduler,
            safety_checker=safety_checker,
            feature_extractor=feature_extractor,
        )
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
        self.register_to_config(requires_safety_checker=requires_safety_checker)

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
    def _encode_prompt(
        self,
        prompt,
        device,
        num_images_per_prompt,
        do_classifier_free_guidance,
        negative_prompt=None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        lora_scale: Optional[float] = None,
    ):
        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)

        prompt_embeds_tuple = self.encode_prompt(
            prompt=prompt,
            device=device,
            num_images_per_prompt=num_images_per_prompt,
            do_classifier_free_guidance=do_classifier_free_guidance,
            negative_prompt=negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            lora_scale=lora_scale,
        )

        # concatenate for backwards comp
        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])

        return prompt_embeds

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
    def encode_prompt(
        self,
        prompt,
        device,
        num_images_per_prompt,
        do_classifier_free_guidance,
        negative_prompt=None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        lora_scale: Optional[float] = None,
    ):
        # set lora scale so that monkey patched LoRA
        # function of text encoder can correctly access it
        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
            self._lora_scale = lora_scale

            # dynamically adjust the LoRA scale
            adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)

        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        if prompt_embeds is None:
            # textual inversion: procecss multi-vector tokens if necessary
            if isinstance(self, TextualInversionLoaderMixin):
                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)

            text_inputs = self.tokenizer(
                prompt,
                padding="max_length",
                max_length=self.tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )
            text_input_ids = text_inputs.input_ids
            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids

            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
                text_input_ids, untruncated_ids
            ):
                removed_text = self.tokenizer.batch_decode(
                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
                )
                logger.warning(
                    "The following part of your input was truncated because CLIP can only handle sequences up to"
                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
                )

            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
                attention_mask = text_inputs.attention_mask.to(device)
            else:
                attention_mask = None

            prompt_embeds = self.text_encoder(
                text_input_ids.to(device),
                attention_mask=attention_mask,
            )
            prompt_embeds = prompt_embeds[0]

        if self.text_encoder is not None:
            prompt_embeds_dtype = self.text_encoder.dtype
        elif self.unet is not None:
            prompt_embeds_dtype = self.unet.dtype
        else:
            prompt_embeds_dtype = prompt_embeds.dtype

        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)

        bs_embed, seq_len, _ = prompt_embeds.shape
        # duplicate text embeddings for each generation per prompt, using mps friendly method
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)

        # get unconditional embeddings for classifier free guidance
        if do_classifier_free_guidance and negative_prompt_embeds is None:
            uncond_tokens: List[str]
            if negative_prompt is None:
                uncond_tokens = [""] * batch_size
            elif prompt is not None and type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                    f" {type(prompt)}."
                )
            elif isinstance(negative_prompt, str):
                uncond_tokens = [negative_prompt]
            elif batch_size != len(negative_prompt):
                raise ValueError(
                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                    " the batch size of `prompt`."
                )
            else:
                uncond_tokens = negative_prompt

            # textual inversion: procecss multi-vector tokens if necessary
            if isinstance(self, TextualInversionLoaderMixin):
                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)

            max_length = prompt_embeds.shape[1]
            uncond_input = self.tokenizer(
                uncond_tokens,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                return_tensors="pt",
            )

            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
                attention_mask = uncond_input.attention_mask.to(device)
            else:
                attention_mask = None

            negative_prompt_embeds = self.text_encoder(
                uncond_input.input_ids.to(device),
                attention_mask=attention_mask,
            )
            negative_prompt_embeds = negative_prompt_embeds[0]

        if do_classifier_free_guidance:
            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
            seq_len = negative_prompt_embeds.shape[1]

            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)

            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

        return prompt_embeds, negative_prompt_embeds

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.check_inputs
    def check_inputs(
        self, prompt, strength, callback_steps, negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None
    ):
        if strength < 0 or strength > 1:
            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")

        if (callback_steps is None) or (
            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
        ):
            raise ValueError(
                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
                f" {type(callback_steps)}."
            )

        if prompt is not None and prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
                " only forward one of the two."
            )
        elif prompt is None and prompt_embeds is None:
            raise ValueError(
                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
            )
        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

        if negative_prompt is not None and negative_prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
            )

        if prompt_embeds is not None and negative_prompt_embeds is not None:
            if prompt_embeds.shape != negative_prompt_embeds.shape:
                raise ValueError(
                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                    f" {negative_prompt_embeds.shape}."
                )

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
    def prepare_extra_step_kwargs(self, generator, eta):
        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
|
384 |
+
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
|
385 |
+
# and should be between [0, 1]
|
386 |
+
|
387 |
+
accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
|
388 |
+
extra_step_kwargs = {}
|
389 |
+
if accepts_eta:
|
390 |
+
extra_step_kwargs["eta"] = eta
|
391 |
+
|
392 |
+
# check if the scheduler accepts generator
|
393 |
+
accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
|
394 |
+
if accepts_generator:
|
395 |
+
extra_step_kwargs["generator"] = generator
|
396 |
+
return extra_step_kwargs
|
397 |
+
|
398 |
+
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
|
399 |
+
def run_safety_checker(self, image, device, dtype):
|
400 |
+
if self.safety_checker is None:
|
401 |
+
has_nsfw_concept = None
|
402 |
+
else:
|
403 |
+
if torch.is_tensor(image):
|
404 |
+
feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
|
405 |
+
else:
|
406 |
+
feature_extractor_input = self.image_processor.numpy_to_pil(image)
|
407 |
+
safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
|
408 |
+
image, has_nsfw_concept = self.safety_checker(
|
409 |
+
images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
|
410 |
+
)
|
411 |
+
return image, has_nsfw_concept
|
412 |
+
|
413 |
+
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
|
414 |
+
def decode_latents(self, latents):
|
415 |
+
deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
|
416 |
+
deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
|
417 |
+
|
418 |
+
latents = 1 / self.vae.config.scaling_factor * latents
|
419 |
+
image = self.vae.decode(latents, return_dict=False)[0]
|
420 |
+
image = (image / 2 + 0.5).clamp(0, 1)
|
421 |
+
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
|
422 |
+
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
|
423 |
+
return image
|
424 |
+
|
425 |
+
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
|
426 |
+
def get_timesteps(self, num_inference_steps, strength, device):
|
427 |
+
# get the original timestep using init_timestep
|
428 |
+
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
|
429 |
+
|
430 |
+
t_start = max(num_inference_steps - init_timestep, 0)
|
431 |
+
timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
|
432 |
+
|
433 |
+
return timesteps, num_inference_steps - t_start
|
434 |
+
|
435 |
+
def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, denoise_model, generator=None):
|
436 |
+
image = image.to(device=device, dtype=dtype)
|
437 |
+
|
438 |
+
batch_size = image.shape[0]
|
439 |
+
|
440 |
+
if image.shape[1] == 4:
|
441 |
+
init_latents = image
|
442 |
+
|
443 |
+
else:
|
444 |
+
if isinstance(generator, list) and len(generator) != batch_size:
|
445 |
+
raise ValueError(
|
446 |
+
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
|
447 |
+
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
|
448 |
+
)
|
449 |
+
|
450 |
+
if isinstance(generator, list):
|
451 |
+
init_latents = [
|
452 |
+
self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
|
453 |
+
]
|
454 |
+
init_latents = torch.cat(init_latents, dim=0)
|
455 |
+
else:
|
456 |
+
init_latents = self.vae.encode(image).latent_dist.sample(generator)
|
457 |
+
|
458 |
+
init_latents = self.vae.config.scaling_factor * init_latents
|
459 |
+
|
460 |
+
if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
|
461 |
+
# expand init_latents for batch_size
|
462 |
+
deprecation_message = (
|
463 |
+
f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
|
464 |
+
" images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
|
465 |
+
" that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
|
466 |
+
" your script to pass as many initial images as text prompts to suppress this warning."
|
467 |
+
)
|
468 |
+
deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
|
469 |
+
additional_image_per_prompt = batch_size // init_latents.shape[0]
|
470 |
+
init_latents = torch.cat([init_latents] * additional_image_per_prompt * num_images_per_prompt, dim=0)
|
471 |
+
elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
|
472 |
+
raise ValueError(
|
473 |
+
f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
|
474 |
+
)
|
475 |
+
else:
|
476 |
+
init_latents = torch.cat([init_latents] * num_images_per_prompt, dim=0)
|
477 |
+
|
478 |
+
# add noise to latents using the timestep
|
479 |
+
shape = init_latents.shape
|
480 |
+
noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
|
481 |
+
|
482 |
+
# get latents
|
483 |
+
clean_latents = init_latents
|
484 |
+
if denoise_model:
|
485 |
+
init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
|
486 |
+
latents = init_latents
|
487 |
+
else:
|
488 |
+
latents = noise
|
489 |
+
|
490 |
+
return latents, clean_latents
|
491 |
+
|
492 |
+
@torch.no_grad()
|
493 |
+
def __call__(
|
494 |
+
self,
|
495 |
+
prompt: Union[str, List[str]],
|
496 |
+
source_prompt: Union[str, List[str]],
|
497 |
+
negative_prompt: Union[str, List[str]]=None,
|
498 |
+
positive_prompt: Union[str, List[str]]=None,
|
499 |
+
image: PipelineImageInput = None,
|
500 |
+
strength: float = 0.8,
|
501 |
+
num_inference_steps: Optional[int] = 50,
|
502 |
+
original_inference_steps: Optional[int] = 50,
|
503 |
+
guidance_scale: Optional[float] = 7.5,
|
504 |
+
source_guidance_scale: Optional[float] = 1,
|
505 |
+
num_images_per_prompt: Optional[int] = 1,
|
506 |
+
eta: Optional[float] = 1.0,
|
507 |
+
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
|
508 |
+
prompt_embeds: Optional[torch.FloatTensor] = None,
|
509 |
+
output_type: Optional[str] = "pil",
|
510 |
+
return_dict: bool = True,
|
511 |
+
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
|
512 |
+
callback_steps: int = 1,
|
513 |
+
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
|
514 |
+
denoise_model: Optional[bool] = True,
|
515 |
+
):
|
516 |
+
# 1. Check inputs
|
517 |
+
self.check_inputs(prompt, strength, callback_steps)
|
518 |
+
|
519 |
+
# 2. Define call parameters
|
520 |
+
batch_size = 1 if isinstance(prompt, str) else len(prompt)
|
521 |
+
device = self._execution_device
|
522 |
+
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
|
523 |
+
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
|
524 |
+
# corresponds to doing no classifier free guidance.
|
525 |
+
do_classifier_free_guidance = guidance_scale > 1.0
|
526 |
+
|
527 |
+
# 3. Encode input prompt
|
528 |
+
text_encoder_lora_scale = (
|
529 |
+
cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
|
530 |
+
)
|
531 |
+
prompt_embeds_tuple = self.encode_prompt(
|
532 |
+
prompt,
|
533 |
+
device,
|
534 |
+
num_images_per_prompt,
|
535 |
+
do_classifier_free_guidance,
|
536 |
+
negative_prompt=negative_prompt,
|
537 |
+
prompt_embeds=prompt_embeds,
|
538 |
+
lora_scale=text_encoder_lora_scale,
|
539 |
+
)
|
540 |
+
source_prompt_embeds_tuple = self.encode_prompt(
|
541 |
+
source_prompt, device, num_images_per_prompt, do_classifier_free_guidance, positive_prompt, None
|
542 |
+
)
|
543 |
+
if prompt_embeds_tuple[1] is not None:
|
544 |
+
prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
|
545 |
+
else:
|
546 |
+
prompt_embeds = prompt_embeds_tuple[0]
|
547 |
+
if source_prompt_embeds_tuple[1] is not None:
|
548 |
+
source_prompt_embeds = torch.cat([source_prompt_embeds_tuple[1], source_prompt_embeds_tuple[0]])
|
549 |
+
else:
|
550 |
+
source_prompt_embeds = source_prompt_embeds_tuple[0]
|
551 |
+
|
552 |
+
# 4. Preprocess image
|
553 |
+
image = self.image_processor.preprocess(image)
|
554 |
+
|
555 |
+
# 5. Prepare timesteps
|
556 |
+
self.scheduler.set_timesteps(
|
557 |
+
num_inference_steps=num_inference_steps,
|
558 |
+
device=device,
|
559 |
+
original_inference_steps=original_inference_steps)
|
560 |
+
timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
|
561 |
+
latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
|
562 |
+
|
563 |
+
# 6. Prepare latent variables
|
564 |
+
latents, clean_latents = self.prepare_latents(
|
565 |
+
image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, denoise_model, generator
|
566 |
+
)
|
567 |
+
source_latents = latents
|
568 |
+
mutual_latents = latents
|
569 |
+
|
570 |
+
# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
|
571 |
+
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
|
572 |
+
generator = extra_step_kwargs.pop("generator", None)
|
573 |
+
|
574 |
+
# 8. Denoising loop
|
575 |
+
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
|
576 |
+
with self.progress_bar(total=num_inference_steps) as progress_bar:
|
577 |
+
for i, t in enumerate(timesteps):
|
578 |
+
# expand the latents if we are doing classifier free guidance
|
579 |
+
latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
|
580 |
+
source_latent_model_input = (
|
581 |
+
torch.cat([source_latents] * 2) if do_classifier_free_guidance else source_latents
|
582 |
+
)
|
583 |
+
mutual_latent_model_input = (
|
584 |
+
torch.cat([mutual_latents] * 2) if do_classifier_free_guidance else mutual_latents
|
585 |
+
)
|
586 |
+
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
|
587 |
+
source_latent_model_input = self.scheduler.scale_model_input(source_latent_model_input, t)
|
588 |
+
mutual_latent_model_input = self.scheduler.scale_model_input(mutual_latent_model_input, t)
|
589 |
+
|
590 |
+
# predict the noise residual
|
591 |
+
if do_classifier_free_guidance:
|
592 |
+
concat_latent_model_input = torch.stack(
|
593 |
+
[
|
594 |
+
source_latent_model_input[0],
|
595 |
+
latent_model_input[0],
|
596 |
+
mutual_latent_model_input[0],
|
597 |
+
source_latent_model_input[1],
|
598 |
+
latent_model_input[1],
|
599 |
+
mutual_latent_model_input[1],
|
600 |
+
],
|
601 |
+
dim=0,
|
602 |
+
)
|
603 |
+
concat_prompt_embeds = torch.stack(
|
604 |
+
[
|
605 |
+
source_prompt_embeds[0],
|
606 |
+
prompt_embeds[0],
|
607 |
+
source_prompt_embeds[0],
|
608 |
+
source_prompt_embeds[1],
|
609 |
+
prompt_embeds[1],
|
610 |
+
source_prompt_embeds[1],
|
611 |
+
],
|
612 |
+
dim=0,
|
613 |
+
)
|
614 |
+
else:
|
615 |
+
concat_latent_model_input = torch.cat(
|
616 |
+
[
|
617 |
+
source_latent_model_input,
|
618 |
+
latent_model_input,
|
619 |
+
mutual_latent_model_input,
|
620 |
+
],
|
621 |
+
dim=0,
|
622 |
+
)
|
623 |
+
concat_prompt_embeds = torch.cat(
|
624 |
+
[
|
625 |
+
source_prompt_embeds,
|
626 |
+
prompt_embeds,
|
627 |
+
source_prompt_embeds,
|
628 |
+
],
|
629 |
+
dim=0,
|
630 |
+
)
|
631 |
+
|
632 |
+
concat_noise_pred = self.unet(
|
633 |
+
concat_latent_model_input,
|
634 |
+
t,
|
635 |
+
cross_attention_kwargs=cross_attention_kwargs,
|
636 |
+
encoder_hidden_states=concat_prompt_embeds,
|
637 |
+
).sample
|
638 |
+
|
639 |
+
# perform guidance
|
640 |
+
if do_classifier_free_guidance:
|
641 |
+
(
|
642 |
+
source_noise_pred_uncond,
|
643 |
+
noise_pred_uncond,
|
644 |
+
mutual_noise_pred_uncond,
|
645 |
+
source_noise_pred_text,
|
646 |
+
noise_pred_text,
|
647 |
+
mutual_noise_pred_text
|
648 |
+
) = concat_noise_pred.chunk(6, dim=0)
|
649 |
+
|
650 |
+
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
|
651 |
+
source_noise_pred = source_noise_pred_uncond + source_guidance_scale * (
|
652 |
+
source_noise_pred_text - source_noise_pred_uncond
|
653 |
+
)
|
654 |
+
mutual_noise_pred = mutual_noise_pred_uncond + source_guidance_scale * (
|
655 |
+
mutual_noise_pred_text - mutual_noise_pred_uncond
|
656 |
+
)
|
657 |
+
|
658 |
+
else:
|
659 |
+
(source_noise_pred, noise_pred, mutual_noise_pred) = concat_noise_pred.chunk(3, dim=0)
|
660 |
+
|
661 |
+
noise = torch.randn(
|
662 |
+
latents.shape, dtype=latents.dtype, device=latents.device, generator=generator
|
663 |
+
)
|
664 |
+
|
665 |
+
_, latents, pred_x0 = ddcm_sampler(
|
666 |
+
self.scheduler, source_latents,
|
667 |
+
latents, t,
|
668 |
+
source_noise_pred, noise_pred,
|
669 |
+
clean_latents, noise=noise,
|
670 |
+
eta=eta, to_next=False,
|
671 |
+
**extra_step_kwargs
|
672 |
+
)
|
673 |
+
|
674 |
+
source_latents, mutual_latents, pred_xm = ddcm_sampler(
|
675 |
+
self.scheduler, source_latents,
|
676 |
+
mutual_latents, t,
|
677 |
+
source_noise_pred, mutual_noise_pred,
|
678 |
+
clean_latents, noise=noise,
|
679 |
+
eta=eta, **extra_step_kwargs
|
680 |
+
)
|
681 |
+
|
682 |
+
# call the callback, if provided
|
683 |
+
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
|
684 |
+
progress_bar.update()
|
685 |
+
if callback is not None and i % callback_steps == 0:
|
686 |
+
alpha_prod_t = self.scheduler.alphas_cumprod[t]
|
687 |
+
mutual_latents, latents = callback(i, t, source_latents, latents, mutual_latents, alpha_prod_t)
|
688 |
+
|
689 |
+
# 9. Post-processing
|
690 |
+
if not output_type == "latent":
|
691 |
+
image = self.vae.decode(pred_x0 / self.vae.config.scaling_factor, return_dict=False)[0]
|
692 |
+
image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
|
693 |
+
else:
|
694 |
+
image = pred_x0
|
695 |
+
has_nsfw_concept = None
|
696 |
+
|
697 |
+
if has_nsfw_concept is None:
|
698 |
+
do_denormalize = [True] * image.shape[0]
|
699 |
+
else:
|
700 |
+
do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
|
701 |
+
|
702 |
+
image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
|
703 |
+
|
704 |
+
if not return_dict:
|
705 |
+
return (image, has_nsfw_concept)
|
706 |
+
|
707 |
+
return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
|
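For orientation, here is a minimal usage sketch of the pipeline whose __call__ is added above. The class name EditPipeline, the checkpoint id, and the scheduler choice are illustrative assumptions, not confirmed by this commit; the keyword arguments mirror the signature shown in the diff.

# Hedged usage sketch; the EditPipeline name, checkpoint and scheduler are assumptions.
import torch
from PIL import Image
from diffusers import LCMScheduler
from pipeline_ead import EditPipeline  # assumed export name for the pipeline class above

pipe = EditPipeline.from_pretrained(
    "SimianLuo/LCM_Dreamshaper_v7", torch_dtype=torch.float16
).to("cuda")
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)

result = pipe(
    prompt="a photo of a corgi wearing a red hat",
    source_prompt="a photo of a corgi",
    image=Image.open("images/corgi.jpg").convert("RGB"),
    strength=1.0,
    num_inference_steps=15,
    guidance_scale=2.0,
    source_guidance_scale=1.0,
)
result.images[0].save("edited.png")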
ptp_utils.py
ADDED
@@ -0,0 +1,180 @@
1 |
+
# Copyright 2022 Google LLC
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import numpy as np
|
16 |
+
import torch
|
17 |
+
from typing import Optional, Union, Tuple, Dict
|
18 |
+
from PIL import Image
|
19 |
+
|
20 |
+
def save_images(images,dest, num_rows=1, offset_ratio=0.02):
|
21 |
+
if type(images) is list:
|
22 |
+
num_empty = len(images) % num_rows
|
23 |
+
elif images.ndim == 4:
|
24 |
+
num_empty = images.shape[0] % num_rows
|
25 |
+
else:
|
26 |
+
images = [images]
|
27 |
+
num_empty = 0
|
28 |
+
|
29 |
+
pil_img = Image.fromarray(images[-1])
|
30 |
+
pil_img.save(dest)
|
31 |
+
# display(pil_img)
|
32 |
+
|
33 |
+
|
34 |
+
def save_image(images,dest, num_rows=1, offset_ratio=0.02):
|
35 |
+
print(images.shape)
|
36 |
+
pil_img = Image.fromarray(images[0])
|
37 |
+
pil_img.save(dest)
|
38 |
+
|
39 |
+
def register_attention_control(model, controller):
|
40 |
+
class AttnProcessor():
|
41 |
+
def __init__(self,place_in_unet):
|
42 |
+
self.place_in_unet = place_in_unet
|
43 |
+
|
44 |
+
def __call__(self,
|
45 |
+
attn,
|
46 |
+
hidden_states,
|
47 |
+
encoder_hidden_states=None,
|
48 |
+
attention_mask=None,
|
49 |
+
temb=None,
|
50 |
+
scale=1.0,):
|
51 |
+
# The `Attention` class can call different attention processors / attention functions
|
52 |
+
|
53 |
+
residual = hidden_states
|
54 |
+
|
55 |
+
if attn.spatial_norm is not None:
|
56 |
+
hidden_states = attn.spatial_norm(hidden_states, temb)
|
57 |
+
|
58 |
+
input_ndim = hidden_states.ndim
|
59 |
+
|
60 |
+
if input_ndim == 4:
|
61 |
+
batch_size, channel, height, width = hidden_states.shape
|
62 |
+
hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
|
63 |
+
|
64 |
+
h = attn.heads
|
65 |
+
is_cross = encoder_hidden_states is not None
|
66 |
+
if encoder_hidden_states is None:
|
67 |
+
encoder_hidden_states = hidden_states
|
68 |
+
elif attn.norm_cross:
|
69 |
+
encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
|
70 |
+
|
71 |
+
batch_size, sequence_length, _ = (
|
72 |
+
hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
|
73 |
+
)
|
74 |
+
attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
|
75 |
+
|
76 |
+
q = attn.to_q(hidden_states)
|
77 |
+
k = attn.to_k(encoder_hidden_states)
|
78 |
+
v = attn.to_v(encoder_hidden_states)
|
79 |
+
q = attn.head_to_batch_dim(q)
|
80 |
+
k = attn.head_to_batch_dim(k)
|
81 |
+
v = attn.head_to_batch_dim(v)
|
82 |
+
|
83 |
+
if not is_cross:
|
84 |
+
q, k, v = controller.self_attn_forward(q, k, v, attn.heads)
|
85 |
+
|
86 |
+
attention_probs = attn.get_attention_scores(q, k, attention_mask)
|
87 |
+
if is_cross:
|
88 |
+
attention_probs = controller(attention_probs, is_cross, self.place_in_unet)
|
89 |
+
# else:
|
90 |
+
# out = controller.self_attn_forward(q, k, v, sim, attention_probs , is_cross, self.place_in_unet, attn.heads, scale=attn.scale)
|
91 |
+
hidden_states = torch.bmm(attention_probs, v)
|
92 |
+
hidden_states = attn.batch_to_head_dim(hidden_states)
|
93 |
+
|
94 |
+
# linear proj
|
95 |
+
hidden_states = attn.to_out[0](hidden_states, scale=scale)
|
96 |
+
# dropout
|
97 |
+
hidden_states = attn.to_out[1](hidden_states)
|
98 |
+
|
99 |
+
if input_ndim == 4:
|
100 |
+
hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
|
101 |
+
|
102 |
+
if attn.residual_connection:
|
103 |
+
hidden_states = hidden_states + residual
|
104 |
+
|
105 |
+
hidden_states = hidden_states / attn.rescale_output_factor
|
106 |
+
|
107 |
+
return hidden_states
|
108 |
+
|
109 |
+
|
110 |
+
def register_recr(net_, count, place_in_unet):
|
111 |
+
for idx, m in enumerate(net_.modules()):
|
112 |
+
# print(m.__class__.__name__)
|
113 |
+
if m.__class__.__name__ == "Attention":
|
114 |
+
count += 1
|
115 |
+
m.processor = AttnProcessor(place_in_unet)
|
116 |
+
return count
|
117 |
+
|
118 |
+
cross_att_count = 0
|
119 |
+
sub_nets = model.unet.named_children()
|
120 |
+
for net in sub_nets:
|
121 |
+
if "down" in net[0]:
|
122 |
+
cross_att_count += register_recr(net[1], 0, "down")
|
123 |
+
elif "up" in net[0]:
|
124 |
+
cross_att_count += register_recr(net[1], 0, "up")
|
125 |
+
elif "mid" in net[0]:
|
126 |
+
cross_att_count += register_recr(net[1], 0, "mid")
|
127 |
+
controller.num_att_layers = cross_att_count
|
128 |
+
|
129 |
+
|
130 |
+
def get_word_inds(text: str, word_place: int, tokenizer):
|
131 |
+
split_text = text.split(" ")
|
132 |
+
if type(word_place) is str:
|
133 |
+
word_place = [i for i, word in enumerate(split_text) if word_place == word]
|
134 |
+
elif type(word_place) is int:
|
135 |
+
word_place = [word_place]
|
136 |
+
out = []
|
137 |
+
if len(word_place) > 0:
|
138 |
+
words_encode = [tokenizer.decode([item]).strip("#") for item in tokenizer.encode(text)][1:-1]
|
139 |
+
cur_len, ptr = 0, 0
|
140 |
+
|
141 |
+
for i in range(len(words_encode)):
|
142 |
+
cur_len += len(words_encode[i])
|
143 |
+
if ptr in word_place:
|
144 |
+
out.append(i + 1)
|
145 |
+
if cur_len >= len(split_text[ptr]):
|
146 |
+
ptr += 1
|
147 |
+
cur_len = 0
|
148 |
+
return np.array(out)
|
149 |
+
|
150 |
+
|
151 |
+
def update_alpha_time_word(alpha, bounds: Union[float, Tuple[float, float]], prompt_ind: int, word_inds: Optional[torch.Tensor]=None):
|
152 |
+
if type(bounds) is float:
|
153 |
+
bounds = 0, bounds
|
154 |
+
start, end = int(bounds[0] * alpha.shape[0]), int(bounds[1] * alpha.shape[0])
|
155 |
+
if word_inds is None:
|
156 |
+
word_inds = torch.arange(alpha.shape[2])
|
157 |
+
alpha[: start, prompt_ind, word_inds] = 0
|
158 |
+
alpha[start: end, prompt_ind, word_inds] = 1
|
159 |
+
alpha[end:, prompt_ind, word_inds] = 0
|
160 |
+
return alpha
|
161 |
+
|
162 |
+
|
163 |
+
def get_time_words_attention_alpha(prompts, num_steps, cross_replace_steps: Union[float, Tuple[float, float], Dict[str, Tuple[float, float]]],
|
164 |
+
tokenizer, max_num_words=77):
|
165 |
+
if type(cross_replace_steps) is not dict:
|
166 |
+
cross_replace_steps = {"default_": cross_replace_steps}
|
167 |
+
if "default_" not in cross_replace_steps:
|
168 |
+
cross_replace_steps["default_"] = (0., 1.)
|
169 |
+
alpha_time_words = torch.zeros(num_steps + 1, len(prompts) - 1, max_num_words)
|
170 |
+
for i in range(len(prompts) - 1):
|
171 |
+
alpha_time_words = update_alpha_time_word(alpha_time_words, cross_replace_steps["default_"],
|
172 |
+
i)
|
173 |
+
for key, item in cross_replace_steps.items():
|
174 |
+
if key != "default_":
|
175 |
+
inds = [get_word_inds(prompts[i], key, tokenizer) for i in range(1, len(prompts))]
|
176 |
+
for i, ind in enumerate(inds):
|
177 |
+
if len(ind) > 0:
|
178 |
+
alpha_time_words = update_alpha_time_word(alpha_time_words, item, i, ind)
|
179 |
+
alpha_time_words = alpha_time_words.reshape(num_steps + 1, len(prompts) - 1, 1, 1, max_num_words) # time, batch, heads, pixels, words
|
180 |
+
return alpha_time_words
|
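A small, hedged example of the get_word_inds helper defined above; the CLIP tokenizer checkpoint is an assumption chosen for illustration.

# Hedged example: locate the token positions of a word inside a prompt.
from transformers import CLIPTokenizer
from ptp_utils import get_word_inds

tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")  # assumed checkpoint
inds = get_word_inds("a photo of a corgi on the beach", "corgi", tokenizer)
print(inds)  # indices (offset past the BOS token) of the sub-tokens spelling "corgi"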
requirements.txt
ADDED
@@ -0,0 +1,8 @@
accelerate
torch
torchvision
git+https://github.com/huggingface/diffusers.git
Pillow
transformers
opencv-python
openai
seq_aligner.py
ADDED
@@ -0,0 +1,314 @@
1 |
+
import torch
|
2 |
+
import copy
|
3 |
+
import torch.nn.functional as F
|
4 |
+
import numpy as np
|
5 |
+
|
6 |
+
|
7 |
+
class ScoreParams:
|
8 |
+
|
9 |
+
def __init__(self, gap, match, mismatch):
|
10 |
+
self.gap = gap
|
11 |
+
self.match = match
|
12 |
+
self.mismatch = mismatch
|
13 |
+
|
14 |
+
def mis_match_char(self, x, y):
|
15 |
+
if x != y:
|
16 |
+
return self.mismatch
|
17 |
+
else:
|
18 |
+
return self.match
|
19 |
+
|
20 |
+
|
21 |
+
def get_matrix(size_x, size_y, gap):
|
22 |
+
matrix = []
|
23 |
+
for i in range(len(size_x) + 1):
|
24 |
+
sub_matrix = []
|
25 |
+
for j in range(len(size_y) + 1):
|
26 |
+
sub_matrix.append(0)
|
27 |
+
matrix.append(sub_matrix)
|
28 |
+
for j in range(1, len(size_y) + 1):
|
29 |
+
matrix[0][j] = j*gap
|
30 |
+
for i in range(1, len(size_x) + 1):
|
31 |
+
matrix[i][0] = i*gap
|
32 |
+
return matrix
|
33 |
+
|
34 |
+
|
35 |
+
def get_matrix(size_x, size_y, gap):
|
36 |
+
matrix = np.zeros((size_x + 1, size_y + 1), dtype=np.int32)
|
37 |
+
matrix[0, 1:] = (np.arange(size_y) + 1) * gap
|
38 |
+
matrix[1:, 0] = (np.arange(size_x) + 1) * gap
|
39 |
+
return matrix
|
40 |
+
|
41 |
+
|
42 |
+
def get_traceback_matrix(size_x, size_y):
|
43 |
+
matrix = np.zeros((size_x + 1, size_y +1), dtype=np.int32)
|
44 |
+
matrix[0, 1:] = 1
|
45 |
+
matrix[1:, 0] = 2
|
46 |
+
matrix[0, 0] = 4
|
47 |
+
return matrix
|
48 |
+
|
49 |
+
|
50 |
+
def global_align(x, y, score):
|
51 |
+
matrix = get_matrix(len(x), len(y), score.gap)
|
52 |
+
trace_back = get_traceback_matrix(len(x), len(y))
|
53 |
+
for i in range(1, len(x) + 1):
|
54 |
+
for j in range(1, len(y) + 1):
|
55 |
+
left = matrix[i, j - 1] + score.gap
|
56 |
+
up = matrix[i - 1, j] + score.gap
|
57 |
+
diag = matrix[i - 1, j - 1] + score.mis_match_char(x[i - 1], y[j - 1])
|
58 |
+
matrix[i, j] = max(left, up, diag)
|
59 |
+
if matrix[i, j] == left:
|
60 |
+
trace_back[i, j] = 1
|
61 |
+
elif matrix[i, j] == up:
|
62 |
+
trace_back[i, j] = 2
|
63 |
+
else:
|
64 |
+
trace_back[i, j] = 3
|
65 |
+
return matrix, trace_back
|
66 |
+
|
67 |
+
|
68 |
+
def get_aligned_sequences(x, y, trace_back):
|
69 |
+
x_seq = []
|
70 |
+
y_seq = []
|
71 |
+
i = len(x)
|
72 |
+
j = len(y)
|
73 |
+
mapper_y_to_x = []
|
74 |
+
while i > 0 or j > 0:
|
75 |
+
if trace_back[i, j] == 3:
|
76 |
+
x_seq.append(x[i-1])
|
77 |
+
y_seq.append(y[j-1])
|
78 |
+
i = i-1
|
79 |
+
j = j-1
|
80 |
+
mapper_y_to_x.append((j, i))
|
81 |
+
elif trace_back[i][j] == 1:
|
82 |
+
x_seq.append('-')
|
83 |
+
y_seq.append(y[j-1])
|
84 |
+
j = j-1
|
85 |
+
mapper_y_to_x.append((j, -1))
|
86 |
+
elif trace_back[i][j] == 2:
|
87 |
+
x_seq.append(x[i-1])
|
88 |
+
y_seq.append('-')
|
89 |
+
i = i-1
|
90 |
+
elif trace_back[i][j] == 4:
|
91 |
+
break
|
92 |
+
mapper_y_to_x.reverse()
|
93 |
+
return x_seq, y_seq, torch.tensor(mapper_y_to_x, dtype=torch.int64)
|
94 |
+
|
95 |
+
|
96 |
+
def get_mapper(x: str, y: str, specifier, tokenizer, encoder, device, max_len=77):
|
97 |
+
local_prompt, mutual_prompt = specifier
|
98 |
+
x_seq = tokenizer.encode(x)
|
99 |
+
y_seq = tokenizer.encode(y)
|
100 |
+
e_seq = tokenizer.encode(local_prompt)
|
101 |
+
m_seq = tokenizer.encode(mutual_prompt)
|
102 |
+
score = ScoreParams(0, 1, -1)
|
103 |
+
matrix, trace_back = global_align(x_seq, y_seq, score)
|
104 |
+
mapper_base = get_aligned_sequences(x_seq, y_seq, trace_back)[-1]
|
105 |
+
alphas = torch.ones(max_len)
|
106 |
+
alphas[: mapper_base.shape[0]] = mapper_base[:, 1].ne(-1).float()
|
107 |
+
mapper = torch.zeros(max_len, dtype=torch.int64)
|
108 |
+
mapper[:mapper_base.shape[0]] = mapper_base[:, 1]
|
109 |
+
mapper[mapper_base.shape[0]:] = len(y_seq) + torch.arange(max_len - len(y_seq))
|
110 |
+
m = copy.deepcopy(alphas)
|
111 |
+
alpha_e = torch.zeros_like(alphas)
|
112 |
+
alpha_m = torch.zeros_like(alphas)
|
113 |
+
|
114 |
+
# print("mapper of")
|
115 |
+
# print("<begin> "+x+" <end>")
|
116 |
+
# print("<begin> "+y+" <end>")
|
117 |
+
# print(mapper[:len(y_seq)])
|
118 |
+
# print(alphas[:len(y_seq)])
|
119 |
+
|
120 |
+
x = tokenizer(
|
121 |
+
x,
|
122 |
+
padding="max_length",
|
123 |
+
max_length=max_len,
|
124 |
+
truncation=True,
|
125 |
+
return_tensors="pt",
|
126 |
+
).input_ids.to(device)
|
127 |
+
y = tokenizer(
|
128 |
+
y,
|
129 |
+
padding="max_length",
|
130 |
+
max_length=max_len,
|
131 |
+
truncation=True,
|
132 |
+
return_tensors="pt",
|
133 |
+
).input_ids.to(device)
|
134 |
+
|
135 |
+
x_latent = encoder(x)[0].squeeze(0)
|
136 |
+
y_latent = encoder(y)[0].squeeze(0)
|
137 |
+
i = 0
|
138 |
+
while i<len(y_seq):
|
139 |
+
start = None
|
140 |
+
if alphas[i] == 0:
|
141 |
+
start = i
|
142 |
+
while alphas[i] == 0:
|
143 |
+
i += 1
|
144 |
+
max_sim = float('-inf')
|
145 |
+
max_s = None
|
146 |
+
max_t = None
|
147 |
+
for i_target in range(start, i):
|
148 |
+
for i_source in range(mapper[start-1]+1, mapper[i]):
|
149 |
+
sim = F.cosine_similarity(x_latent[i_target], y_latent[i_source], dim=0)
|
150 |
+
if sim > max_sim:
|
151 |
+
max_sim = sim
|
152 |
+
max_s = i_source
|
153 |
+
max_t = i_target
|
154 |
+
if max_s is not None:
|
155 |
+
mapper[max_t] = max_s
|
156 |
+
alphas[max_t] = 1
|
157 |
+
for t in e_seq:
|
158 |
+
if x_seq[max_s] == t:
|
159 |
+
alpha_e[max_t] = 1
|
160 |
+
i += 1
|
161 |
+
|
162 |
+
# replace_alpha, replace_mapper = get_replace_inds(x_seq, y_seq, m_seq, m_seq)
|
163 |
+
# if replace_mapper != []:
|
164 |
+
# mapper[replace_alpha]=torch.tensor(replace_mapper,device=mapper.device)
|
165 |
+
# alpha_m[replace_alpha]=1
|
166 |
+
|
167 |
+
i = 1
|
168 |
+
j = 1
|
169 |
+
while (i < len(y_seq)-1) and (j < len(e_seq)-1):
|
170 |
+
found = True
|
171 |
+
while e_seq[j] != y_seq[i]:
|
172 |
+
i = i + 1
|
173 |
+
if i >= len(y_seq)-1:
|
174 |
+
print("blend word not found!")
|
175 |
+
found = False
|
176 |
+
break
|
177 |
+
raise ValueError("local prompt not found in target prompt")
|
178 |
+
if found:
|
179 |
+
alpha_e[i] = 1
|
180 |
+
j = j + 1
|
181 |
+
|
182 |
+
i = 1
|
183 |
+
j = 1
|
184 |
+
while (i < len(y_seq)-1) and (j < len(m_seq)-1):
|
185 |
+
while m_seq[j] != y_seq[i]:
|
186 |
+
i = i + 1
|
187 |
+
if m_seq[j] == x_seq[mapper[i]]:
|
188 |
+
alpha_m[i] = 1
|
189 |
+
j = j + 1
|
190 |
+
else:
|
191 |
+
raise ValueError("mutual prompt not found in target prompt")
|
192 |
+
|
193 |
+
# print("fixed mapper:")
|
194 |
+
# print(mapper[:len(y_seq)])
|
195 |
+
# print(alphas[:len(y_seq)])
|
196 |
+
# print(m[:len(y_seq)])
|
197 |
+
# print(alpha_e[:len(y_seq)])
|
198 |
+
# print(alpha_m[:len(y_seq)])
|
199 |
+
return mapper, alphas, m, alpha_e, alpha_m
|
200 |
+
|
201 |
+
|
202 |
+
def get_refinement_mapper(prompts, specifiers, tokenizer, encoder, device, max_len=77):
|
203 |
+
x_seq = prompts[0]
|
204 |
+
mappers, alphas, ms, alpha_objs, alpha_descs = [], [], [], [], []
|
205 |
+
for i in range(1, len(prompts)):
|
206 |
+
mapper, alpha, m, alpha_obj, alpha_desc = get_mapper(x_seq, prompts[i], specifiers[i-1], tokenizer, encoder, device, max_len)
|
207 |
+
mappers.append(mapper)
|
208 |
+
alphas.append(alpha)
|
209 |
+
ms.append(m)
|
210 |
+
alpha_objs.append(alpha_obj)
|
211 |
+
alpha_descs.append(alpha_desc)
|
212 |
+
return torch.stack(mappers), torch.stack(alphas), torch.stack(ms), torch.stack(alpha_objs), torch.stack(alpha_descs)
|
213 |
+
|
214 |
+
|
215 |
+
def get_replace_inds(x_seq,y_seq,source_replace_seq,target_replace_seq):
|
216 |
+
replace_mapper=[]
|
217 |
+
replace_alpha=[]
|
218 |
+
source_found=False
|
219 |
+
source_match,target_match=[],[]
|
220 |
+
for j in range(len(x_seq)):
|
221 |
+
found=True
|
222 |
+
for i in range(1,len(source_replace_seq)-1):
|
223 |
+
if x_seq[j+i-1]!=source_replace_seq[i]:
|
224 |
+
found=False
|
225 |
+
break
|
226 |
+
if found:
|
227 |
+
source_found=True
|
228 |
+
for i in range(1,len(source_replace_seq)-1):
|
229 |
+
source_match.append(j+i-1)
|
230 |
+
for j in range(len(y_seq)):
|
231 |
+
found=True
|
232 |
+
for i in range(1,len(target_replace_seq)-1):
|
233 |
+
if y_seq[j+i-1]!=target_replace_seq[i]:
|
234 |
+
found=False
|
235 |
+
break
|
236 |
+
if found:
|
237 |
+
for i in range(1,len(source_replace_seq)-1):
|
238 |
+
target_match.append(j+i-1)
|
239 |
+
if not source_found:
|
240 |
+
raise ValueError("replacing object not found in prompt")
|
241 |
+
if (len(source_match)!=len(target_match)):
|
242 |
+
raise ValueError(f"the replacement word number doesn't match for word {i}!")
|
243 |
+
replace_alpha+=source_match
|
244 |
+
replace_mapper+=target_match
|
245 |
+
return replace_alpha,replace_mapper
|
246 |
+
|
247 |
+
|
248 |
+
|
249 |
+
def get_word_inds(text: str, word_place: int, tokenizer):
|
250 |
+
split_text = text.split(" ")
|
251 |
+
if type(word_place) is str:
|
252 |
+
word_place = [i for i, word in enumerate(split_text) if word_place == word]
|
253 |
+
elif type(word_place) is int:
|
254 |
+
word_place = [word_place]
|
255 |
+
out = []
|
256 |
+
if len(word_place) > 0:
|
257 |
+
words_encode = [tokenizer.decode([item]).strip("#") for item in tokenizer.encode(text)][1:-1]
|
258 |
+
cur_len, ptr = 0, 0
|
259 |
+
|
260 |
+
for i in range(len(words_encode)):
|
261 |
+
cur_len += len(words_encode[i])
|
262 |
+
if ptr in word_place:
|
263 |
+
out.append(i + 1)
|
264 |
+
if cur_len >= len(split_text[ptr]):
|
265 |
+
ptr += 1
|
266 |
+
cur_len = 0
|
267 |
+
return np.array(out)
|
268 |
+
|
269 |
+
|
270 |
+
def get_replacement_mapper_(x: str, y: str, tokenizer, max_len=77):
|
271 |
+
words_x = x.split(' ')
|
272 |
+
words_y = y.split(' ')
|
273 |
+
if len(words_x) != len(words_y):
|
274 |
+
raise ValueError(f"attention replacement edit can only be applied on prompts with the same length"
|
275 |
+
f" but prompt A has {len(words_x)} words and prompt B has {len(words_y)} words.")
|
276 |
+
inds_replace = [i for i in range(len(words_y)) if words_y[i] != words_x[i]]
|
277 |
+
inds_source = [get_word_inds(x, i, tokenizer) for i in inds_replace]
|
278 |
+
inds_target = [get_word_inds(y, i, tokenizer) for i in inds_replace]
|
279 |
+
mapper = np.zeros((max_len, max_len))
|
280 |
+
i = j = 0
|
281 |
+
cur_inds = 0
|
282 |
+
while i < max_len and j < max_len:
|
283 |
+
if cur_inds < len(inds_source) and inds_source[cur_inds][0] == i:
|
284 |
+
inds_source_, inds_target_ = inds_source[cur_inds], inds_target[cur_inds]
|
285 |
+
if len(inds_source_) == len(inds_target_):
|
286 |
+
mapper[inds_source_, inds_target_] = 1
|
287 |
+
else:
|
288 |
+
ratio = 1 / len(inds_target_)
|
289 |
+
for i_t in inds_target_:
|
290 |
+
mapper[inds_source_, i_t] = ratio
|
291 |
+
cur_inds += 1
|
292 |
+
i += len(inds_source_)
|
293 |
+
j += len(inds_target_)
|
294 |
+
elif cur_inds < len(inds_source):
|
295 |
+
mapper[i, j] = 1
|
296 |
+
i += 1
|
297 |
+
j += 1
|
298 |
+
else:
|
299 |
+
mapper[j, j] = 1
|
300 |
+
i += 1
|
301 |
+
j += 1
|
302 |
+
|
303 |
+
return torch.from_numpy(mapper).float()
|
304 |
+
|
305 |
+
|
306 |
+
|
307 |
+
def get_replacement_mapper(prompts, tokenizer, max_len=77):
|
308 |
+
x_seq = prompts[0]
|
309 |
+
mappers = []
|
310 |
+
for i in range(1, len(prompts)):
|
311 |
+
mapper = get_replacement_mapper_(x_seq, prompts[i], tokenizer, max_len)
|
312 |
+
mappers.append(mapper)
|
313 |
+
return torch.stack(mappers)
|
314 |
+
|
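A hedged, self-contained sketch of the alignment helpers above on two toy token-id sequences (the ids are illustrative, not real CLIP tokens):

# Toy demonstration of the Needleman-Wunsch style alignment used for prompt mapping.
from seq_aligner import ScoreParams, global_align, get_aligned_sequences

score = ScoreParams(gap=0, match=1, mismatch=-1)
x = [101, 7, 12, 55, 102]        # source token ids (illustrative)
y = [101, 7, 12, 33, 55, 102]    # target token ids with one inserted token (illustrative)
matrix, trace_back = global_align(x, y, score)
x_aln, y_aln, mapper_y_to_x = get_aligned_sequences(x, y, trace_back)
# each row of mapper_y_to_x pairs a target position with its aligned source position (-1 for inserted tokens)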
utils.py
ADDED
@@ -0,0 +1,6 @@
def is_google_colab():
    try:
        import google.colab  # noqa: F401 (presence check only)
        return True
    except ImportError:
        return False
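Hedged usage note for the helper above:

# e.g. toggle Colab-specific behaviour (such as gradio sharing) at app startup.
from utils import is_google_colab

share_demo = is_google_colab()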