Spaces:

shi-labs
/

Smooth-Diffusion

Build error

File size: 40,469 Bytes

################################################################################
# Copyright (C) 2023 Jiayi Guo, Xingqian Xu, Manushree Vasu - All Rights Reserved                         #
################################################################################

import gradio as gr
import os
import os.path as osp
import PIL
from PIL import Image
import numpy as np
from collections import OrderedDict
from easydict import EasyDict as edict
from functools import partial

import torch
import torchvision.transforms as tvtrans
import time
import argparse
import json
import hashlib
import copy
from tqdm import tqdm

from diffusers import StableDiffusionPipeline
from diffusers import DDIMScheduler
from app_utils import auto_dropdown

from huggingface_hub import hf_hub_download
import spaces

version = "Smooth Diffusion Demo v1.0"
refresh_symbol = "\U0001f504" # 🔄
recycle_symbol = '\U0000267b' #

##############
# model_book #
##############

choices = edict()
choices.diffuser = OrderedDict([
    ['SD-v1-5' , "runwayml/stable-diffusion-v1-5"],
    ['OJ-v4' , "prompthero/openjourney-v4"],
    ['RR-v2', "SG161222/Realistic_Vision_V2.0"],
])

choices.lora = OrderedDict([
    ['empty', ""],
    ['Smooth-LoRA-v1', hf_hub_download('shi-labs/smooth-diffusion-lora', 'smooth_lora.safetensors')],
])

choices.scheduler = OrderedDict([
    ['DDIM', DDIMScheduler],
])

choices.inversion = OrderedDict([
    ['NTI', 'NTI'],
    ['DDIM w/o text', 'DDIM w/o text'],
    ['DDIM', 'DDIM'], 
])

default = edict()
default.diffuser = 'RR-v2'
default.scheduler = 'DDIM'
default.lora = 'Smooth-LoRA-v1'
default.inversion = 'NTI'
default.step = 50
default.cfg_scale = 7.5
default.framen = 24
default.fps = 16
default.nullinv_inner_step = 10
default.threshold = 0.8
default.variation = 0.8

##########
# helper #
##########

def lerp(t, v0, v1):
    if isinstance(t, float):
        return v0*(1-t) + v1*t
    elif isinstance(t, (list, np.ndarray)):
        return [v0*(1-ti) + v1*ti for ti in t]

def slerp(t, v0, v1, DOT_THRESHOLD=0.9995):
    # mostly copied from
    # https://gist.github.com/dvschultz/3af50c40df002da3b751efab1daddf2c
    v0_unit = v0 / np.linalg.norm(v0)
    v1_unit = v1 / np.linalg.norm(v1)
    dot = np.sum(v0_unit * v1_unit)
    if np.abs(dot) > DOT_THRESHOLD:
        return lerp(t, v0, v1)
    # Calculate initial angle between v0 and v1
    theta_0 = np.arccos(dot)
    sin_theta_0 = np.sin(theta_0)
    # Angle at timestep t

    if isinstance(t, float):
        tlist = [t]
    elif isinstance(t, (list, np.ndarray)):
        tlist = t

    v2_list = []

    for ti in tlist:
        theta_t = theta_0 * ti
        sin_theta_t = np.sin(theta_t)
        # Finish the slerp algorithm
        s0 = np.sin(theta_0 - theta_t) / sin_theta_0
        s1 = sin_theta_t / sin_theta_0
        v2 = s0 * v0 + s1 * v1
        v2_list.append(v2)

    if isinstance(t, float):
        return v2_list[0]
    else:
        return v2_list

def offset_resize(image, width=512, height=512, left=0, right=0, top=0, bottom=0):
   
    image = np.array(image)[:, :, :3]
    h, w, c = image.shape
    left = min(left, w-1)
    right = min(right, w - left - 1)
    top = min(top, h - left - 1)
    bottom = min(bottom, h - top - 1)
    image = image[top:h-bottom, left:w-right]
    h, w, c = image.shape
    if h < w:
        offset = (w - h) // 2
        image = image[:, offset:offset + h]
    elif w < h:
        offset = (h - w) // 2
        image = image[offset:offset + w]
    image = Image.fromarray(image).resize((width, height))
    return image

def auto_dtype_device_shape(tlist, v0, v1, func,):
    vshape = v0.shape
    assert v0.shape == v1.shape
    assert isinstance(tlist, (list, np.ndarray))
    
    if isinstance(v0, torch.Tensor):
        is_torch = True
        dtype, device = v0.dtype, v0.device
        v0 = v0.to('cpu').numpy().astype(float).flatten()
        v1 = v1.to('cpu').numpy().astype(float).flatten()
    else:
        is_torch = False
        dtype = v0.dtype
        assert isinstance(v0, np.ndarray)
        assert isinstance(v1, np.ndarray)
        v0 = v0.astype(float).flatten()
        v1 = v1.astype(float).flatten()

    r = func(tlist, v0, v1)

    if is_torch:
        r = [torch.Tensor(ri).view(*vshape).to(dtype).to(device) for ri in r]
    else:
        r = [ri.astype(dtype) for ri in r]
    return r

auto_lerp = partial(auto_dtype_device_shape, func=lerp)
auto_slerp = partial(auto_dtype_device_shape, func=slerp)

def frames2mp4(vpath, frames, fps):
    import moviepy.editor as mpy
    frames = [np.array(framei) for framei in frames]
    clip = mpy.ImageSequenceClip(frames, fps=fps)
    clip.write_videofile(vpath, fps=fps)

def negseed_to_rndseed(seed):
    if seed < 0:
        seed = np.random.randint(0, np.iinfo(np.uint32).max-100)
    return seed

def regulate_image(pilim):
    w, h = pilim.size
    w = int(round(w/64)) * 64
    h = int(round(h/64)) * 64
    return pilim.resize([w, h], resample=PIL.Image.BILINEAR)

def txt_to_emb(model, prompt):
    text_input = model.tokenizer(
        prompt,
        padding="max_length",
        max_length=model.tokenizer.model_max_length,
        truncation=True,
        return_tensors="pt",)
    text_embeddings = model.text_encoder(text_input.input_ids.to(model.device))[0]
    return text_embeddings

def hash_pilim(pilim):
    hasha = hashlib.md5(pilim.tobytes()).hexdigest()
    return hasha

def hash_cfgdict(cfgdict):
    hashb = hashlib.md5(json.dumps(cfgdict, sort_keys=True).encode('utf-8')).hexdigest()
    return hashb

def remove_earliest_file(path, max_allowance=500, remove_ratio=0.1, ext=None):
    if len(os.listdir(path)) <= max_allowance:
        return
    def get_mtime(fname):
        return osp.getmtime(osp.join(path, fname))
    if ext is None:
        flist = sorted(os.listdir(path), key=get_mtime)
    else:
        flist = [fi for fi in os.listdir(path) if fi.endswith(ext)]
        flist = sorted(flist, key=get_mtime)
    exceedn = max(len(flist)-max_allowance, 0)
    removen = int(max_allowance*remove_ratio)
    removen = max(1, removen) + exceedn
    for fi in flist[0:removen]:
        os.remove(osp.join(path, fi))

def remove_decoupled_file(path, exta='.mp4', extb='.json'):
    tag_a = [osp.splitext(fi)[0] for fi in os.listdir(path) if fi.endswith(exta)]
    tag_b = [osp.splitext(fi)[0] for fi in os.listdir(path) if fi.endswith(extb)]
    tag_a_extra = set(tag_a) - set(tag_b)
    tag_b_extra = set(tag_b) - set(tag_a)
    [os.remove(osp.join(path, tagi+exta)) for tagi in tag_a_extra]
    [os.remove(osp.join(path, tagi+extb)) for tagi in tag_b_extra]

@spaces.GPU()
@torch.no_grad()
def t2i_core(model, xt, emb, nemb, step=30, cfg_scale=7.5, return_list=False):
    from nulltxtinv_wrapper import diffusion_step, latent2image
    model.scheduler.set_timesteps(step)
    xi = xt
    emb = txt_to_emb(model, "") if emb is None else emb
    nemb = txt_to_emb(model, "") if nemb is None else nemb
    if return_list:
        xi_list = [xi.clone()]
    for i, t in enumerate(tqdm(model.scheduler.timesteps)):
        embi = emb[i] if isinstance(emb, list) else emb
        nembi = nemb[i] if isinstance(nemb, list) else nemb
        context = torch.cat([nembi, embi])
        xi = diffusion_step(model, xi, context, t, cfg_scale, low_resource=False)
        if return_list:
            xi_list.append(xi.clone())
    x0 = xi
    im = latent2image(model.vae, x0, return_type='pil')

    if return_list:
        return im, xi_list
    else:
        return im

########
# main #
########

class wrapper(object):
    
    def __init__(self, 
                 fp16=False, 
                 tag_diffuser=None, 
                 tag_lora=None,
                 tag_scheduler=None,):

        self.device = "cuda" #if torch.cuda.is_available() else "cpu"
        if fp16:
            self.torch_dtype = torch.float16
        else:
            self.torch_dtype = torch.float32
        self.load_all(tag_diffuser, tag_lora, tag_scheduler)

        self.image_latent_dim = 4
        self.batchsize = 8
        self.seed = {}

        self.cache_video_folder = "temp/video"
        self.cache_video_maxn = 500
        self.cache_image_folder = "temp/image"
        self.cache_image_maxn = 500
        self.cache_inverse_folder = "temp/inverse"
        self.cache_inverse_maxn = 500

    
    def load_all(self, tag_diffuser, tag_lora, tag_scheduler):
        self.load_diffuser_lora(tag_diffuser, tag_lora)
        self.load_scheduler(tag_scheduler)
        return tag_diffuser, tag_lora, tag_scheduler

    
    def load_diffuser_lora(self, tag_diffuser, tag_lora):
        self.net = StableDiffusionPipeline.from_pretrained(
            choices.diffuser[tag_diffuser], torch_dtype=self.torch_dtype).to(self.device)
        self.net.safety_checker = None
        if tag_lora != 'empty':
            self.net.unet.load_attn_procs(
                choices.lora[tag_lora], use_safetensors=True,)
        self.tag_diffuser = tag_diffuser
        self.tag_lora = tag_lora
        return tag_diffuser, tag_lora

    
    def load_scheduler(self, tag_scheduler):
        self.net.scheduler = choices.scheduler[tag_scheduler].from_config(self.net.scheduler.config)
        self.tag_scheduler = tag_scheduler
        return tag_scheduler

    def reset_seed(self, which='ltintp'):
        return -1

    def recycle_seed(self, which='ltintp'):
        if which not in self.seed:
            return self.reset_seed(which=which)
        else:
            return self.seed[which]

    ##########
    # helper #
    ##########
    
    def precheck_model(self, tag_diffuser, tag_lora, tag_scheduler):
        if (tag_diffuser != self.tag_diffuser) or (tag_lora != self.tag_lora):
            self.load_all(tag_diffuser, tag_lora, tag_scheduler)
        if tag_scheduler != self.tag_scheduler:
            self.load_scheduler(tag_scheduler)

    ########
    # main #
    ########

    @spaces.GPU()
    def ddiminv(self, img, cfgdict):
        txt, step, cfg_scale = cfgdict['txt'], cfgdict['step'], cfgdict['cfg_scale']
        from nulltxtinv_wrapper import NullInversion
        null_inversion_model = NullInversion(self.net, step, cfg_scale)
        with torch.no_grad():
            emb = txt_to_emb(self.net, txt)
            nemb = txt_to_emb(self.net, "")
        xt = null_inversion_model.ddim_invert(img, txt)
        data = {
            'step' : step, 'cfg_scale' : cfg_scale, 'txt' : txt,
            'diffuser' : self.tag_diffuser, 'lora' : self.tag_lora,
            'xt': xt, 'emb': emb, 'nemb': nemb,}
        return data

    @spaces.GPU()
    def nullinv_or_loadcache(self, img, cfgdict, force_reinvert=False):
        hash = hash_pilim(img) + "--" + hash_cfgdict(cfgdict)
        cdir = self.cache_inverse_folder
        cfname = osp.join(cdir, hash+'.pth')

        if osp.isfile(cfname) and (not force_reinvert):
            cache_data = torch.load(cfname)
            dtype = next(self.net.unet.parameters()).dtype
            device = next(self.net.unet.parameters()).device
            cache_data['xt'] = cache_data['xt'].to(device=device, dtype=dtype)
            cache_data['emb'] = cache_data['emb'].to(device=device, dtype=dtype)
            cache_data['nemb'] = [
                nembi.to(device=device, dtype=dtype)
                    for nembi in cache_data['nemb']]
            return cache_data
        else:
            txt, step, cfg_scale = cfgdict['txt'], cfgdict['step'], cfgdict['cfg_scale']
            inner_step = cfgdict['inner_step']
            from nulltxtinv_wrapper import NullInversion
            null_inversion_model = NullInversion(self.net, step, cfg_scale)
            with torch.no_grad():
                emb = txt_to_emb(self.net, txt)
            xt, nemb = null_inversion_model.null_invert(img, txt, num_inner_steps=inner_step)
            cache_data = {
                'step' : step, 'cfg_scale' : cfg_scale, 'txt' : txt,
                'inner_step' : inner_step,
                'diffuser' : self.tag_diffuser, 'lora' : self.tag_lora,
                'xt' : xt.to('cpu'),
                'emb' : emb.to('cpu'),
                'nemb' : [nembi.to('cpu') for nembi in nemb],}
            os.makedirs(cdir, exist_ok=True)
            remove_earliest_file(cdir, max_allowance=self.cache_inverse_maxn)
            torch.save(cache_data, cfname)
            data = {
                'step' : step, 'cfg_scale' : cfg_scale, 'txt' : txt,
                'inner_step' : inner_step,
                'diffuser' : self.tag_diffuser, 'lora' : self.tag_lora,
                'xt' : xt, 'emb' : emb, 'nemb' : nemb,}
            return data

    @spaces.GPU()
    def nullinvdual_or_loadcachedual(self, img0, img1, cfgdict, force_reinvert=False):
        hash = hash_pilim(img0) + "--" + hash_pilim(img1) + "--" + hash_cfgdict(cfgdict)
        cdir = self.cache_inverse_folder
        cfname = osp.join(cdir, hash+'.pth')

        if osp.isfile(cfname) and (not force_reinvert):
            cache_data = torch.load(cfname)
            dtype = next(self.net.unet.parameters()).dtype
            device = next(self.net.unet.parameters()).device
            cache_data['xt0'] = cache_data['xt0'].to(device=device, dtype=dtype)
            cache_data['xt1'] = cache_data['xt1'].to(device=device, dtype=dtype)
            cache_data['emb0'] = cache_data['emb0'].to(device=device, dtype=dtype)
            cache_data['emb1'] = cache_data['emb1'].to(device=device, dtype=dtype)
            cache_data['nemb'] = [
                nembi.to(device=device, dtype=dtype)
                    for nembi in cache_data['nemb']]

            cache_data_a = copy.deepcopy(cache_data)
            cache_data_a['xt'] = cache_data_a.pop('xt0')
            cache_data_a['emb'] = cache_data_a.pop('emb0')
            cache_data_a.pop('xt1'); cache_data_a.pop('emb1')

            cache_data_b = cache_data
            cache_data_b['xt'] = cache_data_b.pop('xt1')
            cache_data_b['emb'] = cache_data_b.pop('emb1')
            cache_data_b.pop('xt0'); cache_data_b.pop('emb0')

            return cache_data_a, cache_data_b
        else:
            txt0, txt1, step, cfg_scale, inner_step = \
                cfgdict['txt0'], cfgdict['txt1'], cfgdict['step'], \
                cfgdict['cfg_scale'], cfgdict['inner_step']
            
            from nulltxtinv_wrapper import NullInversion
            null_inversion_model = NullInversion(self.net, step, cfg_scale)
            with torch.no_grad():
                emb0 = txt_to_emb(self.net, txt0)
                emb1 = txt_to_emb(self.net, txt1)
            
            xt0, xt1, nemb = null_inversion_model.null_invert_dual(
                img0, img1, txt0, txt1, num_inner_steps=inner_step)
            cache_data = {
                'step' : step, 'cfg_scale' : cfg_scale, 
                'txt0' : txt0, 'txt1' : txt1,
                'inner_step' : inner_step,
                'diffuser' : self.tag_diffuser, 'lora' : self.tag_lora,
                'xt0' : xt0.to('cpu'), 'xt1' : xt1.to('cpu'),
                'emb0' : emb0.to('cpu'), 'emb1' : emb1.to('cpu'),
                'nemb' : [nembi.to('cpu') for nembi in nemb],}
            os.makedirs(cdir, exist_ok=True)
            remove_earliest_file(cdir, max_allowance=self.cache_inverse_maxn)
            torch.save(cache_data, cfname)
            data0 = {
                'step' : step, 'cfg_scale' : cfg_scale, 'txt' : txt0,
                'inner_step' : inner_step,
                'diffuser' : self.tag_diffuser, 'lora' : self.tag_lora,
                'xt' : xt0, 'emb' : emb0, 'nemb' : nemb,}
            data1 = {
                'step' : step, 'cfg_scale' : cfg_scale, 'txt' : txt1,
                'inner_step' : inner_step,
                'diffuser' : self.tag_diffuser, 'lora' : self.tag_lora,
                'xt' : xt1, 'emb' : emb1, 'nemb' : nemb,}
            return data0, data1

    @spaces.GPU()
    def image_inversion(
            self, img, txt, 
            cfg_scale, step, 
            inversion, inner_step, force_reinvert):
        from nulltxtinv_wrapper import text2image_ldm
        if inversion == 'DDIM w/o text':
            txt = ''
        if not inversion == 'NTI':
            data = self.ddiminv(img, {'txt':txt, 'step':step, 'cfg_scale':cfg_scale,})
        else:
            data = self.nullinv_or_loadcache(
                img, {'txt':txt, 'step':step,
                      'cfg_scale':cfg_scale, 'inner_step':inner_step,
                      'diffuser' : self.tag_diffuser, 'lora' : self.tag_lora,}, force_reinvert)
        
        if inversion == 'NTI':
            img_inv, _ = text2image_ldm(
                self.net, [txt], step, cfg_scale, 
                latent=data['xt'], uncond_embeddings=data['nemb'])
        else:
            img_inv, _ = text2image_ldm(
            self.net, [txt], step, cfg_scale,
            latent=data['xt'], uncond_embeddings=None)
            
        return img_inv

    @spaces.GPU()
    def image_editing(
        self, img, txt_0, txt_1,
        cfg_scale, step, thresh, 
        inversion, inner_step, force_reinvert):
        from nulltxtinv_wrapper import text2image_ldm_imedit
        if inversion == 'DDIM w/o text':
            txt_0 = ''
        if not inversion == 'NTI':
            data = self.ddiminv(img, {'txt':txt_0, 'step':step, 'cfg_scale':cfg_scale,})
            img_edited, _ = text2image_ldm_imedit(
                self.net, thresh, [txt_0], [txt_1], step, cfg_scale,
                latent=data['xt'], uncond_embeddings=None)
        else:
            data = self.nullinv_or_loadcache(
                img, {'txt':txt_0, 'step':step,
                      'cfg_scale':cfg_scale, 'inner_step':inner_step,
                      'diffuser' : self.tag_diffuser, 'lora' : self.tag_lora,}, force_reinvert)
            img_edited, _ = text2image_ldm_imedit(
                self.net, thresh, [txt_0], [txt_1], step, cfg_scale,
                latent=data['xt'], uncond_embeddings=data['nemb'])
        
        return img_edited

    @spaces.GPU()
    def general_interpolation(
            self, xset0, xset1,
            cfg_scale, step, tlist,):

        xt0, emb0, nemb0 = xset0['xt'], xset0['emb'], xset0['nemb']
        xt1, emb1, nemb1 = xset1['xt'], xset1['emb'], xset1['nemb']
        framen = len(tlist)

        xt_list = auto_slerp(tlist, xt0, xt1)
        emb_list = auto_lerp(tlist, emb0, emb1)
        
        if isinstance(nemb0, list) and isinstance(nemb1, list):
            assert len(nemb0) == len(nemb1)
            nemb_list = [auto_lerp(tlist, e0, e1) for e0, e1 in zip(nemb0, nemb1)]
            nemb_islist = True
        else:
            nemb_list = auto_lerp(tlist, nemb0, nemb1)
            nemb_islist = False

        im_list = []
        for frameidx in range(0, len(xt_list), self.batchsize):
            xt_batch = [xt_list[idx] for idx in range(frameidx, min(frameidx+self.batchsize, framen))]
            xt_batch = torch.cat(xt_batch, dim=0)
            emb_batch = [emb_list[idx] for idx in range(frameidx, min(frameidx+self.batchsize, framen))]
            emb_batch = torch.cat(emb_batch, dim=0)
            if nemb_islist:
                nemb_batch = []
                for nembi in nemb_list:
                    nembi_batch = [nembi[idx] for idx in range(frameidx, min(frameidx+self.batchsize, framen))]
                    nembi_batch = torch.cat(nembi_batch, dim=0)
                    nemb_batch.append(nembi_batch)
            else:
                nemb_batch = [nemb_list[idx] for idx in range(frameidx, min(frameidx+self.batchsize, framen))]
                nemb_batch = torch.cat(nemb_batch, dim=0)

            im = t2i_core(
                self.net, xt_batch, emb_batch, nemb_batch, step, cfg_scale)
            im_list += im if isinstance(im, list) else [im]

        return im_list
    
    @spaces.GPU()
    def run_iminvs(
            self, img, text, 
            cfg_scale, step, 
            force_resize, width, height,
            inversion, inner_step, force_reinvert,
            tag_diffuser, tag_lora, tag_scheduler, ):
        
        self.precheck_model(tag_diffuser, tag_lora, tag_scheduler)
        
        if force_resize:
            img = offset_resize(img, width, height)
        else:
            img = regulate_image(img)

        recon_output = self.image_inversion(
            img, text, cfg_scale, step, 
            inversion, inner_step, force_reinvert)

        idir = self.cache_image_folder
        os.makedirs(idir, exist_ok=True)
        remove_earliest_file(idir, max_allowance=self.cache_image_maxn)
        sname = "time{}_iminvs_{}_{}".format(
            int(time.time()), self.tag_diffuser, self.tag_lora,)
        ipath = osp.join(idir, sname+'.png')
        recon_output.save(ipath)
        
        return [recon_output]
    
    @spaces.GPU()
    def run_imedit(
            self, img, txt_0,txt_1, 
            threshold, cfg_scale, step, 
            force_resize, width, height,
            inversion, inner_step, force_reinvert,
            tag_diffuser, tag_lora, tag_scheduler, ):
        
        self.precheck_model(tag_diffuser, tag_lora, tag_scheduler)
        if force_resize:
            img = offset_resize(img, width, height)
        else:
            img = regulate_image(img)

        edited_img= self.image_editing(
            img, txt_0,txt_1, cfg_scale, step, threshold,
            inversion, inner_step, force_reinvert)

        idir = self.cache_image_folder
        os.makedirs(idir, exist_ok=True)
        remove_earliest_file(idir, max_allowance=self.cache_image_maxn)
        sname = "time{}_imedit_{}_{}".format(
            int(time.time()), self.tag_diffuser, self.tag_lora,)
        ipath = osp.join(idir, sname+'.png')
        edited_img.save(ipath)
     
        return [edited_img]

    @spaces.GPU()
    def run_imintp(
            self, 
            img0, img1, txt0, txt1,
            cfg_scale, step, 
            framen, fps, 
            force_resize, width, height,
            inversion, inner_step, force_reinvert,
            tag_diffuser, tag_lora, tag_scheduler,):
        
        self.precheck_model(tag_diffuser, tag_lora, tag_scheduler)
        if txt1 == '':
            txt1 = txt0
        if force_resize:
            img0 = offset_resize(img0, width, height)
            img1 = offset_resize(img1, width, height)
        else:
            img0 = regulate_image(img0)
            img1 = regulate_image(img1)

        if inversion == 'DDIM':
            data0 = self.ddiminv(img0, {'txt':txt0, 'step':step, 'cfg_scale':cfg_scale,})
            data1 = self.ddiminv(img1, {'txt':txt1, 'step':step, 'cfg_scale':cfg_scale,})
        elif inversion == 'DDIM w/o text':
            data0 = self.ddiminv(img0, {'txt':"", 'step':step, 'cfg_scale':cfg_scale,})
            data1 = self.ddiminv(img1, {'txt':"", 'step':step, 'cfg_scale':cfg_scale,})
        else:
            data0, data1 = self.nullinvdual_or_loadcachedual(
                img0, img1, {'txt0':txt0, 'txt1':txt1, 'step':step,
                             'cfg_scale':cfg_scale, 'inner_step':inner_step,
                             'diffuser' : self.tag_diffuser, 'lora' : self.tag_lora,}, force_reinvert)

        tlist = np.linspace(0.0, 1.0, framen)

        iminv0 = t2i_core(self.net, data0['xt'], data0['emb'], data0['nemb'], step, cfg_scale)
        iminv1 = t2i_core(self.net, data1['xt'], data1['emb'], data1['nemb'], step, cfg_scale)
        frames = self.general_interpolation(data0, data1, cfg_scale, step, tlist)

        vdir = self.cache_video_folder
        os.makedirs(vdir, exist_ok=True)
        remove_earliest_file(vdir, max_allowance=self.cache_video_maxn)
        sname = "time{}_imintp_{}_{}_framen{}_fps{}".format(
            int(time.time()), self.tag_diffuser, self.tag_lora, framen, fps)
        vpath = osp.join(vdir, sname+'.mp4')
        frames2mp4(vpath, frames, fps)
        jpath = osp.join(vdir, sname+'.json')
        cfgdict = {
            "method" : "image_interpolation",
            "txt0" : txt0, "txt1" : txt1,
            "cfg_scale" : cfg_scale, "step" : step, 
            "framen" : framen, "fps" : fps,
            "force_resize" : force_resize, "width" : width, "height" : height,
            "inversion" : inversion, "inner_step" : inner_step, 
            "force_reinvert" : force_reinvert, 
            "tag_diffuser" : tag_diffuser, "tag_lora" : tag_lora, "tag_scheduler" : tag_scheduler,}
        with open(jpath, 'w') as f:
            json.dump(cfgdict, f, indent=4)

        return frames, vpath, [iminv0, iminv1]

#################
# get examples #
#################
cache_examples = False
def get_imintp_example():
    case = [
        [
            'assets/images/interpolation/cityview1.png', 
            'assets/images/interpolation/cityview2.png', 
            'A city view',],
        [
            'assets/images/interpolation/woman1.png', 
            'assets/images/interpolation/woman2.png', 
            'A woman face',],
        [
            'assets/images/interpolation/land1.png', 
            'assets/images/interpolation/land2.png', 
            'A beautiful landscape',],
        [
            'assets/images/interpolation/dog1.png', 
            'assets/images/interpolation/dog2.png', 
            'A realistic dog',],
        [
            'assets/images/interpolation/church1.png', 
            'assets/images/interpolation/church2.png', 
            'A church',],
        [
            'assets/images/interpolation/rabbit1.png', 
            'assets/images/interpolation/rabbit2.png', 
            'A cute rabbit',],
        [
            'assets/images/interpolation/horse1.png', 
            'assets/images/interpolation/horse2.png', 
            'A robot horse',],
    ]
    return case

def get_iminvs_example():
    case = [
        [
            'assets/images/inversion/000000560011.jpg', 
            'A mouse is next to a keyboard on a desk',],
        [
            'assets/images/inversion/000000029596.jpg', 
            'A room with a couch, table set with dinnerware and a television.',],
    ]
    return case


def get_imedit_example():
    case = [
        [
            'assets/images/editing/rabbit.png', 
            'A rabbit is eating a watermelon on the table', 
            'A cat is eating a watermelon on the table', 
            0.7,],
        [
            'assets/images/editing/cake.png', 
            'A chocolate cake with cream on it', 
            'A chocolate cake with strawberries on it', 
            0.9,],
        [
            'assets/images/editing/banana.png', 
            'A banana on the table', 
            'A banana and an apple on the table', 
            0.8,],
        
    ]
    return case


#################
# sub interface #
#################


def interface_imintp(wrapper_obj):
    with gr.Row():
        with gr.Column():
            img0 = gr.Image(label="Image Input 0", type='pil',  elem_id='customized_imbox')
        with gr.Column():
            img1 = gr.Image(label="Image Input 1", type='pil', elem_id='customized_imbox')
        with gr.Column():
            video_output = gr.Video(label="Video Result", format='mp4', elem_id='customized_imbox')
    with gr.Row(): 
        with gr.Column():      
            txt0 = gr.Textbox(label='Text Input', lines=1, placeholder="Input prompt...", )
        with gr.Column(): 
            with gr.Row():
                inversion = auto_dropdown('Inversion', choices.inversion, default.inversion)
                inner_step = gr.Slider(label="Inner Step (NTI)", value=default.nullinv_inner_step, minimum=1, maximum=10, step=1)
                force_reinvert = gr.Checkbox(label="Force ReInvert (NTI)", value=False)
                    

    with gr.Row():
        with gr.Column(): 
            with gr.Row():
                framen = gr.Slider(label="Frame Number", minimum=8, maximum=default.framen, value=default.framen, step=1)
                fps = gr.Slider(label="Video FPS", minimum=4, maximum=default.fps, value=default.fps, step=4)
            with gr.Row():
                button_run = gr.Button("Run") 
            

        with gr.Column():
            with gr.Accordion('Frame Results', open=False):
                frame_output = gr.Gallery(label="Frames", elem_id='customized_imbox')
            with gr.Accordion("Inversion Results", open=False):
                inv_output = gr.Gallery(label="Inversion Results", elem_id='customized_imbox')
            with gr.Accordion('Advanced Settings', open=False):
                with gr.Row():
                    tag_diffuser = auto_dropdown('Diffuser', choices.diffuser, default.diffuser)
                    tag_lora = auto_dropdown('Use LoRA', choices.lora, default.lora)
                    tag_scheduler = auto_dropdown('Scheduler', choices.scheduler, default.scheduler)
                with gr.Row():
                    cfg_scale = gr.Number(label="Scale", minimum=1, maximum=10, value=default.cfg_scale, step=0.5)
                    step = gr.Number(default.step, label="Step", precision=0)
                with gr.Row():
                    force_resize = gr.Checkbox(label="Force Resize", value=True)
                    inp_width = gr.Slider(label="Width", minimum=256, maximum=1024, value=512, step=64)
                    inp_height = gr.Slider(label="Height", minimum=256, maximum=1024, value=512, step=64)
                with gr.Row():
                    txt1 = gr.Textbox(label='Optional Different Text Input for Image Input 1', lines=1, placeholder="Input prompt...", )
            

    tag_diffuser.change(
        wrapper_obj.load_all,
        inputs = [tag_diffuser, tag_lora, tag_scheduler],
        outputs = [tag_diffuser, tag_lora, tag_scheduler],)

    tag_lora.change(
        wrapper_obj.load_all,
        inputs = [tag_diffuser, tag_lora, tag_scheduler],
        outputs = [tag_diffuser, tag_lora, tag_scheduler],)

    tag_scheduler.change(
        wrapper_obj.load_scheduler,
        inputs = [tag_scheduler],
        outputs = [tag_scheduler],)

    button_run.click(
        wrapper_obj.run_imintp,
        inputs=[img0, img1, txt0, txt1,
                cfg_scale, step, 
                framen, fps, 
                force_resize, inp_width, inp_height,
                inversion, inner_step, force_reinvert,
                tag_diffuser, tag_lora, tag_scheduler,],
        outputs=[frame_output, video_output, inv_output])

    gr.Examples(
        label='Examples', 
        examples=get_imintp_example(), 
        fn=wrapper_obj.run_imintp,
        inputs=[img0, img1, txt0,],
        outputs=[frame_output, video_output, inv_output],
        cache_examples=cache_examples,)

def interface_iminvs(wrapper_obj):
    with gr.Row():
        image_input = gr.Image(label="Image input", type='pil', elem_id='customized_imbox')
        recon_output = gr.Gallery(label="Reconstruction output", elem_id='customized_imbox')
    with gr.Row():
        with gr.Column():
            prompt = gr.Textbox(label='Text Input', lines=1, placeholder="Input prompt...", )
            with gr.Row():
                button_run = gr.Button("Run")
            
            
        with gr.Column():
            with gr.Row():
                inversion = auto_dropdown('Inversion', choices.inversion, default.inversion)
                inner_step = gr.Slider(label="Inner Step (NTI)", value=default.nullinv_inner_step, minimum=1, maximum=10, step=1)
                force_reinvert = gr.Checkbox(label="Force ReInvert (NTI)", value=False)
            with gr.Accordion('Advanced Settings', open=False):
                with gr.Row():
                    tag_diffuser = auto_dropdown('Diffuser', choices.diffuser, default.diffuser)
                    tag_lora = auto_dropdown('Use LoRA', choices.lora, default.lora)
                    tag_scheduler = auto_dropdown('Scheduler', choices.scheduler, default.scheduler)
                with gr.Row():
                    cfg_scale = gr.Number(label="Scale", minimum=1, maximum=10, value=default.cfg_scale, step=0.5)
                    step = gr.Number(default.step, label="Step", precision=0)
                with gr.Row():
                    force_resize = gr.Checkbox(label="Force Resize", value=True)
                    inp_width = gr.Slider(label="Width", minimum=256, maximum=1024, value=512, step=64)
                    inp_height = gr.Slider(label="Height", minimum=256, maximum=1024, value=512, step=64)
            

    tag_diffuser.change(
        wrapper_obj.load_all,
        inputs = [tag_diffuser, tag_lora, tag_scheduler],
        outputs = [tag_diffuser, tag_lora, tag_scheduler],)

    tag_lora.change(
        wrapper_obj.load_all,
        inputs = [tag_diffuser, tag_lora, tag_scheduler],
        outputs = [tag_diffuser, tag_lora, tag_scheduler],)

    tag_scheduler.change(
        wrapper_obj.load_scheduler,
        inputs = [tag_scheduler],
        outputs = [tag_scheduler],)

    button_run.click(
        wrapper_obj.run_iminvs,
        inputs=[image_input, prompt,  
                cfg_scale, step, 
                force_resize, inp_width, inp_height,
                inversion, inner_step, force_reinvert, 
                tag_diffuser, tag_lora, tag_scheduler,],
        outputs=[recon_output])
    
    gr.Examples(
        label='Examples', 
        examples=get_iminvs_example(), 
        fn=wrapper_obj.run_iminvs,
        inputs=[image_input, prompt,],
        outputs=[recon_output],
        cache_examples=cache_examples,)


def interface_imedit(wrapper_obj):
    with gr.Row():
        image_input = gr.Image(label="Image input", type='pil', elem_id='customized_imbox')
        edited_output = gr.Gallery(label="Edited output", elem_id='customized_imbox')
    with gr.Row():
        with gr.Column():
            prompt_0 = gr.Textbox(label='Source Text', lines=1, placeholder="Source prompt...", )
            prompt_1 = gr.Textbox(label='Target Text', lines=1, placeholder="Target prompt...", )
            with gr.Row():
                button_run = gr.Button("Run")
            
        with gr.Column():
            with gr.Row():
                inversion = auto_dropdown('Inversion', choices.inversion, default.inversion)
                inner_step = gr.Slider(label="Inner Step (NTI)", value=default.nullinv_inner_step, minimum=1, maximum=10, step=1)
                force_reinvert = gr.Checkbox(label="Force ReInvert (NTI)", value=False)
                threshold = gr.Slider(label="Threshold", minimum=0, maximum=1, value=default.threshold, step=0.1)
            with gr.Accordion('Advanced Settings', open=False):
                with gr.Row():
                    tag_diffuser = auto_dropdown('Diffuser', choices.diffuser, default.diffuser)
                    tag_lora = auto_dropdown('Use LoRA', choices.lora, default.lora)
                    tag_scheduler = auto_dropdown('Scheduler', choices.scheduler, default.scheduler)
                with gr.Row():
                    cfg_scale = gr.Number(label="Scale", minimum=1, maximum=10, value=default.cfg_scale, step=0.5)
                    step = gr.Number(default.step, label="Step", precision=0)
                with gr.Row():
                    force_resize = gr.Checkbox(label="Force Resize", value=True)
                    inp_width = gr.Slider(label="Width", minimum=256, maximum=1024, value=512, step=64)
                    inp_height = gr.Slider(label="Height", minimum=256, maximum=1024, value=512, step=64)
            

    tag_diffuser.change(
        wrapper_obj.load_all,
        inputs = [tag_diffuser, tag_lora, tag_scheduler],
        outputs = [tag_diffuser, tag_lora, tag_scheduler],)

    tag_lora.change(
        wrapper_obj.load_all,
        inputs = [tag_diffuser, tag_lora, tag_scheduler],
        outputs = [tag_diffuser, tag_lora, tag_scheduler],)

    tag_scheduler.change(
        wrapper_obj.load_scheduler,
        inputs = [tag_scheduler],
        outputs = [tag_scheduler],)

    button_run.click(
        wrapper_obj.run_imedit,
        inputs=[image_input, prompt_0, prompt_1, 
                threshold, cfg_scale, step, 
                force_resize, inp_width, inp_height,
                inversion, inner_step, force_reinvert, 
                tag_diffuser, tag_lora, tag_scheduler,],
        outputs=[edited_output])
    
    gr.Examples(
        label='Examples', 
        examples=get_imedit_example(), 
        fn=wrapper_obj.run_imedit,
        inputs=[image_input, prompt_0, prompt_1, threshold,],
        outputs=[edited_output],
        cache_examples=cache_examples,)
        

#############
# Interface #
#############

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', '--port', type=int, default=None)
    args = parser.parse_args()
    from app_utils import css_empty, css_version_4_11_0
    # css = css_empty
    css = css_version_4_11_0

    wrapper_obj = wrapper(
        fp16=False, 
        tag_diffuser=default.diffuser,
        tag_lora=default.lora,
        tag_scheduler=default.scheduler)

    if True:
        with gr.Blocks(css=css) as demo:
            gr.HTML(
                """
                <div style="text-align: center; max-width: 1200px; margin: 20px auto;">
                <h1 style="font-weight: 900; font-size: 3rem; margin: 0rem">
                    {}
                </h1>
                <h2 style="font-weight: 450; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
                <b>Smooth Diffusion</b> is a new category of diffusion models that is simultaneously high-performing and smooth. <br>
                Our method formally introduces latent space smoothness to diffusion models like Stable Diffusion. This smoothness dramatically aids in: 1) improving the continuity of transitions in image interpolation, 2) reducing approximation errors in image inversion, and 3) better preserving unedited contents in image editing.
                </h2>
                <h3 style="font-weight: 450; font-size: 1rem; margin: 0rem">
                <a href="https://www.jiayiguo.net/" target="_blank">Jiayi Guo</a>, <a href="https://www.linkedin.com/in/xingqian-xu-97b46526/" target="_blank">Xingqian Xu</a>, 
                <a href="https://scholar.google.com/citations?user=oM9rnYQAAAAJ&hl=en" target="_blank">Yifan Pu</a>, <a href="https://scholar.google.com/citations?user=Yibz_asAAAAJ&hl=en" target="_blank">Zanlin Ni</a>, 
                <a href="https://scholar.google.com/citations?user=-hwGMHcAAAAJ&hl=en" target="_blank">Chaofei Wang</a>, <a href="https://in.linkedin.com/in/v-manushree" target="_blank">Manushree Vasu</a>, 
                <a href="https://www.au.tsinghua.edu.cn/info/1103/1553.htm" target="_blank">Shiji Song</a>, <a href="https://www.gaohuang.net/" target="_blank">Gao Huang</a> 
                and <a href="https://www.humphreyshi.com/home">Humphrey Shi</a> 
                [<a href="https://arxiv.org/abs/2312.04410" style="color:blue;">arXiv</a>] 
                [<a href="https://github.com/SHI-Labs/Smooth-Diffusion" style="color:blue;">GitHub</a>]
                </h3>
                </div>
                """.format(version))

            with gr.Tab('Image Interpolation'):
                interface_imintp(wrapper_obj)
            with gr.Tab('Image Inversion'):
                interface_iminvs(wrapper_obj)
            with gr.Tab('Image Editing'):
                interface_imedit(wrapper_obj)

            gr.Markdown(r"""
            If you find our work helpful, please **star 🌟** the <a href='https://github.com/SHI-Labs/Smooth-Diffusion' target='_blank'>Github Repo</a>. Thanks for your support! 
            [![GitHub Stars](https://img.shields.io/github/stars/SHI-Labs/Smooth-Diffusion?style=social)](https://github.com/SHI-Labs/Smooth-Diffusion)
            ---
            📑 **Citation**
            <br>
            If our work is useful for your research, please consider citing:
            ```bibtex
            @InProceedings{guo2024smooth,
              title={Smooth Diffusion: Crafting Smooth Latent Spaces in Diffusion Models},
              author={Jiayi Guo and Xingqian Xu and Yifan Pu and Zanlin Ni and Chaofei Wang and Manushree Vasu and Shiji Song and Gao Huang and Humphrey Shi},
              booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
              year={2024}
            }
            ```
            """)

        demo.queue()
        demo.launch()