Commit dfec228 committed by tubui: Duplicate from tubui/test
Files changed:
- .gitattributes +34 -0
- Dockerfile +10 -0
- Embed_Secret.py +264 -0
- README.md +12 -0
- cldm/ae.py +727 -0
- cldm/cldm.py +517 -0
- cldm/diffsteg.py +782 -0
- cldm/hack.py +113 -0
- cldm/logger.py +149 -0
- cldm/loss.py +78 -0
- cldm/loss_weight_scheduler.py +17 -0
- cldm/model.py +28 -0
- cldm/plms.py +1481 -0
- cldm/tmp.py +340 -0
- cldm/transformations.py +127 -0
- cldm/transformations2.py +415 -0
- cldm/utils.py +539 -0
- flae/models.py +325 -0
- flae/munit.py +576 -0
- flae/unet.py +123 -0
- ldm/modules/ema.py +80 -0
- ldm/util.py +197 -0
- pages/Extract_Secret.py +108 -0
- tools/__init__.py +3 -0
- tools/augment_imagenetc.py +155 -0
- tools/base_lmdb.py +588 -0
- tools/ecc.py +281 -0
- tools/eval_metrics.py +130 -0
- tools/fid.py +672 -0
- tools/fid_lmdb.py +683 -0
- tools/gradcam.py +152 -0
- tools/helpers.py +416 -0
- tools/hparams.py +743 -0
- tools/image_dataset.py +184 -0
- tools/image_dataset_generic.py +157 -0
- tools/image_tools.py +164 -0
- tools/imgcap_dataset.py +163 -0
- tools/sifid.py +246 -0
- tools/slack_bot.py +157 -0
.gitattributes
ADDED
@@ -0,0 +1,34 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
Dockerfile
ADDED
@@ -0,0 +1,10 @@
FROM tuvbui/torchcpu:torch111
ADD cldm ./cldm
ADD flae ./flae
ADD ldm ./ldm
ADD tools ./tools
ADD pages ./pages
ADD Embed_Secret.py .

EXPOSE 7860
CMD streamlit run Embed_Secret.py --server.enableXsrfProtection=false --server.port 7860 -- --weight https://kahlan.cvssp.org/data/Flickr25K/tubui/stega/unet100b_croprs/epoch=000070-step=000219999.ckpt --config https://kahlan.cvssp.org/data/Flickr25K/tubui/stega/unet100b_croprs/-project.yaml
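Note on the CMD: the flags before the bare `--` separator (`--server.enableXsrfProtection=false --server.port 7860`) are consumed by Streamlit itself, while everything after `--` is forwarded to Embed_Secret.py and parsed by its own argparse parser (see parse_st_args in that file). Because --weight and --config are URLs, the app downloads the checkpoint and YAML config when it starts rather than baking them into the image.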
Embed_Secret.py
ADDED
@@ -0,0 +1,264 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
streamlit app demo
how to run:
streamlit run app.py --server.port 8501

@author: Tu Bui @surrey.ac.uk
"""
import os, sys, torch
import argparse
from pathlib import Path
import numpy as np
import pickle
import pytorch_lightning as pl
from torchvision import transforms
import argparse
from ldm.util import instantiate_from_config
from omegaconf import OmegaConf
from PIL import Image
from tools.augment_imagenetc import RandomImagenetC
from io import BytesIO
from tools.helpers import welcome_message
from tools.ecc import BCH, RSC

import streamlit as st
from streamlit.source_util import (
    page_icon_and_name,
    calc_md5,
    get_pages,
    _on_pages_changed
)

model_names = ['UNet']


def delete_page(main_script_path_str, page_name):

    current_pages = get_pages(main_script_path_str)

    for key, value in current_pages.items():
        print(value['page_name'])
        if value['page_name'] == page_name:
            del current_pages[key]
            break
        else:
            pass
    _on_pages_changed.send()


def add_page(main_script_path_str, page_name):

    pages = get_pages(main_script_path_str)
    main_script_path = Path(main_script_path_str)
    pages_dir = main_script_path.parent / "pages"
    # st.write(list(pages_dir.glob("*.py"))+list(main_script_path.parent.glob("*.py")))
    script_path = [f for f in list(pages_dir.glob("*.py"))+list(main_script_path.parent.glob("*.py")) if f.name.find(page_name) != -1][0]
    script_path_str = str(script_path.resolve())
    pi, pn = page_icon_and_name(script_path)
    psh = calc_md5(script_path_str)
    pages[psh] = {
        "page_script_hash": psh,
        "page_name": pn,
        "icon": pi,
        "script_path": script_path_str,
    }
    _on_pages_changed.send()


def unormalize(x):
    # convert x in range [-1, 1], (B,C,H,W), tensor to [0, 255], uint8, numpy, (B,H,W,C)
    x = torch.clamp((x + 1) * 127.5, 0, 255).permute(0, 2, 3, 1).cpu().numpy().astype(np.uint8)
    return x


def to_bytes(x, mime):
    x = Image.fromarray(x)
    buf = BytesIO()
    f = "JPEG" if mime == 'image/jpeg' else "PNG"
    x.save(buf, format=f)
    byte_im = buf.getvalue()
    return byte_im


def load_UNet(args):
    print('args: ', args)
    # # crop safe model
    # config_file = '/mnt/fast/nobackup/scratch4weeks/tb0035/projects/diffsteg/FLAE/simple_tform2/configs/-project.yaml'
    # weight_file = '/mnt/fast/nobackup/scratch4weeks/tb0035/projects/diffsteg/FLAE/simple_tform2/checkpoints/epoch=000060-step=000189999.ckpt'

    # # resized crop safe model
    # config_file = '/mnt/fast/nobackup/scratch4weeks/tb0035/projects/diffsteg/FLAE/simple_t2_croprs/configs/-project.yaml'
    # weight_file = '/mnt/fast/nobackup/scratch4weeks/tb0035/projects/diffsteg/FLAE/simple_t2_croprs/checkpoints/epoch=000070-step=000219999.ckpt'

    config_file = args.config
    weight_file = args.weight
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    if weight_file.startswith('http'):  # download from url
        weight_dir = Path('./weights')
        weight_dir.mkdir(exist_ok=True)
        weight_path = weight_dir / weight_file.split('/')[-1]
        config_path = weight_dir / config_file.split('/')[-1]
        if not weight_path.exists():
            import wget
            print(f'Downloading {weight_file}...')
            with st.spinner("Downloading model... this may take awhile!"):
                wget.download(weight_file, str(weight_path))
                wget.download(config_file, str(config_path))
        weight_file = str(weight_path)
        config_file = str(config_path)

    config = OmegaConf.load(config_file).model
    secret_len = config.params.secret_len
    print(f'Secret length: {secret_len}')
    model = instantiate_from_config(config)
    state_dict = torch.load(weight_file, map_location=torch.device('cpu'))
    if 'global_step' in state_dict:
        print(f'Global step: {state_dict["global_step"]}, epoch: {state_dict["epoch"]}')

    if 'state_dict' in state_dict:
        state_dict = state_dict['state_dict']
    misses, ignores = model.load_state_dict(state_dict, strict=False)
    print(f'Missed keys: {misses}\nIgnore keys: {ignores}')
    model = model.to(device)
    model.eval()
    return model, secret_len


def embed_secret(model_name, model, cover, tform, secret):
    if model_name == 'UNet':
        w, h = cover.size
        with torch.no_grad():
            im = tform(cover).unsqueeze(0).to(model.device)  # 1, 3, 256, 256
            stego, _ = model(im, secret)  # 1, 3, 256, 256
            res = (stego.clamp(-1, 1) - im)  # (1,3,256,256) residual
            res = torch.nn.functional.interpolate(res, (h, w), mode='bilinear')
            res = res.permute(0, 2, 3, 1).cpu().numpy()  # (1,256,256,3)
            stego_uint8 = np.clip(res[0] + np.array(cover)/127.5-1., -1, 1)*127.5+127.5  # (256, 256, 3), ndarray, uint8
            stego_uint8 = stego_uint8.astype(np.uint8)
    else:
        raise NotImplementedError
    return stego_uint8


def identity(x):
    return x


def decode_secret(model_name, model, im, tform):
    if model_name in ['RoSteALS', 'UNet']:
        with torch.no_grad():
            im = tform(im).unsqueeze(0).to(model.device)  # 1, 3, 256, 256
            secret_pred = (model.decoder(im) > 0).cpu().numpy()  # 1, 100
    else:
        raise NotImplementedError
    return secret_pred


@st.cache_resource
def load_model(model_name, _args):
    if model_name == 'UNet':
        tform_emb = transforms.Compose([
            transforms.Resize((256, 256)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
        ])
        tform_det = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
        ])
        model, secret_len = load_UNet(_args)
    else:
        raise NotImplementedError
    return model, tform_emb, tform_det, secret_len


@st.cache_resource
def load_ecc(ecc_name, secret_len):
    if ecc_name == 'BCH':
        if secret_len == 160:
            ecc = BCH(285, 10, secret_len, verbose=True)
        elif secret_len == 100:
            ecc = BCH(137, 5, payload_len=secret_len, verbose=True)
    elif ecc_name == 'RSC':
        ecc = RSC(data_bytes=16, ecc_bytes=4, verbose=True)
    return ecc


class Resize(object):
    def __init__(self, size=None) -> None:
        self.size = size

    def __call__(self, x, size=None):
        if isinstance(x, np.ndarray):
            x = Image.fromarray(x)
        new_size = size if size is not None else self.size
        if min(x.size) > min(new_size):  # downsample
            x = x.resize(new_size, Image.LANCZOS)
        else:  # upsample
            x = x.resize(new_size, Image.BILINEAR)
        x = np.array(x)
        return x


def parse_st_args():
    # usage: streamlit run app.py -- --arg1 val1 --arg2 val2
    parser = argparse.ArgumentParser()
    parser.add_argument('--weight', default='/mnt/fast/nobackup/scratch4weeks/tb0035/projects/diffsteg/FLAE/simple_t2_croprs/checkpoints/epoch=000070-step=000219999.ckpt')
    parser.add_argument('--config', default='/mnt/fast/nobackup/scratch4weeks/tb0035/projects/diffsteg/FLAE/simple_t2_croprs/configs/-project.yaml')
    # parser.add_argument('--cpu', action='store_true')
    args = parser.parse_args()
    return args


def app(args):
    # delete_page('Embed_Secret', 'Extract_Secret')
    st.title('Watermarking Demo')
    # setup model
    model_name = st.selectbox("Choose the model", model_names)
    model, tform_emb, tform_det, secret_len = load_model(model_name, args)
    display_width = 300
    # ecc
    ecc = load_ecc('BCH', secret_len)

    # setup st
    st.subheader("Input")
    image_file = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])
    if image_file is not None:
        print('Image: ', image_file.name)
        ext = image_file.name.split('.')[-1]
        im = Image.open(image_file).convert('RGB')
        size0 = im.size
        st.image(im, width=display_width)
    secret_text = st.text_input(f'Input the secret (max {ecc.data_len} chars)', 'A secret')
    assert len(secret_text) <= ecc.data_len

    # embed
    st.subheader("Embed results")
    status = st.empty()
    prep = transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.CenterCrop((224, 224))
    ])
    if image_file is not None and secret_text is not None:
        secret = ecc.encode_text([secret_text])  # (1, len)
        secret = torch.from_numpy(secret).float().to(model.device)
        # im = tform(im).unsqueeze(0).cuda()  # (1,3,H,W)
        stego = embed_secret(model_name, model, im, tform_emb, secret)
        st.image(stego, width=display_width)

        # download button
        mime = 'image/jpeg' if ext == 'jpg' else f'image/{ext}'
        stego_bytes = to_bytes(stego, mime)
        st.download_button(label='Download image', data=stego_bytes, file_name=f'stego.{ext}', mime=mime)

        # verify secret
        stego_processed = prep(Image.fromarray(stego))
        secret_pred = decode_secret(model_name, model, stego_processed, tform_det)
        bit_acc = (secret_pred == secret.cpu().numpy()).mean()
        secret_pred = ecc.decode_text(secret_pred)[0]
        status.markdown('**Secret recovery check:** ' + secret_pred, unsafe_allow_html=True)
        status.markdown('**Bit accuracy:** ' + str(bit_acc), unsafe_allow_html=True)


if __name__ == '__main__':
    args = parse_st_args()
    app(args)
README.md
ADDED
@@ -0,0 +1,12 @@
---
title: Test
emoji: 🐠
colorFrom: indigo
colorTo: blue
sdk: docker
pinned: false
license: cc-by-nc-sa-4.0
duplicated_from: tubui/test
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
cldm/ae.py
ADDED
@@ -0,0 +1,727 @@
import numpy as np
import einops
import torch
import torch as th
import torch.nn as nn
from torch.nn import functional as thf
import pytorch_lightning as pl
import torchvision
from copy import deepcopy
from ldm.modules.diffusionmodules.util import (
    conv_nd,
    linear,
    zero_module,
    timestep_embedding,
)
from contextlib import contextmanager, nullcontext
from einops import rearrange, repeat
from torchvision.utils import make_grid
from ldm.modules.attention import SpatialTransformer
from ldm.modules.diffusionmodules.openaimodel import UNetModel, TimestepEmbedSequential, ResBlock, Downsample, AttentionBlock
from ldm.models.diffusion.ddpm import LatentDiffusion
from ldm.util import log_txt_as_img, exists, instantiate_from_config, default
from ldm.models.diffusion.ddim import DDIMSampler
from ldm.modules.ema import LitEma
from ldm.modules.distributions.distributions import normal_kl, DiagonalGaussianDistribution
from ldm.modules.diffusionmodules.model import Encoder
import lpips
import kornia
from kornia import color


def disabled_train(self, mode=True):
    """Overwrite model.train with this function to make sure train/eval mode
    does not change anymore."""
    return self


class View(nn.Module):
    def __init__(self, *shape):
        super().__init__()
        self.shape = shape

    def forward(self, x):
        return x.view(*self.shape)


class SecretEncoder3(nn.Module):
    def __init__(self, secret_len, base_res=16, resolution=64) -> None:
        super().__init__()
        log_resolution = int(np.log2(resolution))
        log_base = int(np.log2(base_res))
        self.secret_len = secret_len
        self.secret_scaler = nn.Sequential(
            nn.Linear(secret_len, base_res*base_res*3),
            nn.SiLU(),
            View(-1, 3, base_res, base_res),
            nn.Upsample(scale_factor=(2**(log_resolution-log_base), 2**(log_resolution-log_base))),  # chx16x16 -> chx256x256
            zero_module(conv_nd(2, 3, 3, 3, padding=1))
        )  # secret len -> ch x res x res

    def copy_encoder_weight(self, ae_model):
        # misses, ignores = self.load_state_dict(ae_state_dict, strict=False)
        return None

    def encode(self, x):
        x = self.secret_scaler(x)
        return x

    def forward(self, x, c):
        # x: [B, C, H, W], c: [B, secret_len]
        c = self.encode(c)
        return c, None


class SecretEncoder4(nn.Module):
    """same as SecretEncoder3 but with ch as input"""
    def __init__(self, secret_len, ch=3, base_res=16, resolution=64) -> None:
        super().__init__()
        log_resolution = int(np.log2(resolution))
        log_base = int(np.log2(base_res))
        self.secret_len = secret_len
        self.secret_scaler = nn.Sequential(
            nn.Linear(secret_len, base_res*base_res*ch),
            nn.SiLU(),
            View(-1, ch, base_res, base_res),
            nn.Upsample(scale_factor=(2**(log_resolution-log_base), 2**(log_resolution-log_base))),  # chx16x16 -> chx256x256
            zero_module(conv_nd(2, ch, ch, 3, padding=1))
        )  # secret len -> ch x res x res

    def copy_encoder_weight(self, ae_model):
        # misses, ignores = self.load_state_dict(ae_state_dict, strict=False)
        return None

    def encode(self, x):
        x = self.secret_scaler(x)
        return x

    def forward(self, x, c):
        # x: [B, C, H, W], c: [B, secret_len]
        c = self.encode(c)
        return c, None


class SecretEncoder6(nn.Module):
    """join img emb with secret emb"""
    def __init__(self, secret_len, ch=3, base_res=16, resolution=64, emode='c3') -> None:
        super().__init__()
        assert emode in ['c3', 'c2', 'm3']

        if emode == 'c3':  # c3: concat c and x each has ch channels
            secret_ch = ch
            join_ch = 2*ch
        elif emode == 'c2':  # c2: concat c (2) and x ave (1)
            secret_ch = 2
            join_ch = ch
        elif emode == 'm3':  # m3: multiply c (ch) and x (ch)
            secret_ch = ch
            join_ch = ch

        # m3: multiply c (ch) and x ave (1)
        log_resolution = int(np.log2(resolution))
        log_base = int(np.log2(base_res))
        self.secret_len = secret_len
        self.emode = emode
        self.resolution = resolution
        self.secret_scaler = nn.Sequential(
            nn.Linear(secret_len, base_res*base_res*secret_ch),
            nn.SiLU(),
            View(-1, secret_ch, base_res, base_res),
            nn.Upsample(scale_factor=(2**(log_resolution-log_base), 2**(log_resolution-log_base))),  # chx16x16 -> chx256x256
        )  # secret len -> ch x res x res
        self.join_encoder = nn.Sequential(
            conv_nd(2, join_ch, join_ch, 3, padding=1),
            nn.SiLU(),
            conv_nd(2, join_ch, ch, 3, padding=1),
            nn.SiLU(),
            conv_nd(2, ch, ch, 3, padding=1),
            nn.SiLU()
        )
        self.out_layer = zero_module(conv_nd(2, ch, ch, 3, padding=1))

    def copy_encoder_weight(self, ae_model):
        # misses, ignores = self.load_state_dict(ae_state_dict, strict=False)
        return None

    def encode(self, x):
        x = self.secret_scaler(x)
        return x

    def forward(self, x, c):
        # x: [B, C, H, W], c: [B, secret_len]
        c = self.encode(c)
        if self.emode == 'c3':
            x = torch.cat([x, c], dim=1)
        elif self.emode == 'c2':
            x = torch.cat([x.mean(dim=1, keepdim=True), c], dim=1)
        elif self.emode == 'm3':
            x = x * c
        dx = self.join_encoder(x)
        dx = self.out_layer(dx)
        return dx, None


class SecretEncoder5(nn.Module):
    """same as SecretEncoder3 but with ch as input"""
    def __init__(self, secret_len, ch=3, base_res=16, resolution=64, joint=False) -> None:
        super().__init__()
        log_resolution = int(np.log2(resolution))
        log_base = int(np.log2(base_res))
        self.secret_len = secret_len
        self.joint = joint
        self.resolution = resolution
        self.secret_scaler = nn.Sequential(
            nn.Linear(secret_len, base_res*base_res*ch),
            nn.SiLU(),
            View(-1, ch, base_res, base_res),
            nn.Upsample(scale_factor=(2**(log_resolution-log_base), 2**(log_resolution-log_base))),  # chx16x16 -> chx256x256
        )  # secret len -> ch x res x res
        if joint:
            self.join_encoder = nn.Sequential(
                conv_nd(2, 2*ch, 2*ch, 3, padding=1),
                nn.SiLU(),
                conv_nd(2, 2*ch, ch, 3, padding=1),
                nn.SiLU()
            )
        self.out_layer = zero_module(conv_nd(2, ch, ch, 3, padding=1))

    def copy_encoder_weight(self, ae_model):
        # misses, ignores = self.load_state_dict(ae_state_dict, strict=False)
        return None

    def encode(self, x):
        x = self.secret_scaler(x)
        return x

    def forward(self, x, c):
        # x: [B, C, H, W], c: [B, secret_len]
        c = self.encode(c)
        if self.joint:
            x = thf.interpolate(x, size=(self.resolution, self.resolution), mode="bilinear", align_corners=False, antialias=True)
            c = self.join_encoder(torch.cat([x, c], dim=1))
        c = self.out_layer(c)
        return c, None


class SecretEncoder2(nn.Module):
    def __init__(self, secret_len, embed_dim, ddconfig, ckpt_path=None,
                 ignore_keys=[],
                 image_key="image",
                 colorize_nlabels=None,
                 monitor=None,
                 ema_decay=None,
                 learn_logvar=False) -> None:
        super().__init__()
        log_resolution = int(np.log2(ddconfig.resolution))
        self.secret_len = secret_len
        self.learn_logvar = learn_logvar
        self.image_key = image_key
        self.encoder = Encoder(**ddconfig)
        self.encoder.conv_out = zero_module(self.encoder.conv_out)
        self.embed_dim = embed_dim

        if colorize_nlabels is not None:
            assert type(colorize_nlabels)==int
            self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))

        if monitor is not None:
            self.monitor = monitor

        self.secret_scaler = nn.Sequential(
            nn.Linear(secret_len, 32*32*ddconfig.out_ch),
            nn.SiLU(),
            View(-1, ddconfig.out_ch, 32, 32),
            nn.Upsample(scale_factor=(2**(log_resolution-5), 2**(log_resolution-5))),  # chx16x16 -> chx256x256
            # zero_module(conv_nd(2, ddconfig.out_ch, ddconfig.out_ch, 3, padding=1))
        )  # secret len -> ch x res x res
        # out_resolution = ddconfig.resolution//(len(ddconfig.ch_mult)-1)
        # self.out_layer = zero_module(conv_nd(2, ddconfig.out_ch, ddconfig.out_ch, 3, padding=1))

        self.use_ema = ema_decay is not None
        if self.use_ema:
            self.ema_decay = ema_decay
            assert 0. < ema_decay < 1.
            self.model_ema = LitEma(self, decay=ema_decay)
            print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")

        if ckpt_path is not None:
            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)

    def init_from_ckpt(self, path, ignore_keys=list()):
        sd = torch.load(path, map_location="cpu")["state_dict"]
        keys = list(sd.keys())
        for k in keys:
            for ik in ignore_keys:
                if k.startswith(ik):
                    print("Deleting key {} from state_dict.".format(k))
                    del sd[k]
        misses, ignores = self.load_state_dict(sd, strict=False)
        print(f"[SecretEncoder] Restored from {path}, misses: {misses}, ignores: {ignores}")

    def copy_encoder_weight(self, ae_model):
        # misses, ignores = self.load_state_dict(ae_state_dict, strict=False)
        return None
        self.encoder.load_state_dict(ae_model.encoder.state_dict())
        self.quant_conv.load_state_dict(ae_model.quant_conv.state_dict())

    @contextmanager
    def ema_scope(self, context=None):
        if self.use_ema:
            self.model_ema.store(self.parameters())
            self.model_ema.copy_to(self)
            if context is not None:
                print(f"{context}: Switched to EMA weights")
        try:
            yield None
        finally:
            if self.use_ema:
                self.model_ema.restore(self.parameters())
                if context is not None:
                    print(f"{context}: Restored training weights")

    def on_train_batch_end(self, *args, **kwargs):
        if self.use_ema:
            self.model_ema(self)

    def encode(self, x):
        h = self.encoder(x)
        posterior = h
        return posterior

    def forward(self, x, c):
        # x: [B, C, H, W], c: [B, secret_len]
        c = self.secret_scaler(c)
        x = torch.cat([x, c], dim=1)
        z = self.encode(x)
        # z = self.out_layer(z)
        return z, None


class SecretEncoder7(nn.Module):
    def __init__(self, secret_len, ddconfig, ckpt_path=None,
                 ignore_keys=[], embed_dim=3,
                 ema_decay=None) -> None:
        super().__init__()
        log_resolution = int(np.log2(ddconfig.resolution))
        self.secret_len = secret_len
        self.encoder = Encoder(**ddconfig)
        # self.encoder.conv_out = zero_module(self.encoder.conv_out)
        self.quant_conv = nn.Conv2d(ddconfig["z_channels"], embed_dim, 1)

        self.secret_scaler = nn.Sequential(
            nn.Linear(secret_len, 32*32*2),
            nn.SiLU(),
            View(-1, 2, 32, 32),
            # nn.Upsample(scale_factor=(2**(log_resolution-5), 2**(log_resolution-5))),  # chx16x16 -> chx256x256
            # zero_module(conv_nd(2, ddconfig.out_ch, ddconfig.out_ch, 3, padding=1))
        )  # secret len -> ch x res x res
        # out_resolution = ddconfig.resolution//(len(ddconfig.ch_mult)-1)
        # self.out_layer = zero_module(conv_nd(2, ddconfig.out_ch, ddconfig.out_ch, 3, padding=1))

        self.use_ema = ema_decay is not None
        if self.use_ema:
            self.ema_decay = ema_decay
            assert 0. < ema_decay < 1.
            self.model_ema = LitEma(self, decay=ema_decay)
            print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")

        if ckpt_path is not None:
            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)

    def init_from_ckpt(self, path, ignore_keys=list()):
        sd = torch.load(path, map_location="cpu")["state_dict"]
        keys = list(sd.keys())
        for k in keys:
            for ik in ignore_keys:
                if k.startswith(ik):
                    print("Deleting key {} from state_dict.".format(k))
                    del sd[k]
        misses, ignores = self.load_state_dict(sd, strict=False)
        print(f"[SecretEncoder7] Restored from {path}, misses: {len(misses)}, ignores: {len(ignores)}. Do not worry as we are not using the decoder and the secret encoder is a novel module.")

    def copy_encoder_weight(self, ae_model):
        # misses, ignores = self.load_state_dict(ae_state_dict, strict=False)
        # return None
        self.encoder.load_state_dict(deepcopy(ae_model.encoder.state_dict()))
        self.quant_conv.load_state_dict(deepcopy(ae_model.quant_conv.state_dict()))

    @contextmanager
    def ema_scope(self, context=None):
        if self.use_ema:
            self.model_ema.store(self.parameters())
            self.model_ema.copy_to(self)
            if context is not None:
                print(f"{context}: Switched to EMA weights")
        try:
            yield None
        finally:
            if self.use_ema:
                self.model_ema.restore(self.parameters())
                if context is not None:
                    print(f"{context}: Restored training weights")

    def on_train_batch_end(self, *args, **kwargs):
        if self.use_ema:
            self.model_ema(self)

    def encode(self, x):
        h = self.encoder(x)
        h = self.quant_conv(h)
        return h

    def forward(self, x, c):
        # x: [B, C, H, W], c: [B, secret_len]
        c = self.secret_scaler(c)  # [B, 2, 32, 32]
        # c = thf.interpolate(c, size=x.shape[-2:], mode="bilinear", align_corners=False)
        c = thf.interpolate(c, size=x.shape[-2:], mode="nearest")
        x = 0.2125 * x[:, 0, ...] + 0.7154 * x[:, 1, ...] + 0.0721 * x[:, 2, ...]
        x = torch.cat([x.unsqueeze(1), c], dim=1)
        z = self.encode(x)
        # z = self.out_layer(z)
        return z, None


class SecretEncoder(nn.Module):
    def __init__(self, secret_len, embed_dim, ddconfig, ckpt_path=None,
                 ignore_keys=[],
                 image_key="image",
                 colorize_nlabels=None,
                 monitor=None,
                 ema_decay=None,
                 learn_logvar=False) -> None:
        super().__init__()
        log_resolution = int(np.log2(ddconfig.resolution))
        self.secret_len = secret_len
        self.learn_logvar = learn_logvar
        self.image_key = image_key
        self.encoder = Encoder(**ddconfig)
        assert ddconfig["double_z"]
        self.quant_conv = torch.nn.Conv2d(2*ddconfig["z_channels"], 2*embed_dim, 1)
        self.embed_dim = embed_dim

        if colorize_nlabels is not None:
            assert type(colorize_nlabels)==int
            self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))

        if monitor is not None:
            self.monitor = monitor

        self.use_ema = ema_decay is not None
        if self.use_ema:
            self.ema_decay = ema_decay
            assert 0. < ema_decay < 1.
            self.model_ema = LitEma(self, decay=ema_decay)
            print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")

        if ckpt_path is not None:
            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)

        self.secret_scaler = nn.Sequential(
            nn.Linear(secret_len, 32*32*ddconfig.out_ch),
            nn.SiLU(),
            View(-1, ddconfig.out_ch, 32, 32),
            nn.Upsample(scale_factor=(2**(log_resolution-5), 2**(log_resolution-5))),  # chx16x16 -> chx256x256
            zero_module(conv_nd(2, ddconfig.out_ch, ddconfig.out_ch, 3, padding=1))
        )  # secret len -> ch x res x res
        # out_resolution = ddconfig.resolution//(len(ddconfig.ch_mult)-1)
        self.out_layer = zero_module(conv_nd(2, ddconfig.out_ch, ddconfig.out_ch, 3, padding=1))

    def init_from_ckpt(self, path, ignore_keys=list()):
        sd = torch.load(path, map_location="cpu")["state_dict"]
        keys = list(sd.keys())
        for k in keys:
            for ik in ignore_keys:
                if k.startswith(ik):
                    print("Deleting key {} from state_dict.".format(k))
                    del sd[k]
        misses, ignores = self.load_state_dict(sd, strict=False)
        print(f"[SecretEncoder] Restored from {path}, misses: {misses}, ignores: {ignores}")

    def copy_encoder_weight(self, ae_model):
        # misses, ignores = self.load_state_dict(ae_state_dict, strict=False)
        self.encoder.load_state_dict(ae_model.encoder.state_dict())
        self.quant_conv.load_state_dict(ae_model.quant_conv.state_dict())

    @contextmanager
    def ema_scope(self, context=None):
        if self.use_ema:
            self.model_ema.store(self.parameters())
            self.model_ema.copy_to(self)
            if context is not None:
                print(f"{context}: Switched to EMA weights")
        try:
            yield None
        finally:
            if self.use_ema:
                self.model_ema.restore(self.parameters())
                if context is not None:
                    print(f"{context}: Restored training weights")

    def on_train_batch_end(self, *args, **kwargs):
        if self.use_ema:
            self.model_ema(self)

    def encode(self, x):
        h = self.encoder(x)
        moments = self.quant_conv(h)
        posterior = DiagonalGaussianDistribution(moments)
        return posterior

    def forward(self, x, c):
        # x: [B, C, H, W], c: [B, secret_len]
        c = self.secret_scaler(c)
        x = x + c
        posterior = self.encode(x)
        z = posterior.sample()
        z = self.out_layer(z)
        return z, posterior


class ControlAE(pl.LightningModule):
    def __init__(self,
                 first_stage_key,
                 first_stage_config,
                 control_key,
                 control_config,
                 decoder_config,
                 loss_config,
                 noise_config='__none__',
                 use_ema=False,
                 secret_warmup=False,
                 scale_factor=1.,
                 ckpt_path="__none__",
                 ):
        super().__init__()
        self.scale_factor = scale_factor
        self.control_key = control_key
        self.first_stage_key = first_stage_key
        self.ae = instantiate_from_config(first_stage_config)
        self.control = instantiate_from_config(control_config)
        self.decoder = instantiate_from_config(decoder_config)
        self.crop = kornia.augmentation.CenterCrop((224, 224), cropping_mode="resample")  # early training phase
        if noise_config != '__none__':
            print('Using noise')
            self.noise = instantiate_from_config(noise_config)
        # copy weights from first stage
        self.control.copy_encoder_weight(self.ae)
        # freeze first stage
        self.ae.eval()
        self.ae.train = disabled_train
        for p in self.ae.parameters():
            p.requires_grad = False

        self.loss_layer = instantiate_from_config(loss_config)

        # early training phase
        # self.fixed_input = True
        self.fixed_x = None
        self.fixed_img = None
        self.fixed_input_recon = None
        self.fixed_control = None
        self.register_buffer("fixed_input", torch.tensor(True))

        # secret warmup
        self.secret_warmup = secret_warmup
        self.secret_baselen = 2
        self.secret_len = control_config.params.secret_len
        if self.secret_warmup:
            assert self.secret_len == 2**(int(np.log2(self.secret_len)))

        self.use_ema = use_ema
        if self.use_ema:
            print('Using EMA')
            self.control_ema = LitEma(self.control)
            self.decoder_ema = LitEma(self.decoder)
            print(f"Keeping EMAs of {len(list(self.control_ema.buffers()) + list(self.decoder_ema.buffers()))}.")

        if ckpt_path != '__none__':
            self.init_from_ckpt(ckpt_path, ignore_keys=[])

    def get_warmup_secret(self, old_secret):
        # old_secret: [B, secret_len]
        # new_secret: [B, secret_len]
        if self.secret_warmup:
            bsz = old_secret.shape[0]
            nrepeats = self.secret_len // self.secret_baselen
            new_secret = torch.zeros((bsz, self.secret_baselen), dtype=torch.float).random_(0, 2).repeat_interleave(nrepeats, dim=1)
            return new_secret.to(old_secret.device)
        else:
            return old_secret

    def init_from_ckpt(self, path, ignore_keys=list()):
        sd = torch.load(path, map_location="cpu")["state_dict"]
        keys = list(sd.keys())
        for k in keys:
            for ik in ignore_keys:
                if k.startswith(ik):
                    print("Deleting key {} from state_dict.".format(k))
                    del sd[k]
        self.load_state_dict(sd, strict=False)
        print(f"Restored from {path}")

    @contextmanager
    def ema_scope(self, context=None):
        if self.use_ema:
            self.control_ema.store(self.control.parameters())
            self.decoder_ema.store(self.decoder.parameters())
            self.control_ema.copy_to(self.control)
            self.decoder_ema.copy_to(self.decoder)
            if context is not None:
                print(f"{context}: Switched to EMA weights")
        try:
            yield None
        finally:
            if self.use_ema:
                self.control_ema.restore(self.control.parameters())
                self.decoder_ema.restore(self.decoder.parameters())
                if context is not None:
                    print(f"{context}: Restored training weights")

    def on_train_batch_end(self, *args, **kwargs):
        if self.use_ema:
            self.control_ema(self.control)
            self.decoder_ema(self.decoder)

    def compute_loss(self, pred, target):
        # return thf.mse_loss(pred, target, reduction="none").mean(dim=(1, 2, 3))
        lpips_loss = self.lpips_loss(pred, target).mean(dim=[1, 2, 3])
        pred_yuv = color.rgb_to_yuv((pred + 1) / 2)
        target_yuv = color.rgb_to_yuv((target + 1) / 2)
        yuv_loss = torch.mean((pred_yuv - target_yuv)**2, dim=[2, 3])
        yuv_loss = 1.5*torch.mm(yuv_loss, self.yuv_scales).squeeze(1)
        return lpips_loss + yuv_loss

    def forward(self, x, image, c):
        if self.control.__class__.__name__ == 'SecretEncoder6':
            eps, posterior = self.control(x, c)
        else:
            eps, posterior = self.control(image, c)
        return x + eps, posterior

    @torch.no_grad()
    def get_input(self, batch, return_first_stage=False, bs=None):
        image = batch[self.first_stage_key]
        control = batch[self.control_key]
        control = self.get_warmup_secret(control)
        if bs is not None:
            image = image[:bs]
            control = control[:bs]
        else:
            bs = image.shape[0]
        # encode image 1st stage
        image = einops.rearrange(image, "b h w c -> b c h w").contiguous()
        x = self.encode_first_stage(image).detach()
        image_rec = self.decode_first_stage(x).detach()

        # check if using fixed input (early training phase)
        # if self.training and self.fixed_input:
        if self.fixed_input:
            if self.fixed_x is None:  # first iteration
                print('[TRAINING] Warmup - using fixed input image for now!')
                self.fixed_x = x.detach().clone()[:bs]
                self.fixed_img = image.detach().clone()[:bs]
                self.fixed_input_recon = image_rec.detach().clone()[:bs]
                self.fixed_control = control.detach().clone()[:bs]  # use for log_images with fixed_input option only
            x, image, image_rec = self.fixed_x, self.fixed_img, self.fixed_input_recon

        out = [x, control]
        if return_first_stage:
            out.extend([image, image_rec])
        return out

    def decode_first_stage(self, z):
        z = 1./self.scale_factor * z
        image_rec = self.ae.decode(z)
        return image_rec

    def encode_first_stage(self, image):
        encoder_posterior = self.ae.encode(image)
        if isinstance(encoder_posterior, DiagonalGaussianDistribution):
            z = encoder_posterior.sample()
        elif isinstance(encoder_posterior, torch.Tensor):
            z = encoder_posterior
        else:
            raise NotImplementedError(f"encoder_posterior of type '{type(encoder_posterior)}' not yet implemented")
        return self.scale_factor * z

    def shared_step(self, batch):
        x, c, img, _ = self.get_input(batch, return_first_stage=True)
        # import pdb; pdb.set_trace()
        x, posterior = self(x, img, c)
        image_rec = self.decode_first_stage(x)
        # resize
        if img.shape[-1] > 256:
            img = thf.interpolate(img, size=(256, 256), mode='bilinear', align_corners=False).detach()
            image_rec = thf.interpolate(image_rec, size=(256, 256), mode='bilinear', align_corners=False)
        if hasattr(self, 'noise') and self.noise.is_activated():
            image_rec_noised = self.noise(image_rec, self.global_step, p=0.9)
        else:
            image_rec_noised = self.crop(image_rec)  # center crop
        image_rec_noised = torch.clamp(image_rec_noised, -1, 1)
        pred = self.decoder(image_rec_noised)

        loss, loss_dict = self.loss_layer(img, image_rec, posterior, c, pred, self.global_step)
        bit_acc = loss_dict["bit_acc"]

        bit_acc_ = bit_acc.item()

        if (bit_acc_ > 0.98) and (not self.fixed_input) and self.noise.is_activated():
            self.loss_layer.activate_ramp(self.global_step)

        if (bit_acc_ > 0.95) and (not self.fixed_input):  # ramp up image loss at late training stage
            if hasattr(self, 'noise') and (not self.noise.is_activated()):
                self.noise.activate(self.global_step)

        if (bit_acc_ > 0.9) and self.fixed_input:  # execute only once
            print(f'[TRAINING] High bit acc ({bit_acc_}) achieved, switch to full image dataset training.')
            self.fixed_input = ~self.fixed_input
        return loss, loss_dict

    def training_step(self, batch, batch_idx):
        loss, loss_dict = self.shared_step(batch)
        loss_dict = {f"train/{key}": val for key, val in loss_dict.items()}
        self.log_dict(loss_dict, prog_bar=True,
                      logger=True, on_step=True, on_epoch=True)

        self.log("global_step", self.global_step,
                 prog_bar=True, logger=True, on_step=True, on_epoch=False)
        # if self.use_scheduler:
        #     lr = self.optimizers().param_groups[0]['lr']
        #     self.log('lr_abs', lr, prog_bar=True, logger=True, on_step=True, on_epoch=False)

        return loss

    @torch.no_grad()
    def validation_step(self, batch, batch_idx):
        _, loss_dict_no_ema = self.shared_step(batch)
        loss_dict_no_ema = {f"val/{key}": val for key, val in loss_dict_no_ema.items() if key != 'img_lw'}
        with self.ema_scope():
            _, loss_dict_ema = self.shared_step(batch)
            loss_dict_ema = {'val/' + key + '_ema': loss_dict_ema[key] for key in loss_dict_ema}
        self.log_dict(loss_dict_no_ema, prog_bar=False, logger=True, on_step=False, on_epoch=True)
        self.log_dict(loss_dict_ema, prog_bar=False, logger=True, on_step=False, on_epoch=True)

    @torch.no_grad()
    def log_images(self, batch, fixed_input=False, **kwargs):
        log = dict()
        if fixed_input and self.fixed_img is not None:
            x, c, img, img_recon = self.fixed_x, self.fixed_control, self.fixed_img, self.fixed_input_recon
        else:
            x, c, img, img_recon = self.get_input(batch, return_first_stage=True)
        x, _ = self(x, img, c)
        image_out = self.decode_first_stage(x)
        if hasattr(self, 'noise') and self.noise.is_activated():
            img_noise = self.noise(image_out, self.global_step, p=1.0)
            log['noised'] = img_noise
        log['input'] = img
        log['output'] = image_out
        log['recon'] = img_recon
        return log

    def configure_optimizers(self):
        lr = self.learning_rate
        params = list(self.control.parameters()) + list(self.decoder.parameters())
        optimizer = torch.optim.AdamW(params, lr=lr)
        return optimizer
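Each SecretEncoder variant above maps a length-secret_len bit vector to a spatial offset eps that ControlAE adds to the frozen first-stage latent (its forward returns x + eps). A minimal sketch of the shape contract for SecretEncoder3, assuming a 100-bit secret, the default 64x64 latent resolution, and that the repo's dependencies are installed:

import torch
from cldm.ae import SecretEncoder3

enc = SecretEncoder3(secret_len=100, base_res=16, resolution=64)
secret = torch.randint(0, 2, (4, 100)).float()   # batch of 4 random 100-bit secrets
latent = torch.randn(4, 3, 64, 64)               # stand-in for the first-stage latent x
eps, _ = enc(latent, secret)                     # this variant encodes the secret only; the latent is ignored
print(eps.shape)                                 # torch.Size([4, 3, 64, 64])
print(eps.abs().max().item())                    # 0.0 at initialisation: the output conv is wrapped in zero_module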
cldm/cldm.py
ADDED
@@ -0,0 +1,517 @@
import numpy as np
import einops
import torch
import torch as th
import torch.nn as nn
import torchvision
from ldm.modules.diffusionmodules.util import (
    conv_nd,
    linear,
    zero_module,
    timestep_embedding,
)

from einops import rearrange, repeat
from torchvision.utils import make_grid
from ldm.modules.attention import SpatialTransformer
from ldm.modules.diffusionmodules.openaimodel import UNetModel, TimestepEmbedSequential, ResBlock, Downsample, AttentionBlock
from ldm.models.diffusion.ddpm import LatentDiffusion
from ldm.util import log_txt_as_img, exists, instantiate_from_config
from ldm.models.diffusion.ddim import DDIMSampler


class ControlledUnetModel(UNetModel):
    def forward(self, x, timesteps=None, context=None, control=None, only_mid_control=False, **kwargs):
        hs = []
        with torch.no_grad():
            t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
            emb = self.time_embed(t_emb)
            h = x.type(self.dtype)
            for module in self.input_blocks:
                h = module(h, emb, context)
                hs.append(h)
            h = self.middle_block(h, emb, context)

        h += control.pop()

        for i, module in enumerate(self.output_blocks):
            if only_mid_control:
                h = torch.cat([h, hs.pop()], dim=1)
            else:
                h = torch.cat([h, hs.pop() + control.pop()], dim=1)
            h = module(h, emb, context)

        h = h.type(x.dtype)
        return self.out(h)


class View(nn.Module):
    def __init__(self, *shape):
        super().__init__()
        self.shape = shape

    def forward(self, x):
        return x.view(*self.shape)


class ControlNet(nn.Module):
    def __init__(
        self,
        image_size,
        in_channels,
        model_channels,
        hint_channels,
        num_res_blocks,
        attention_resolutions,
        dropout=0,
        channel_mult=(1, 2, 4, 8),
        conv_resample=True,
        dims=2,
        use_checkpoint=False,
        use_fp16=False,
        num_heads=-1,
        num_head_channels=-1,
        num_heads_upsample=-1,
        use_scale_shift_norm=False,
        resblock_updown=False,
        use_new_attention_order=False,
        use_spatial_transformer=False,  # custom transformer support
        transformer_depth=1,  # custom transformer support
        context_dim=None,  # custom transformer support
        n_embed=None,  # custom support for prediction of discrete ids into codebook of first stage vq model
        legacy=True,
        disable_self_attentions=None,
        num_attention_blocks=None,
        disable_middle_self_attn=False,
        use_linear_in_transformer=False,
        secret_len=0,
    ):
        super().__init__()
        if use_spatial_transformer:
            assert context_dim is not None, 'Fool!! You forgot to include the dimension of your cross-attention conditioning...'

        if context_dim is not None:
            assert use_spatial_transformer, 'Fool!! You forgot to use the spatial transformer for your cross-attention conditioning...'
            from omegaconf.listconfig import ListConfig
            if type(context_dim) == ListConfig:
                context_dim = list(context_dim)

        if num_heads_upsample == -1:
            num_heads_upsample = num_heads

        if num_heads == -1:
            assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set'

        if num_head_channels == -1:
            assert num_heads != -1, 'Either num_heads or num_head_channels has to be set'

        self.dims = dims
        self.image_size = image_size
        self.in_channels = in_channels
        self.model_channels = model_channels
        if isinstance(num_res_blocks, int):
            self.num_res_blocks = len(channel_mult) * [num_res_blocks]
        else:
            if len(num_res_blocks) != len(channel_mult):
                raise ValueError("provide num_res_blocks either as an int (globally constant) or "
                                 "as a list/tuple (per-level) with the same length as channel_mult")
            self.num_res_blocks = num_res_blocks
        if disable_self_attentions is not None:
            # should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not
            assert len(disable_self_attentions) == len(channel_mult)
        if num_attention_blocks is not None:
            assert len(num_attention_blocks) == len(self.num_res_blocks)
            assert all(map(lambda i: self.num_res_blocks[i] >= num_attention_blocks[i], range(len(num_attention_blocks))))
            print(f"Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. "
                  f"This option has LESS priority than attention_resolutions {attention_resolutions}, "
                  f"i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, "
                  f"attention will still not be set.")

        self.attention_resolutions = attention_resolutions
        self.dropout = dropout
        self.channel_mult = channel_mult
        self.conv_resample = conv_resample
        self.use_checkpoint = use_checkpoint
        self.dtype = th.float16 if use_fp16 else th.float32
        self.num_heads = num_heads
        self.num_head_channels = num_head_channels
        self.num_heads_upsample = num_heads_upsample
        self.predict_codebook_ids = n_embed is not None

        time_embed_dim = model_channels * 4
        self.time_embed = nn.Sequential(
            linear(model_channels, time_embed_dim),
            nn.SiLU(),
            linear(time_embed_dim, time_embed_dim),
        )

        self.input_blocks = nn.ModuleList(
            [
                TimestepEmbedSequential(
                    conv_nd(dims, in_channels, model_channels, 3, padding=1)
                )
            ]
        )
        self.zero_convs = nn.ModuleList([self.make_zero_conv(model_channels)])
        self.secret_len = secret_len
        if secret_len > 0:
            log_resolution = int(np.log2(64))
            self.input_hint_block = TimestepEmbedSequential(
                nn.Linear(secret_len, 16*16*4),
                nn.SiLU(),
                View(-1, 4, 16, 16),
                nn.Upsample(scale_factor=(2**(log_resolution-4), 2**(log_resolution-4))),
                conv_nd(dims, 4, 64, 3, padding=1),
                nn.SiLU(),
                conv_nd(dims, 64, 256, 3, padding=1),
                nn.SiLU(),
                zero_module(conv_nd(dims, 256, model_channels, 3, padding=1))
            )
        else:
            self.input_hint_block = TimestepEmbedSequential(
                conv_nd(dims, hint_channels, 16, 3, padding=1),
                nn.SiLU(),
                conv_nd(dims, 16, 16, 3, padding=1),
                nn.SiLU(),
                conv_nd(dims, 16, 32, 3, padding=1, stride=2),
                nn.SiLU(),
                conv_nd(dims, 32, 32, 3, padding=1),
                nn.SiLU(),
                conv_nd(dims, 32, 96, 3, padding=1, stride=2),
                nn.SiLU(),
                conv_nd(dims, 96, 96, 3, padding=1),
                nn.SiLU(),
                conv_nd(dims, 96, 256, 3, padding=1, stride=2),
                nn.SiLU(),
                zero_module(conv_nd(dims, 256, model_channels, 3, padding=1))
            )

        self._feature_size = model_channels
        input_block_chans = [model_channels]
        ch = model_channels
        ds = 1
        for level, mult in enumerate(channel_mult):
            for nr in range(self.num_res_blocks[level]):
                layers = [
                    ResBlock(
                        ch,
                        time_embed_dim,
                        dropout,
                        out_channels=mult * model_channels,
                        dims=dims,
                        use_checkpoint=use_checkpoint,
                        use_scale_shift_norm=use_scale_shift_norm,
                    )
                ]
                ch = mult * model_channels
                if ds in attention_resolutions:
                    if num_head_channels == -1:
                        dim_head = ch // num_heads
                    else:
                        num_heads = ch // num_head_channels
                        dim_head = num_head_channels
                    if legacy:
                        #num_heads = 1
                        dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
                    if exists(disable_self_attentions):
                        disabled_sa = disable_self_attentions[level]
                    else:
                        disabled_sa = False

                    if not exists(num_attention_blocks) or nr < num_attention_blocks[level]:
                        layers.append(
                            AttentionBlock(
                                ch,
                                use_checkpoint=use_checkpoint,
                                num_heads=num_heads,
                                num_head_channels=dim_head,
                                use_new_attention_order=use_new_attention_order,
                            ) if not use_spatial_transformer else SpatialTransformer(
                                ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
                                disable_self_attn=disabled_sa, use_linear=use_linear_in_transformer,
|
230 |
+
use_checkpoint=use_checkpoint
|
231 |
+
)
|
232 |
+
)
|
233 |
+
self.input_blocks.append(TimestepEmbedSequential(*layers))
|
234 |
+
self.zero_convs.append(self.make_zero_conv(ch))
|
235 |
+
self._feature_size += ch
|
236 |
+
input_block_chans.append(ch)
|
237 |
+
if level != len(channel_mult) - 1:
|
238 |
+
out_ch = ch
|
239 |
+
self.input_blocks.append(
|
240 |
+
TimestepEmbedSequential(
|
241 |
+
ResBlock(
|
242 |
+
ch,
|
243 |
+
time_embed_dim,
|
244 |
+
dropout,
|
245 |
+
out_channels=out_ch,
|
246 |
+
dims=dims,
|
247 |
+
use_checkpoint=use_checkpoint,
|
248 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
249 |
+
down=True,
|
250 |
+
)
|
251 |
+
if resblock_updown
|
252 |
+
else Downsample(
|
253 |
+
ch, conv_resample, dims=dims, out_channels=out_ch
|
254 |
+
)
|
255 |
+
)
|
256 |
+
)
|
257 |
+
ch = out_ch
|
258 |
+
input_block_chans.append(ch)
|
259 |
+
self.zero_convs.append(self.make_zero_conv(ch))
|
260 |
+
ds *= 2
|
261 |
+
self._feature_size += ch
|
262 |
+
|
263 |
+
if num_head_channels == -1:
|
264 |
+
dim_head = ch // num_heads
|
265 |
+
else:
|
266 |
+
num_heads = ch // num_head_channels
|
267 |
+
dim_head = num_head_channels
|
268 |
+
if legacy:
|
269 |
+
#num_heads = 1
|
270 |
+
dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
|
271 |
+
self.middle_block = TimestepEmbedSequential(
|
272 |
+
ResBlock(
|
273 |
+
ch,
|
274 |
+
time_embed_dim,
|
275 |
+
dropout,
|
276 |
+
dims=dims,
|
277 |
+
use_checkpoint=use_checkpoint,
|
278 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
279 |
+
),
|
280 |
+
AttentionBlock(
|
281 |
+
ch,
|
282 |
+
use_checkpoint=use_checkpoint,
|
283 |
+
num_heads=num_heads,
|
284 |
+
num_head_channels=dim_head,
|
285 |
+
use_new_attention_order=use_new_attention_order,
|
286 |
+
) if not use_spatial_transformer else SpatialTransformer( # always uses a self-attn
|
287 |
+
ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
|
288 |
+
disable_self_attn=disable_middle_self_attn, use_linear=use_linear_in_transformer,
|
289 |
+
use_checkpoint=use_checkpoint
|
290 |
+
),
|
291 |
+
ResBlock(
|
292 |
+
ch,
|
293 |
+
time_embed_dim,
|
294 |
+
dropout,
|
295 |
+
dims=dims,
|
296 |
+
use_checkpoint=use_checkpoint,
|
297 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
298 |
+
),
|
299 |
+
)
|
300 |
+
self.middle_block_out = self.make_zero_conv(ch)
|
301 |
+
self._feature_size += ch
|
302 |
+
|
303 |
+
def make_zero_conv(self, channels):
|
304 |
+
return TimestepEmbedSequential(zero_module(conv_nd(self.dims, channels, channels, 1, padding=0)))
|
305 |
+
|
306 |
+
def forward(self, x, hint, timesteps, context, **kwargs):
|
307 |
+
t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
|
308 |
+
emb = self.time_embed(t_emb)
|
309 |
+
# import pdb; pdb.set_trace()
|
310 |
+
guided_hint = self.input_hint_block(hint, emb, context)
|
311 |
+
|
312 |
+
outs = []
|
313 |
+
|
314 |
+
h = x.type(self.dtype)
|
315 |
+
for module, zero_conv in zip(self.input_blocks, self.zero_convs):
|
316 |
+
if guided_hint is not None:
|
317 |
+
h = module(h, emb, context)
|
318 |
+
h += guided_hint
|
319 |
+
guided_hint = None
|
320 |
+
else:
|
321 |
+
h = module(h, emb, context)
|
322 |
+
outs.append(zero_conv(h, emb, context))
|
323 |
+
|
324 |
+
h = self.middle_block(h, emb, context)
|
325 |
+
outs.append(self.middle_block_out(h, emb, context))
|
326 |
+
|
327 |
+
return outs
|
328 |
+
|
329 |
+
|
330 |
+
class SecretDecoder(nn.Module):
|
331 |
+
def __init__(self, arch='CNN', act='ReLU', norm='none', resolution=256, in_channels=3, secret_len=100):
|
332 |
+
super().__init__()
|
333 |
+
self.resolution = resolution
|
334 |
+
self.arch = arch
|
335 |
+
print(f'SecretDecoder arch: {arch}')
|
336 |
+
def activation(name = 'ReLU'):
|
337 |
+
if name == 'ReLU':
|
338 |
+
return nn.ReLU()
|
339 |
+
elif name == 'LeakyReLU':
|
340 |
+
return nn.LeakyReLU()
|
341 |
+
elif name == 'SiLU':
|
342 |
+
return nn.SiLU()
|
343 |
+
|
344 |
+
def normalisation(name, n):
|
345 |
+
if name == 'none':
|
346 |
+
return nn.Identity()
|
347 |
+
elif name == 'BatchNorm2D':
|
348 |
+
return nn.BatchNorm2d(n)
|
349 |
+
elif name == 'BatchNorm1d':
|
350 |
+
return nn.BatchNorm1d(n)
|
351 |
+
elif name == 'LayerNorm':
|
352 |
+
return nn.LayerNorm(n)
|
353 |
+
|
354 |
+
if arch=='CNN':
|
355 |
+
self.decoder = nn.Sequential(
|
356 |
+
nn.Conv2d(in_channels, 32, (3, 3), 2, 1), # 128
|
357 |
+
activation(act),
|
358 |
+
nn.Conv2d(32, 32, 3, 1, 1),
|
359 |
+
activation(act),
|
360 |
+
nn.Conv2d(32, 64, 3, 2, 1), # 64
|
361 |
+
activation(act),
|
362 |
+
nn.Conv2d(64, 64, 3, 1, 1),
|
363 |
+
activation(act),
|
364 |
+
nn.Conv2d(64, 64, 3, 2, 1), # 32
|
365 |
+
activation(act),
|
366 |
+
nn.Conv2d(64, 128, 3, 2, 1), # 16
|
367 |
+
activation(act),
|
368 |
+
nn.Conv2d(128, 128, (3, 3), 2, 1), # 8
|
369 |
+
activation(act),
|
370 |
+
)
|
371 |
+
self.dense = nn.Sequential(
|
372 |
+
nn.Linear(resolution * resolution * 128 // 32 // 32, 512),
|
373 |
+
activation(act),
|
374 |
+
nn.Linear(512, secret_len)
|
375 |
+
)
|
376 |
+
elif arch == 'resnet50':
|
377 |
+
self.decoder = torchvision.models.resnet50(pretrained=True, progress=False)
|
378 |
+
self.decoder.fc = nn.Linear(self.decoder.fc.in_features, secret_len)
|
379 |
+
else:
|
380 |
+
raise NotImplementedError
|
381 |
+
|
382 |
+
def forward(self, image):
|
383 |
+
x = self.decoder(image)
|
384 |
+
if self.arch == 'CNN':
|
385 |
+
x = x.view(-1, self.resolution * self.resolution * 128 // 32 // 32)
|
386 |
+
x = self.dense(x)
|
387 |
+
return x
|
388 |
+
|
389 |
+
|
390 |
+
class ControlLDM(LatentDiffusion):
|
391 |
+
|
392 |
+
def __init__(self, control_stage_config, control_key, only_mid_control, secret_decoder_config, *args, **kwargs):
|
393 |
+
super().__init__(*args, **kwargs)
|
394 |
+
self.control_model = instantiate_from_config(control_stage_config)
|
395 |
+
self.control_key = control_key
|
396 |
+
self.only_mid_control = only_mid_control
|
397 |
+
if secret_decoder_config != 'none':
|
398 |
+
self.secret_decoder = instantiate_from_config(secret_decoder_config)
|
399 |
+
|
400 |
+
@torch.no_grad()
|
401 |
+
def get_input(self, batch, k, bs=None, *args, **kwargs):
|
402 |
+
x, c = super().get_input(batch, self.first_stage_key, *args, **kwargs)
|
403 |
+
control = batch[self.control_key]
|
404 |
+
if bs is not None:
|
405 |
+
control = control[:bs]
|
406 |
+
control = control.to(self.device)
|
407 |
+
if self.control_key == 'hint':
|
408 |
+
control = einops.rearrange(control, 'b h w c -> b c h w')
|
409 |
+
control = control.to(memory_format=torch.contiguous_format).float()
|
410 |
+
return x, dict(c_crossattn=[c], c_concat=[control])
|
411 |
+
|
412 |
+
def apply_model(self, x_noisy, t, cond, *args, **kwargs):
|
413 |
+
assert isinstance(cond, dict)
|
414 |
+
diffusion_model = self.model.diffusion_model
|
415 |
+
cond_txt = torch.cat(cond['c_crossattn'], 1)
|
416 |
+
cond_hint = torch.cat(cond['c_concat'], 1)
|
417 |
+
|
418 |
+
control = self.control_model(x=x_noisy, hint=cond_hint, timesteps=t, context=cond_txt)
|
419 |
+
eps = diffusion_model(x=x_noisy, timesteps=t, context=cond_txt, control=control, only_mid_control=self.only_mid_control)
|
420 |
+
|
421 |
+
return eps
|
422 |
+
|
423 |
+
@torch.no_grad()
|
424 |
+
def get_unconditional_conditioning(self, N):
|
425 |
+
return self.get_learned_conditioning([""] * N)
|
426 |
+
|
427 |
+
@torch.no_grad()
|
428 |
+
def log_images(self, batch, N=4, n_row=2, sample=False, ddim_steps=50, ddim_eta=0.0, return_keys=None,
|
429 |
+
quantize_denoised=True, inpaint=True, plot_denoise_rows=False, plot_progressive_rows=True,
|
430 |
+
plot_diffusion_rows=False, unconditional_guidance_scale=9.0, unconditional_guidance_label=None,
|
431 |
+
use_ema_scope=True,
|
432 |
+
**kwargs):
|
433 |
+
use_ddim = ddim_steps is not None
|
434 |
+
|
435 |
+
log = dict()
|
436 |
+
z, c = self.get_input(batch, self.first_stage_key, bs=N)
|
437 |
+
c_cat, c = c["c_concat"][0][:N], c["c_crossattn"][0][:N]
|
438 |
+
N = min(z.shape[0], N)
|
439 |
+
n_row = min(z.shape[0], n_row)
|
440 |
+
log["reconstruction"] = self.decode_first_stage(z)
|
441 |
+
log["control"] = c_cat * 2.0 - 1.0
|
442 |
+
log["conditioning"] = log_txt_as_img((512, 512), batch[self.cond_stage_key], size=16)
|
443 |
+
|
444 |
+
if plot_diffusion_rows:
|
445 |
+
# get diffusion row
|
446 |
+
diffusion_row = list()
|
447 |
+
z_start = z[:n_row]
|
448 |
+
for t in range(self.num_timesteps):
|
449 |
+
if t % self.log_every_t == 0 or t == self.num_timesteps - 1:
|
450 |
+
t = repeat(torch.tensor([t]), '1 -> b', b=n_row)
|
451 |
+
t = t.to(self.device).long()
|
452 |
+
noise = torch.randn_like(z_start)
|
453 |
+
z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise)
|
454 |
+
diffusion_row.append(self.decode_first_stage(z_noisy))
|
455 |
+
|
456 |
+
diffusion_row = torch.stack(diffusion_row) # n_log_step, n_row, C, H, W
|
457 |
+
diffusion_grid = rearrange(diffusion_row, 'n b c h w -> b n c h w')
|
458 |
+
diffusion_grid = rearrange(diffusion_grid, 'b n c h w -> (b n) c h w')
|
459 |
+
diffusion_grid = make_grid(diffusion_grid, nrow=diffusion_row.shape[0])
|
460 |
+
log["diffusion_row"] = diffusion_grid
|
461 |
+
|
462 |
+
if sample:
|
463 |
+
# get denoise row
|
464 |
+
samples, z_denoise_row = self.sample_log(cond={"c_concat": [c_cat], "c_crossattn": [c]},
|
465 |
+
batch_size=N, ddim=use_ddim,
|
466 |
+
ddim_steps=ddim_steps, eta=ddim_eta)
|
467 |
+
x_samples = self.decode_first_stage(samples)
|
468 |
+
log["samples"] = x_samples
|
469 |
+
if plot_denoise_rows:
|
470 |
+
denoise_grid = self._get_denoise_row_from_list(z_denoise_row)
|
471 |
+
log["denoise_row"] = denoise_grid
|
472 |
+
# import pudb; pudb.set_trace()
|
473 |
+
if unconditional_guidance_scale > 1.0:
|
474 |
+
uc_cross = self.get_unconditional_conditioning(N)
|
475 |
+
uc_cat = c_cat # torch.zeros_like(c_cat)
|
476 |
+
uc_full = {"c_concat": [uc_cat], "c_crossattn": [uc_cross]}
|
477 |
+
samples_cfg, _ = self.sample_log(cond={"c_concat": [c_cat], "c_crossattn": [c]},
|
478 |
+
batch_size=N, ddim=use_ddim,
|
479 |
+
ddim_steps=ddim_steps, eta=ddim_eta,
|
480 |
+
unconditional_guidance_scale=unconditional_guidance_scale,
|
481 |
+
unconditional_conditioning=uc_full,
|
482 |
+
)
|
483 |
+
x_samples_cfg = self.decode_first_stage(samples_cfg)
|
484 |
+
log[f"samples_cfg_scale_{unconditional_guidance_scale:.2f}"] = x_samples_cfg
|
485 |
+
|
486 |
+
return log
|
487 |
+
|
488 |
+
@torch.no_grad()
|
489 |
+
def sample_log(self, cond, batch_size, ddim, ddim_steps, **kwargs):
|
490 |
+
ddim_sampler = DDIMSampler(self)
|
491 |
+
# import pdb; pdb.set_trace()
|
492 |
+
# b, c, h, w = cond["c_concat"][0].shape
|
493 |
+
b, c, h, w = cond["c_concat"][0].shape[0], self.channels, self.image_size*8, self.image_size*8
|
494 |
+
shape = (self.channels, h // 8, w // 8)
|
495 |
+
samples, intermediates = ddim_sampler.sample(ddim_steps, batch_size, shape, cond, verbose=False, **kwargs)
|
496 |
+
return samples, intermediates
|
497 |
+
|
498 |
+
def configure_optimizers(self):
|
499 |
+
lr = self.learning_rate
|
500 |
+
params = list(self.control_model.parameters())
|
501 |
+
if not self.sd_locked:
|
502 |
+
params += list(self.model.diffusion_model.output_blocks.parameters())
|
503 |
+
params += list(self.model.diffusion_model.out.parameters())
|
504 |
+
opt = torch.optim.AdamW(params, lr=lr)
|
505 |
+
return opt
|
506 |
+
|
507 |
+
def low_vram_shift(self, is_diffusing):
|
508 |
+
if is_diffusing:
|
509 |
+
self.model = self.model.cuda()
|
510 |
+
self.control_model = self.control_model.cuda()
|
511 |
+
self.first_stage_model = self.first_stage_model.cpu()
|
512 |
+
self.cond_stage_model = self.cond_stage_model.cpu()
|
513 |
+
else:
|
514 |
+
self.model = self.model.cpu()
|
515 |
+
self.control_model = self.control_model.cpu()
|
516 |
+
self.first_stage_model = self.first_stage_model.cuda()
|
517 |
+
self.cond_stage_model = self.cond_stage_model.cuda()
|
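For orientation (this note and sketch are not part of the committed files): when secret_len > 0 the control branch above turns the secret bit vector into a 64x64 spatial hint that is added to the first U-Net feature map. A rough shape check with example channel sizes, using nn.Unflatten as a stand-in for the repo's View module, might look like this:

# Shape-check sketch only; channel sizes are example values, the config may differ.
import torch
import torch.nn as nn

secret_len, model_channels = 100, 320
hint = nn.Sequential(
    nn.Linear(secret_len, 16 * 16 * 4),   # bits -> flat 4x16x16 feature
    nn.SiLU(),
    nn.Unflatten(1, (4, 16, 16)),          # stand-in for View(-1, 4, 16, 16)
    nn.Upsample(scale_factor=4),           # 16x16 -> 64x64, i.e. 2**(log2(64)-4)
    nn.Conv2d(4, 64, 3, padding=1), nn.SiLU(),
    nn.Conv2d(64, 256, 3, padding=1), nn.SiLU(),
    nn.Conv2d(256, model_channels, 3, padding=1),
)
secret = torch.bernoulli(torch.rand(2, secret_len))  # two random bit vectors
print(hint(secret).shape)                              # torch.Size([2, 320, 64, 64])

In the committed module the final conv is wrapped in zero_module, so the hint contributes nothing at the start of training.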
cldm/diffsteg.py
ADDED
@@ -0,0 +1,782 @@
import numpy as np
import einops
import torch
import torch as th
import torch.nn as nn
from torch.nn import functional as thf
import torchvision
from ldm.modules.diffusionmodules.util import (
    conv_nd,
    linear,
    zero_module,
    timestep_embedding,
)

from einops import rearrange, repeat
from torchvision.utils import make_grid
from ldm.modules.attention import SpatialTransformer
from ldm.modules.diffusionmodules.openaimodel import UNetModel, TimestepEmbedSequential, ResBlock, Downsample, AttentionBlock
from ldm.models.diffusion.ddpm import LatentDiffusion
from ldm.util import log_txt_as_img, exists, instantiate_from_config, default
from ldm.models.diffusion.ddim import DDIMSampler


# class CUNetModel(nn.Module):
#     def forward(self, x, timesteps=None, context=None, control=None, only_mid_control=False, **kwargs):
#         hs = []
#         with torch.no_grad():
#             t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
#             emb = self.time_embed(t_emb)
#
#             h = x.type(self.dtype)
#             for module in self.input_blocks:
#                 h = module(h, emb, context)
#                 hs.append(h)
#
#         h = self.middle_block(h, emb, context)
#         h += control.pop(0)
#         for module in self.output_blocks:
#             if only_mid_control:
#                 h = th.cat([h, hs.pop()], dim=1)
#             else:
#                 h = torch.cat([h, hs.pop() + control.pop(0)], dim=1)
#             h = module(h, emb, context)
#         h = h.type(x.dtype)
#         return self.out(h)

class SecretNet(nn.Module):
    def __init__(
        self,
        image_size,
        in_channels,
        model_channels,
        hint_channels,
        num_res_blocks,
        attention_resolutions,
        dropout=0,
        channel_mult=(1, 2, 4, 8),
        conv_resample=True,
        dims=2,
        use_checkpoint=False,
        use_fp16=False,
        num_heads=-1,
        num_head_channels=-1,
        num_heads_upsample=-1,
        use_scale_shift_norm=False,
        resblock_updown=False,
        use_new_attention_order=False,
        use_spatial_transformer=False,  # custom transformer support
        transformer_depth=1,            # custom transformer support
        context_dim=None,               # custom transformer support
        n_embed=None,                   # custom support for prediction of discrete ids into codebook of first stage vq model
        legacy=True,
        disable_self_attentions=None,
        num_attention_blocks=None,
        disable_middle_self_attn=False,
        use_linear_in_transformer=False,
        secret_len=0,
    ):
        super().__init__()
        if use_spatial_transformer:
            assert context_dim is not None, 'Fool!! You forgot to include the dimension of your cross-attention conditioning...'

        if context_dim is not None:
            assert use_spatial_transformer, 'Fool!! You forgot to use the spatial transformer for your cross-attention conditioning...'
            from omegaconf.listconfig import ListConfig
            if type(context_dim) == ListConfig:
                context_dim = list(context_dim)

        if num_heads_upsample == -1:
            num_heads_upsample = num_heads

        if num_heads == -1:
            assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set'

        if num_head_channels == -1:
            assert num_heads != -1, 'Either num_heads or num_head_channels has to be set'

        self.dims = dims
        self.image_size = image_size
        self.in_channels = in_channels
        self.model_channels = model_channels
        if isinstance(num_res_blocks, int):
            self.num_res_blocks = len(channel_mult) * [num_res_blocks]
        else:
            if len(num_res_blocks) != len(channel_mult):
                raise ValueError("provide num_res_blocks either as an int (globally constant) or "
                                 "as a list/tuple (per-level) with the same length as channel_mult")
            self.num_res_blocks = num_res_blocks
        if disable_self_attentions is not None:
            # should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not
            assert len(disable_self_attentions) == len(channel_mult)
        if num_attention_blocks is not None:
            assert len(num_attention_blocks) == len(self.num_res_blocks)
            assert all(map(lambda i: self.num_res_blocks[i] >= num_attention_blocks[i], range(len(num_attention_blocks))))
            print(f"Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. "
                  f"This option has LESS priority than attention_resolutions {attention_resolutions}, "
                  f"i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, "
                  f"attention will still not be set.")

        self.attention_resolutions = attention_resolutions
        self.dropout = dropout
        self.channel_mult = channel_mult
        self.conv_resample = conv_resample
        self.use_checkpoint = use_checkpoint
        self.dtype = th.float16 if use_fp16 else th.float32
        self.num_heads = num_heads
        self.num_head_channels = num_head_channels
        self.num_heads_upsample = num_heads_upsample
        self.predict_codebook_ids = n_embed is not None

        time_embed_dim = model_channels * 4
        self.time_embed = nn.Sequential(
            linear(model_channels, time_embed_dim),
            nn.SiLU(),
            linear(time_embed_dim, time_embed_dim),
        )

        # self.input_blocks = nn.ModuleList(
        #     [
        #         TimestepEmbedSequential(
        #             conv_nd(dims, in_channels, model_channels, 3, padding=1)
        #         )
        #     ]
        # )
        self.zero_convs = nn.ModuleList([self.make_zero_conv(model_channels)])
        self.secret_len = secret_len
        if secret_len > 0:  # TODO: update for dec
            log_resolution = int(np.log2(64))
            self.input_hint_block = TimestepEmbedSequential(
                nn.Linear(secret_len, 16 * 16 * 4),
                nn.SiLU(),
                View(-1, 4, 16, 16),
                nn.Upsample(scale_factor=(2**(log_resolution - 4), 2**(log_resolution - 4))),
                conv_nd(dims, 4, 64, 3, padding=1),
                nn.SiLU(),
                conv_nd(dims, 64, 256, 3, padding=1),
                nn.SiLU(),
                zero_module(conv_nd(dims, 256, model_channels, 3, padding=1))
            )

        self._feature_size = model_channels
        input_block_chans = [model_channels]
        ch = model_channels
        ds = 1
        for level, mult in enumerate(channel_mult):
            for nr in range(self.num_res_blocks[level]):
                layers = []
                ch = mult * model_channels
                if ds in attention_resolutions:
                    if num_head_channels == -1:
                        dim_head = ch // num_heads
                    else:
                        num_heads = ch // num_head_channels
                        dim_head = num_head_channels
                    if legacy:
                        # num_heads = 1
                        dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
                    if exists(disable_self_attentions):
                        disabled_sa = disable_self_attentions[level]
                    else:
                        disabled_sa = False

                    if not exists(num_attention_blocks) or nr < num_attention_blocks[level]:
                        layers.append(0)
                # self.input_blocks.append(TimestepEmbedSequential(*layers))
                # self.zero_convs.append(self.make_zero_conv(ch))
                self._feature_size += ch
                input_block_chans.append(ch)
            if level != len(channel_mult) - 1:
                out_ch = ch
                self.input_blocks.append(
                    0
                )
                ch = out_ch
                input_block_chans.append(ch)
                # self.zero_convs.append(self.make_zero_conv(ch))
                ds *= 2
                self._feature_size += ch

        if num_head_channels == -1:
            dim_head = ch // num_heads
        else:
            num_heads = ch // num_head_channels
            dim_head = num_head_channels
        if legacy:
            # num_heads = 1
            dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
        self.middle_block = TimestepEmbedSequential(
            ResBlock(
                ch,
                time_embed_dim,
                dropout,
                dims=dims,
                use_checkpoint=use_checkpoint,
                use_scale_shift_norm=use_scale_shift_norm,
            ),
            AttentionBlock(
                ch,
                use_checkpoint=use_checkpoint,
                num_heads=num_heads,
                num_head_channels=dim_head,
                use_new_attention_order=use_new_attention_order,
            ) if not use_spatial_transformer else SpatialTransformer(  # always uses a self-attn
                ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
                disable_self_attn=disable_middle_self_attn, use_linear=use_linear_in_transformer,
                use_checkpoint=use_checkpoint
            ),
            ResBlock(
                ch,
                time_embed_dim,
                dropout,
                dims=dims,
                use_checkpoint=use_checkpoint,
                use_scale_shift_norm=use_scale_shift_norm,
            ),
        )
        self.middle_block_out = self.make_zero_conv(ch)
        self._feature_size += ch

    def make_zero_conv(self, channels):
        return TimestepEmbedSequential(zero_module(conv_nd(self.dims, channels, channels, 1, padding=0)))

    def forward(self, x, hint, timesteps, context, **kwargs):
        t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
        emb = self.time_embed(t_emb)
        guided_hint = self.input_hint_block(hint, emb, context)
        # import pdb; pdb.set_trace()
        outs = []

        h = x.type(self.dtype)
        for module, zero_conv in zip(self.input_blocks, self.zero_convs):
            if guided_hint is not None:
                h = module(h, emb, context)
                h += guided_hint
                guided_hint = None
            else:
                h = module(h, emb, context)
            outs.append(zero_conv(h, emb, context))

        h = self.middle_block(h, emb, context)
        outs.append(self.middle_block_out(h, emb, context))

        return outs

class ControlledUnetModel(UNetModel):
    def forward(self, x, timesteps=None, context=None, control=None, only_mid_control=False, **kwargs):
        hs = []
        with torch.no_grad():
            t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
            emb = self.time_embed(t_emb)
            h = x.type(self.dtype)
            for module in self.input_blocks:
                h = module(h, emb, context)
                hs.append(h)
            h = self.middle_block(h, emb, context)

        h += control.pop()

        for i, module in enumerate(self.output_blocks):
            if only_mid_control:
                h = torch.cat([h, hs.pop()], dim=1)
            else:
                h = torch.cat([h, hs.pop() + control.pop()], dim=1)
            h = module(h, emb, context)

        h = h.type(x.dtype)
        return self.out(h)

class View(nn.Module):
    def __init__(self, *shape):
        super().__init__()
        self.shape = shape

    def forward(self, x):
        return x.view(*self.shape)

class ControlNet(nn.Module):
    def __init__(
        self,
        image_size,
        in_channels,
        model_channels,
        hint_channels,
        num_res_blocks,
        attention_resolutions,
        dropout=0,
        channel_mult=(1, 2, 4, 8),
        conv_resample=True,
        dims=2,
        use_checkpoint=False,
        use_fp16=False,
        num_heads=-1,
        num_head_channels=-1,
        num_heads_upsample=-1,
        use_scale_shift_norm=False,
        resblock_updown=False,
        use_new_attention_order=False,
        use_spatial_transformer=False,  # custom transformer support
        transformer_depth=1,            # custom transformer support
        context_dim=None,               # custom transformer support
        n_embed=None,                   # custom support for prediction of discrete ids into codebook of first stage vq model
        legacy=True,
        disable_self_attentions=None,
        num_attention_blocks=None,
        disable_middle_self_attn=False,
        use_linear_in_transformer=False,
        secret_len=0,
    ):
        super().__init__()
        if use_spatial_transformer:
            assert context_dim is not None, 'Fool!! You forgot to include the dimension of your cross-attention conditioning...'

        if context_dim is not None:
            assert use_spatial_transformer, 'Fool!! You forgot to use the spatial transformer for your cross-attention conditioning...'
            from omegaconf.listconfig import ListConfig
            if type(context_dim) == ListConfig:
                context_dim = list(context_dim)

        if num_heads_upsample == -1:
            num_heads_upsample = num_heads

        if num_heads == -1:
            assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set'

        if num_head_channels == -1:
            assert num_heads != -1, 'Either num_heads or num_head_channels has to be set'

        self.dims = dims
        self.image_size = image_size
        self.in_channels = in_channels
        self.model_channels = model_channels
        if isinstance(num_res_blocks, int):
            self.num_res_blocks = len(channel_mult) * [num_res_blocks]
        else:
            if len(num_res_blocks) != len(channel_mult):
                raise ValueError("provide num_res_blocks either as an int (globally constant) or "
                                 "as a list/tuple (per-level) with the same length as channel_mult")
            self.num_res_blocks = num_res_blocks
        if disable_self_attentions is not None:
            # should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not
            assert len(disable_self_attentions) == len(channel_mult)
        if num_attention_blocks is not None:
            assert len(num_attention_blocks) == len(self.num_res_blocks)
            assert all(map(lambda i: self.num_res_blocks[i] >= num_attention_blocks[i], range(len(num_attention_blocks))))
            print(f"Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. "
                  f"This option has LESS priority than attention_resolutions {attention_resolutions}, "
                  f"i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, "
                  f"attention will still not be set.")

        self.attention_resolutions = attention_resolutions
        self.dropout = dropout
        self.channel_mult = channel_mult
        self.conv_resample = conv_resample
        self.use_checkpoint = use_checkpoint
        self.dtype = th.float16 if use_fp16 else th.float32
        self.num_heads = num_heads
        self.num_head_channels = num_head_channels
        self.num_heads_upsample = num_heads_upsample
        self.predict_codebook_ids = n_embed is not None

        time_embed_dim = model_channels * 4
        self.time_embed = nn.Sequential(
            linear(model_channels, time_embed_dim),
            nn.SiLU(),
            linear(time_embed_dim, time_embed_dim),
        )

        self.input_blocks = nn.ModuleList(
            [
                TimestepEmbedSequential(
                    conv_nd(dims, in_channels, model_channels, 3, padding=1)
                )
            ]
        )
        self.zero_convs = nn.ModuleList([self.make_zero_conv(model_channels)])
        self.secret_len = secret_len
        if secret_len > 0:
            log_resolution = int(np.log2(64))
            self.input_hint_block = TimestepEmbedSequential(
                nn.Linear(secret_len, 16 * 16 * 4),
                nn.SiLU(),
                View(-1, 4, 16, 16),
                nn.Upsample(scale_factor=(2**(log_resolution - 4), 2**(log_resolution - 4))),
                conv_nd(dims, 4, 64, 3, padding=1),
                nn.SiLU(),
                conv_nd(dims, 64, 256, 3, padding=1),
                nn.SiLU(),
                zero_module(conv_nd(dims, 256, model_channels, 3, padding=1))
            )
        else:
            self.input_hint_block = TimestepEmbedSequential(
                conv_nd(dims, hint_channels, 16, 3, padding=1),
                nn.SiLU(),
                conv_nd(dims, 16, 16, 3, padding=1),
                nn.SiLU(),
                conv_nd(dims, 16, 32, 3, padding=1, stride=2),
                nn.SiLU(),
                conv_nd(dims, 32, 32, 3, padding=1),
                nn.SiLU(),
                conv_nd(dims, 32, 96, 3, padding=1, stride=2),
                nn.SiLU(),
                conv_nd(dims, 96, 96, 3, padding=1),
                nn.SiLU(),
                conv_nd(dims, 96, 256, 3, padding=1, stride=2),
                nn.SiLU(),
                zero_module(conv_nd(dims, 256, model_channels, 3, padding=1))
            )

        self._feature_size = model_channels
        input_block_chans = [model_channels]
        ch = model_channels
        ds = 1
        for level, mult in enumerate(channel_mult):
            for nr in range(self.num_res_blocks[level]):
                layers = [
                    ResBlock(
                        ch,
                        time_embed_dim,
                        dropout,
                        out_channels=mult * model_channels,
                        dims=dims,
                        use_checkpoint=use_checkpoint,
                        use_scale_shift_norm=use_scale_shift_norm,
                    )
                ]
                ch = mult * model_channels
                if ds in attention_resolutions:
                    if num_head_channels == -1:
                        dim_head = ch // num_heads
                    else:
                        num_heads = ch // num_head_channels
                        dim_head = num_head_channels
                    if legacy:
                        # num_heads = 1
                        dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
                    if exists(disable_self_attentions):
                        disabled_sa = disable_self_attentions[level]
                    else:
                        disabled_sa = False

                    if not exists(num_attention_blocks) or nr < num_attention_blocks[level]:
                        layers.append(
                            AttentionBlock(
                                ch,
                                use_checkpoint=use_checkpoint,
                                num_heads=num_heads,
                                num_head_channels=dim_head,
                                use_new_attention_order=use_new_attention_order,
                            ) if not use_spatial_transformer else SpatialTransformer(
                                ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
                                disable_self_attn=disabled_sa, use_linear=use_linear_in_transformer,
                                use_checkpoint=use_checkpoint
                            )
                        )
                self.input_blocks.append(TimestepEmbedSequential(*layers))
                self.zero_convs.append(self.make_zero_conv(ch))
                self._feature_size += ch
                input_block_chans.append(ch)
            if level != len(channel_mult) - 1:
                out_ch = ch
                self.input_blocks.append(
                    TimestepEmbedSequential(
                        ResBlock(
                            ch,
                            time_embed_dim,
                            dropout,
                            out_channels=out_ch,
                            dims=dims,
                            use_checkpoint=use_checkpoint,
                            use_scale_shift_norm=use_scale_shift_norm,
                            down=True,
                        )
                        if resblock_updown
                        else Downsample(
                            ch, conv_resample, dims=dims, out_channels=out_ch
                        )
                    )
                )
                ch = out_ch
                input_block_chans.append(ch)
                self.zero_convs.append(self.make_zero_conv(ch))
                ds *= 2
                self._feature_size += ch

        if num_head_channels == -1:
            dim_head = ch // num_heads
        else:
            num_heads = ch // num_head_channels
            dim_head = num_head_channels
        if legacy:
            # num_heads = 1
            dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
        self.middle_block = TimestepEmbedSequential(
            ResBlock(
                ch,
                time_embed_dim,
                dropout,
                dims=dims,
                use_checkpoint=use_checkpoint,
                use_scale_shift_norm=use_scale_shift_norm,
            ),
            AttentionBlock(
                ch,
                use_checkpoint=use_checkpoint,
                num_heads=num_heads,
                num_head_channels=dim_head,
                use_new_attention_order=use_new_attention_order,
            ) if not use_spatial_transformer else SpatialTransformer(  # always uses a self-attn
                ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
                disable_self_attn=disable_middle_self_attn, use_linear=use_linear_in_transformer,
                use_checkpoint=use_checkpoint
            ),
            ResBlock(
                ch,
                time_embed_dim,
                dropout,
                dims=dims,
                use_checkpoint=use_checkpoint,
                use_scale_shift_norm=use_scale_shift_norm,
            ),
        )
        self.middle_block_out = self.make_zero_conv(ch)
        self._feature_size += ch

    def make_zero_conv(self, channels):
        return TimestepEmbedSequential(zero_module(conv_nd(self.dims, channels, channels, 1, padding=0)))

    def forward(self, x, hint, timesteps, context, **kwargs):
        t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
        emb = self.time_embed(t_emb)
        guided_hint = self.input_hint_block(hint, emb, context)
        # import pdb; pdb.set_trace()
        outs = []

        h = x.type(self.dtype)
        for module, zero_conv in zip(self.input_blocks, self.zero_convs):
            if guided_hint is not None:
                h = module(h, emb, context)
                h += guided_hint
                guided_hint = None
            else:
                h = module(h, emb, context)
            outs.append(zero_conv(h, emb, context))

        h = self.middle_block(h, emb, context)
        outs.append(self.middle_block_out(h, emb, context))

        return outs


class SecretDecoder(nn.Module):
    def __init__(self, arch='resnet50', secret_len=100):
        super().__init__()
        self.arch = arch
        print(f'SecretDecoder arch: {arch}')
        self.resolution = 224
        if arch == 'resnet50':
            self.decoder = torchvision.models.resnet50(pretrained=True, progress=False)
            self.decoder.fc = nn.Linear(self.decoder.fc.in_features, secret_len)
        elif arch == 'resnet18':
            self.decoder = torchvision.models.resnet18(pretrained=True, progress=False)
            self.decoder.fc = nn.Linear(self.decoder.fc.in_features, secret_len)
        else:
            raise NotImplementedError

    def forward(self, image):
        if self.arch in ['resnet50', 'resnet18'] and image.shape[-1] > self.resolution:
            image = thf.interpolate(image, size=(self.resolution, self.resolution), mode='bilinear', align_corners=False)
        x = self.decoder(image)
        return x


class ControlLDM(LatentDiffusion):

    def __init__(self, control_stage_config, control_key, only_mid_control, secret_decoder_config, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.control_model = instantiate_from_config(control_stage_config)
        self.control_key = control_key
        self.only_mid_control = only_mid_control

        self.secret_decoder = None if secret_decoder_config == 'none' else instantiate_from_config(secret_decoder_config)
        self.secret_loss_layer = nn.BCEWithLogitsLoss()

    @torch.no_grad()
    def get_input(self, batch, k, bs=None, *args, **kwargs):
        x, c = super().get_input(batch, self.first_stage_key, *args, **kwargs)
        control = batch[self.control_key]
        if bs is not None:
            control = control[:bs]
        control = control.to(self.device)
        if self.control_key == 'hint':
            control = einops.rearrange(control, 'b h w c -> b c h w')
        control = control.to(memory_format=torch.contiguous_format).float()
        return x, dict(c_crossattn=[c], c_concat=[control])

    def apply_model(self, x_noisy, t, cond, *args, **kwargs):
        assert isinstance(cond, dict)
        diffusion_model = self.model.diffusion_model
        cond_txt = torch.cat(cond['c_crossattn'], 1)
        cond_hint = torch.cat(cond['c_concat'], 1)

        control = self.control_model(x=x_noisy, hint=cond_hint, timesteps=t, context=cond_txt)
        eps = diffusion_model(x=x_noisy, timesteps=t, context=cond_txt, control=control, only_mid_control=self.only_mid_control)

        return eps

    def p_losses(self, x_start, cond, t, noise=None):
        noise = default(noise, lambda: torch.randn_like(x_start))
        x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
        model_output = self.apply_model(x_noisy, t, cond)
        loss_dict = {}
        prefix = 'train' if self.training else 'val'

        if self.parameterization == "x0":
            target = x_start
            x_recon = model_output
        elif self.parameterization == "eps":
            target = noise
            x_recon = self.predict_start_from_noise(x_noisy, t, noise=model_output)
        elif self.parameterization == "v":
            target = self.get_v(x_start, noise, t)
        else:
            raise NotImplementedError()

        loss_simple = self.get_loss(model_output, target, mean=False).mean([1, 2, 3])
        loss_dict.update({f'{prefix}/loss_simple': loss_simple.mean()})

        logvar_t = self.logvar[t].to(self.device)
        loss = loss_simple / torch.exp(logvar_t) + logvar_t
        # loss = loss_simple / torch.exp(self.logvar) + self.logvar
        if self.learn_logvar:
            loss_dict.update({f'{prefix}/loss_gamma': loss.mean()})
            loss_dict.update({'logvar': self.logvar.data.mean()})

        loss = self.l_simple_weight * loss.mean()

        loss_vlb = self.get_loss(model_output, target, mean=False).mean(dim=(1, 2, 3))
        loss_vlb = (self.lvlb_weights[t] * loss_vlb).mean()
        loss_dict.update({f'{prefix}/loss_vlb': loss_vlb})
        loss += (self.original_elbo_weight * loss_vlb)
        # secret decode
        if self.secret_decoder is not None:
            simple_loss_weight = 0.1
            x_recon = self.differentiable_decode_first_stage(x_recon)
            secret_pred = self.secret_decoder(x_recon)
            secret = cond['c_concat'][0]
            loss_secret = self.secret_loss_layer(secret_pred, secret)
            bit_acc = ((secret_pred.detach() > 0).float() == secret).float().mean()
            loss_dict.update({f'{prefix}/bit_acc': bit_acc})
            loss_dict.update({f'{prefix}/loss_secret': loss_secret})
            loss = (loss * simple_loss_weight + loss_secret) / (simple_loss_weight + 1)

        loss_dict.update({f'{prefix}/loss': loss})
        return loss, loss_dict

    def differentiable_decode_first_stage(self, z, predict_cids=False, force_not_quantize=False):
        if predict_cids:
            if z.dim() == 4:
                z = torch.argmax(z.exp(), dim=1).long()
            z = self.first_stage_model.quantize.get_codebook_entry(z, shape=None)
            z = rearrange(z, 'b h w c -> b c h w').contiguous()

        z = 1. / self.scale_factor * z
        return self.first_stage_model.decode(z)

    @torch.no_grad()
    def get_unconditional_conditioning(self, N):
        return self.get_learned_conditioning([""] * N)

    @torch.no_grad()
    def log_images(self, batch, N=4, n_row=2, sample=False, ddim_steps=50, ddim_eta=0.0, return_keys=None,
                   quantize_denoised=True, inpaint=True, plot_denoise_rows=False, plot_progressive_rows=True,
                   plot_diffusion_rows=False, unconditional_guidance_scale=9.0, unconditional_guidance_label=None,
                   use_ema_scope=True,
                   **kwargs):
        use_ddim = ddim_steps is not None

        log = dict()
        z, c = self.get_input(batch, self.first_stage_key, bs=N)
        c_cat, c = c["c_concat"][0][:N], c["c_crossattn"][0][:N]
        N = min(z.shape[0], N)
        n_row = min(z.shape[0], n_row)
        log["reconstruction"] = self.decode_first_stage(z)
        # log["control"] = c_cat * 2.0 - 1.0
        log["conditioning"] = log_txt_as_img((512, 512), batch[self.cond_stage_key], size=16)

        if plot_diffusion_rows:
            # get diffusion row
            diffusion_row = list()
            z_start = z[:n_row]
            for t in range(self.num_timesteps):
                if t % self.log_every_t == 0 or t == self.num_timesteps - 1:
                    t = repeat(torch.tensor([t]), '1 -> b', b=n_row)
                    t = t.to(self.device).long()
                    noise = torch.randn_like(z_start)
                    z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise)
                    diffusion_row.append(self.decode_first_stage(z_noisy))

            diffusion_row = torch.stack(diffusion_row)  # n_log_step, n_row, C, H, W
            diffusion_grid = rearrange(diffusion_row, 'n b c h w -> b n c h w')
            diffusion_grid = rearrange(diffusion_grid, 'b n c h w -> (b n) c h w')
            diffusion_grid = make_grid(diffusion_grid, nrow=diffusion_row.shape[0])
            log["diffusion_row"] = diffusion_grid

        if sample:
            # get denoise row
            samples, z_denoise_row = self.sample_log(cond={"c_concat": [c_cat], "c_crossattn": [c]},
                                                     batch_size=N, ddim=use_ddim,
                                                     ddim_steps=ddim_steps, eta=ddim_eta)
            x_samples = self.decode_first_stage(samples)
            log["samples"] = x_samples
            if plot_denoise_rows:
                denoise_grid = self._get_denoise_row_from_list(z_denoise_row)
                log["denoise_row"] = denoise_grid
        # import pudb; pudb.set_trace()
        if unconditional_guidance_scale > 1.0:
            uc_cross = self.get_unconditional_conditioning(N)
            uc_cat = c_cat  # torch.zeros_like(c_cat)
            uc_full = {"c_concat": [uc_cat], "c_crossattn": [uc_cross]}
            samples_cfg, _ = self.sample_log(cond={"c_concat": [c_cat], "c_crossattn": [c]},
                                             batch_size=N, ddim=use_ddim,
                                             ddim_steps=ddim_steps, eta=ddim_eta,
                                             unconditional_guidance_scale=unconditional_guidance_scale,
                                             unconditional_conditioning=uc_full,
                                             )
            x_samples_cfg = self.decode_first_stage(samples_cfg)
            log[f"samples_cfg_scale_{unconditional_guidance_scale:.2f}"] = x_samples_cfg

        return log

    @torch.no_grad()
    def sample_log(self, cond, batch_size, ddim, ddim_steps, **kwargs):
        ddim_sampler = DDIMSampler(self)
        # import pdb; pdb.set_trace()
        # b, c, h, w = cond["c_concat"][0].shape
        b, c, h, w = cond["c_concat"][0].shape[0], self.channels, self.image_size * 8, self.image_size * 8
        shape = (self.channels, h // 8, w // 8)
        samples, intermediates = ddim_sampler.sample(ddim_steps, batch_size, shape, cond, verbose=False, **kwargs)
        return samples, intermediates

    def configure_optimizers(self):
        lr = self.learning_rate
        params = list(self.control_model.parameters())
        if self.secret_decoder is not None:
            params += list(self.secret_decoder.parameters())
        if not self.sd_locked:
            params += list(self.model.diffusion_model.output_blocks.parameters())
            params += list(self.model.diffusion_model.out.parameters())
        opt = torch.optim.AdamW(params, lr=lr)
        return opt

    def low_vram_shift(self, is_diffusing):
        if is_diffusing:
            self.model = self.model.cuda()
            self.control_model = self.control_model.cuda()
            self.first_stage_model = self.first_stage_model.cpu()
            self.cond_stage_model = self.cond_stage_model.cpu()
        else:
            self.model = self.model.cpu()
            self.control_model = self.control_model.cpu()
            self.first_stage_model = self.first_stage_model.cuda()
            self.cond_stage_model = self.cond_stage_model.cuda()
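As a quick reference (not part of the committed file): the decoder's logits are compared against the embedded bits with BCEWithLogitsLoss, and bit accuracy thresholds the logits at zero, exactly as in p_losses above. A minimal stand-alone sketch with made-up tensors:

# Illustrative only; decoder_out stands in for SecretDecoder(image) logits.
import torch
import torch.nn as nn

secret_len, batch = 100, 4
decoder_out = torch.randn(batch, secret_len)
secret = torch.bernoulli(torch.rand(batch, secret_len))

loss_secret = nn.BCEWithLogitsLoss()(decoder_out, secret)            # same loss layer as ControlLDM
bit_acc = ((decoder_out > 0).float() == secret).float().mean()        # same bit-accuracy metric
print(loss_secret.item(), bit_acc.item())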
cldm/hack.py
ADDED
@@ -0,0 +1,113 @@
import torch
import einops

import ldm.modules.encoders.modules
import ldm.modules.attention

from transformers import logging
from ldm.modules.attention import default
import warnings

def disable_verbosity():
    logging.set_verbosity_error()
    warnings.filterwarnings(action='ignore', category=DeprecationWarning)
    warnings.filterwarnings(action='ignore', category=UserWarning)
    print('logging improved.')
    return


def enable_sliced_attention():
    ldm.modules.attention.CrossAttention.forward = _hacked_sliced_attentin_forward
    print('Enabled sliced_attention.')
    return


def hack_everything(clip_skip=0):
    disable_verbosity()
    ldm.modules.encoders.modules.FrozenCLIPEmbedder.forward = _hacked_clip_forward
    ldm.modules.encoders.modules.FrozenCLIPEmbedder.clip_skip = clip_skip
    print('Enabled clip hacks.')
    return


# Written by Lvmin
def _hacked_clip_forward(self, text):
    PAD = self.tokenizer.pad_token_id
    EOS = self.tokenizer.eos_token_id
    BOS = self.tokenizer.bos_token_id

    def tokenize(t):
        return self.tokenizer(t, truncation=False, add_special_tokens=False)["input_ids"]

    def transformer_encode(t):
        if self.clip_skip > 1:
            rt = self.transformer(input_ids=t, output_hidden_states=True)
            return self.transformer.text_model.final_layer_norm(rt.hidden_states[-self.clip_skip])
        else:
            return self.transformer(input_ids=t, output_hidden_states=False).last_hidden_state

    def split(x):
        return x[75 * 0: 75 * 1], x[75 * 1: 75 * 2], x[75 * 2: 75 * 3]

    def pad(x, p, i):
        return x[:i] if len(x) >= i else x + [p] * (i - len(x))

    raw_tokens_list = tokenize(text)
    tokens_list = []

    for raw_tokens in raw_tokens_list:
        raw_tokens_123 = split(raw_tokens)
        raw_tokens_123 = [[BOS] + raw_tokens_i + [EOS] for raw_tokens_i in raw_tokens_123]
        raw_tokens_123 = [pad(raw_tokens_i, PAD, 77) for raw_tokens_i in raw_tokens_123]
        tokens_list.append(raw_tokens_123)

    tokens_list = torch.IntTensor(tokens_list).to(self.device)

    feed = einops.rearrange(tokens_list, 'b f i -> (b f) i')
    y = transformer_encode(feed)
    z = einops.rearrange(y, '(b f) i c -> b (f i) c', f=3)

    return z


# Stolen from https://github.com/basujindal/stable-diffusion/blob/main/optimizedSD/splitAttention.py
def _hacked_sliced_attentin_forward(self, x, context=None, mask=None):
    h = self.heads

    q = self.to_q(x)
    context = default(context, x)
    k = self.to_k(context)
    v = self.to_v(context)
    del context, x

    q, k, v = map(lambda t: einops.rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))

    limit = k.shape[0]
    att_step = 1
    q_chunks = list(torch.tensor_split(q, limit // att_step, dim=0))
    k_chunks = list(torch.tensor_split(k, limit // att_step, dim=0))
    v_chunks = list(torch.tensor_split(v, limit // att_step, dim=0))

    q_chunks.reverse()
    k_chunks.reverse()
    v_chunks.reverse()
    sim = torch.zeros(q.shape[0], q.shape[1], v.shape[2], device=q.device)
    del k, q, v
    for i in range(0, limit, att_step):
        q_buffer = q_chunks.pop()
        k_buffer = k_chunks.pop()
        v_buffer = v_chunks.pop()
        sim_buffer = torch.einsum('b i d, b j d -> b i j', q_buffer, k_buffer) * self.scale

        del k_buffer, q_buffer
        # attention, what we cannot get enough of, by chunks

        sim_buffer = sim_buffer.softmax(dim=-1)

        sim_buffer = torch.einsum('b i j, b j d -> b i d', sim_buffer, v_buffer)
        del v_buffer
        sim[i:i + att_step, :, :] = sim_buffer

        del sim_buffer
    sim = einops.rearrange(sim, '(b h) n d -> b n (h d)', h=h)
    return self.to_out(sim)
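A small illustration (not part of the commit) of the 3 x 75-token windowing that _hacked_clip_forward applies above: each window is wrapped with BOS/EOS and padded to 77 tokens. The special-token ids below are typical CLIP values and are an assumption here.

def split(x):
    return x[0:75], x[75:150], x[150:225]

def pad(x, p, i):
    return x[:i] if len(x) >= i else x + [p] * (i - len(x))

BOS, EOS, PAD = 49406, 49407, 49407   # assumed CLIP tokenizer ids
raw = list(range(100))                 # pretend token ids for a long prompt
chunks = [pad([BOS] + c + [EOS], PAD, 77) for c in split(raw)]
print([len(c) for c in chunks])        # [77, 77, 77]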
cldm/logger.py
ADDED
@@ -0,0 +1,149 @@
import os
from omegaconf import OmegaConf
import numpy as np
import torch
import torchvision
from PIL import Image
from pytorch_lightning.callbacks import Callback
from pytorch_lightning.utilities.distributed import rank_zero_only
from pytorch_lightning.utilities import rank_zero_info
import time


class CUDACallback(Callback):
    # see https://github.com/SeanNaren/minGPT/blob/master/mingpt/callback.py
    def on_train_epoch_start(self, trainer, pl_module):
        # Reset the memory use counter
        torch.cuda.reset_peak_memory_stats(trainer.root_gpu)
        torch.cuda.synchronize(trainer.root_gpu)
        self.start_time = time.time()

    def on_train_epoch_end(self, trainer, pl_module, outputs):
        torch.cuda.synchronize(trainer.root_gpu)
        max_memory = torch.cuda.max_memory_allocated(trainer.root_gpu) / 2 ** 20
        epoch_time = (time.time() - self.start_time) / 3600

        try:
            max_memory = trainer.training_type_plugin.reduce(max_memory)
            epoch_time = trainer.training_type_plugin.reduce(epoch_time)

            rank_zero_info(f"Average Epoch time: {epoch_time:.2f} hours")
            rank_zero_info(f"Average Peak memory {max_memory:.2f}MiB")
        except AttributeError:
            pass


class SetupCallback(Callback):
    def __init__(self, resume, now, logdir, ckptdir, cfgdir, config, lightning_config):
        super().__init__()
        self.resume = resume
        self.now = now
        self.logdir = logdir
        self.ckptdir = ckptdir
        self.cfgdir = cfgdir
        self.config = config
        self.lightning_config = lightning_config

    def on_keyboard_interrupt(self, trainer, pl_module):
        if trainer.global_rank == 0:
            print("Summoning checkpoint.")
            ckpt_path = os.path.join(self.ckptdir, "last.ckpt")
            trainer.save_checkpoint(ckpt_path)

    def on_pretrain_routine_start(self, trainer, pl_module):
        if trainer.global_rank == 0:
            # Create logdirs and save configs
            os.makedirs(self.logdir, exist_ok=True)
            os.makedirs(self.ckptdir, exist_ok=True)
            os.makedirs(self.cfgdir, exist_ok=True)

            if "callbacks" in self.lightning_config:
                if 'metrics_over_trainsteps_checkpoint' in self.lightning_config['callbacks']:
                    os.makedirs(os.path.join(self.ckptdir, 'trainstep_checkpoints'), exist_ok=True)
            print("Project config")
            print(OmegaConf.to_yaml(self.config))
            OmegaConf.save(self.config,
                           os.path.join(self.cfgdir, "{}-project.yaml".format(self.now)))

            print("Lightning config")
            print(OmegaConf.to_yaml(self.lightning_config))
            OmegaConf.save(OmegaConf.create({"lightning": self.lightning_config}),
                           os.path.join(self.cfgdir, "{}-lightning.yaml".format(self.now)))

        else:
            # ModelCheckpoint callback created log directory --- remove it
            if not self.resume and os.path.exists(self.logdir):
                dst, name = os.path.split(self.logdir)
                dst = os.path.join(dst, "child_runs", name)
                os.makedirs(os.path.split(dst)[0], exist_ok=True)
                try:
                    os.rename(self.logdir, dst)
                except FileNotFoundError:
                    pass

class ImageLogger(Callback):
    def __init__(self, batch_frequency=2000, max_images=4, clamp=True, increase_log_steps=True,
                 rescale=True, disabled=False, log_on_batch_idx=False, log_first_step=False,
                 log_images_kwargs=None, fixed_input=False):
        super().__init__()
        self.rescale = rescale
        self.batch_freq = batch_frequency
        self.max_images = max_images
        if not increase_log_steps:
            self.log_steps = [self.batch_freq]
        self.clamp = clamp
        self.disabled = disabled
        self.log_on_batch_idx = log_on_batch_idx
        self.log_images_kwargs = log_images_kwargs if log_images_kwargs else {}
        self.log_first_step = log_first_step
        self.fixed_input = fixed_input

    @rank_zero_only
    def log_local(self, save_dir, split, images, global_step, current_epoch, batch_idx):
        root = os.path.join(save_dir, "image_log", split)
        for k in images:
            grid = torchvision.utils.make_grid(images[k], nrow=4)
            if self.rescale:
                grid = (grid + 1.0) / 2.0  # -1,1 -> 0,1; c,h,w
            grid = grid.transpose(0, 1).transpose(1, 2).squeeze(-1)
            grid = grid.numpy()
            grid = (grid * 255).astype(np.uint8)
            filename = "{}_gs-{:06}_e-{:06}_b-{:06}.png".format(k, global_step, current_epoch, batch_idx)
            path = os.path.join(root, filename)
            os.makedirs(os.path.split(path)[0], exist_ok=True)
            Image.fromarray(grid).save(path)

    def log_img(self, pl_module, batch, batch_idx, split="train"):
        check_idx = batch_idx  # if self.log_on_batch_idx else pl_module.global_step
        if (self.check_frequency(check_idx) and  # batch_idx % self.batch_freq == 0
                hasattr(pl_module, "log_images") and
                callable(pl_module.log_images) and
                self.max_images > 0):
            logger = type(pl_module.logger)

            is_train = pl_module.training
            if is_train:
                pl_module.eval()

            with torch.no_grad():
                images = pl_module.log_images(batch, fixed_input=self.fixed_input, split=split, **self.log_images_kwargs)

            for k in images:
                N = min(images[k].shape[0], self.max_images)
                images[k] = images[k][:N]
                if isinstance(images[k], torch.Tensor):
                    images[k] = images[k].detach().cpu()
                    if self.clamp:
                        images[k] = torch.clamp(images[k], -1., 1.)
            self.log_local(pl_module.logger.save_dir, split, images,
                           pl_module.global_step, pl_module.current_epoch, batch_idx)

            if is_train:
                pl_module.train()

    def check_frequency(self, check_idx):
        return check_idx % self.batch_freq == 0

    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx):
        if not self.disabled:
            self.log_img(pl_module, batch, batch_idx, split="train")
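For reference, ImageLogger.log_local above writes each logged tensor as a single PNG grid under save_dir/image_log/split. The following standalone sketch reproduces that conversion on a fake batch so the rescaling from [-1, 1] to uint8 is easy to see; permute(1, 2, 0) is used in place of the equivalent transpose calls, and the filename is only an example, not one the repo produces.

import numpy as np
import torch
import torchvision
from PIL import Image

images = torch.rand(8, 3, 64, 64) * 2 - 1                   # fake batch of images in [-1, 1]
grid = torchvision.utils.make_grid(images, nrow=4)           # (3, H, W) grid, still in [-1, 1]
grid = (grid + 1.0) / 2.0                                     # rescale=True path: -1,1 -> 0,1
grid = grid.permute(1, 2, 0).numpy()                          # CHW -> HWC for PIL
Image.fromarray((grid * 255).astype(np.uint8)).save("example_gs-000000_e-000000_b-000000.png")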
cldm/loss.py
ADDED
@@ -0,0 +1,78 @@
import torch
import torch.nn as nn
from lpips import LPIPS
from kornia import color
# from taming.modules.losses.vqperceptual import *

class ImageSecretLoss(nn.Module):
    def __init__(self, recon_type='rgb', recon_weight=1., perceptual_weight=1.0, secret_weight=10., kl_weight=0.000001, logvar_init=0.0, ramp=100000, max_image_weight_ratio=2.) -> None:
        super().__init__()
        self.recon_type = recon_type
        assert recon_type in ['rgb', 'yuv']
        if recon_type == 'yuv':
            self.register_buffer('yuv_scales', torch.tensor([1,100,100]).unsqueeze(1).float())  # [3,1]
        self.recon_weight = recon_weight
        self.perceptual_weight = perceptual_weight
        self.secret_weight = secret_weight
        self.kl_weight = kl_weight

        self.ramp = ramp
        self.max_image_weight = max_image_weight_ratio * secret_weight - 1
        self.register_buffer('ramp_on', torch.tensor(False))
        self.register_buffer('step0', torch.tensor(1e9))  # large number

        self.perceptual_loss = LPIPS().eval()
        self.logvar = nn.Parameter(torch.ones(size=()) * logvar_init)
        self.bce = nn.BCEWithLogitsLoss(reduction="none")

    def activate_ramp(self, global_step):
        if not self.ramp_on:  # do not activate ramp twice
            self.step0 = torch.tensor(global_step)
            self.ramp_on = ~self.ramp_on
            print('[TRAINING] Activate ramp for image loss at step ', global_step)

    def compute_recon_loss(self, inputs, reconstructions):
        if self.recon_type == 'rgb':
            rec_loss = torch.abs(inputs - reconstructions).mean(dim=[1,2,3])
        elif self.recon_type == 'yuv':
            reconstructions_yuv = color.rgb_to_yuv((reconstructions + 1) / 2)
            inputs_yuv = color.rgb_to_yuv((inputs + 1) / 2)
            yuv_loss = torch.mean((reconstructions_yuv - inputs_yuv)**2, dim=[2,3])
            rec_loss = torch.mm(yuv_loss, self.yuv_scales).squeeze(1)
        else:
            raise ValueError(f"Unknown recon type {self.recon_type}")
        return rec_loss

    def forward(self, inputs, reconstructions, posteriors, secret_gt, secret_pred, global_step):
        loss_dict = {}
        rec_loss = self.compute_recon_loss(inputs.contiguous(), reconstructions.contiguous())

        loss = rec_loss*self.recon_weight

        if self.perceptual_weight > 0:
            p_loss = self.perceptual_loss(inputs.contiguous(), reconstructions.contiguous()).mean(dim=[1,2,3])
            loss += self.perceptual_weight * p_loss
            loss_dict['p_loss'] = p_loss.mean()

        loss = loss / torch.exp(self.logvar) + self.logvar
        if self.kl_weight > 0:
            kl_loss = posteriors.kl()
            loss += kl_loss*self.kl_weight
            loss_dict['kl_loss'] = kl_loss.mean()

        image_weight = 1 + min(self.max_image_weight, max(0., self.max_image_weight*(global_step - self.step0.item())/self.ramp))

        secret_loss = self.bce(secret_pred, secret_gt).mean(dim=1)
        loss = (loss*image_weight + secret_loss*self.secret_weight) / (image_weight+self.secret_weight)

        # loss dict update
        bit_acc = ((secret_pred.detach() > 0).float() == secret_gt).float().mean()
        loss_dict['bit_acc'] = bit_acc
        loss_dict['loss'] = loss.mean()
        loss_dict['img_lw'] = image_weight/self.secret_weight
        loss_dict['rec_loss'] = rec_loss.mean()
        loss_dict['secret_loss'] = secret_loss.mean()

        return loss.mean(), loss_dict
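The balance between image fidelity and secret recovery in ImageSecretLoss.forward is controlled by image_weight: it stays at 1 until activate_ramp is called, then grows linearly toward max_image_weight_ratio * secret_weight - 1 over ramp steps, and the final loss is the weighted average (loss * image_weight + secret_loss * secret_weight) / (image_weight + secret_weight). A small standalone sketch of that schedule, using the default weights and an assumed activation step of 50k (step0 is hypothetical):

# Sketch of the image-weight ramp above; step0 = 50000 is an assumed activation step.
secret_weight = 10.0
max_image_weight = 2.0 * secret_weight - 1        # max_image_weight_ratio=2 -> 19
ramp, step0 = 100000, 50000

for step in (50000, 100000, 150000, 300000):
    image_weight = 1 + min(max_image_weight, max(0.0, max_image_weight * (step - step0) / ramp))
    print(step, image_weight)                     # 1.0, 10.5, 20.0, 20.0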
cldm/loss_weight_scheduler.py
ADDED
@@ -0,0 +1,17 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""

@author: Tu Bui @University of Surrey
"""

class SimpleLossWeightScheduler(object):
    def __init__(self, simple_loss_weight_max=10., wait_steps=50000, ramp=100000) -> None:
        self.simple_loss_weight_max = simple_loss_weight_max
        self.wait_steps = wait_steps
        self.ramp = ramp

    def __call__(self, step):
        max_weight = self.simple_loss_weight_max - 1
        w = 1 + min(max_weight, max(0., max_weight*(step - self.wait_steps)/self.ramp))
        return w
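With the defaults above, the returned weight stays at 1 for the first wait_steps training steps, then ramps linearly to simple_loss_weight_max over the next ramp steps and stays there. A quick usage sketch:

from cldm.loss_weight_scheduler import SimpleLossWeightScheduler

sched = SimpleLossWeightScheduler(simple_loss_weight_max=10., wait_steps=50000, ramp=100000)
for step in (0, 50000, 100000, 150000, 200000):
    print(step, sched(step))   # 1.0, 1.0, 5.5, 10.0, 10.0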
cldm/model.py
ADDED
@@ -0,0 +1,28 @@
import os
import torch

from omegaconf import OmegaConf
from ldm.util import instantiate_from_config


def get_state_dict(d):
    return d.get('state_dict', d)


def load_state_dict(ckpt_path, location='cpu'):
    _, extension = os.path.splitext(ckpt_path)
    if extension.lower() == ".safetensors":
        import safetensors.torch
        state_dict = safetensors.torch.load_file(ckpt_path, device=location)
    else:
        state_dict = get_state_dict(torch.load(ckpt_path, map_location=torch.device(location)))
    state_dict = get_state_dict(state_dict)
    print(f'Loaded state_dict from [{ckpt_path}]')
    return state_dict


def create_model(config_path):
    config = OmegaConf.load(config_path)
    model = instantiate_from_config(config.model).cpu()
    print(f'Loaded model config from [{config_path}]')
    return model
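These two helpers are typically combined to restore a trained checkpoint: create_model builds the module from an OmegaConf YAML config and load_state_dict normalises .ckpt or .safetensors files into a plain state dict. A minimal usage sketch with placeholder paths; passing strict=False is an assumption here (handy when a checkpoint carries extra keys), not something the repo mandates.

from cldm.model import create_model, load_state_dict

model = create_model("configs/-project.yaml")                    # placeholder config path
sd = load_state_dict("checkpoints/last.ckpt", location="cpu")    # placeholder checkpoint path
model.load_state_dict(sd, strict=False)
model.eval()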
cldm/plms.py
ADDED
@@ -0,0 +1,1481 @@
1 |
+
"""SAMPLING ONLY."""
|
2 |
+
import os
|
3 |
+
import torch
|
4 |
+
from torch import nn
|
5 |
+
import torchvision
|
6 |
+
import numpy as np
|
7 |
+
from tqdm import tqdm
|
8 |
+
from functools import partial
|
9 |
+
from PIL import Image
|
10 |
+
import shutil
|
11 |
+
|
12 |
+
from ldm.modules.diffusionmodules.util import (
|
13 |
+
make_ddim_sampling_parameters,
|
14 |
+
make_ddim_timesteps,
|
15 |
+
noise_like,
|
16 |
+
)
|
17 |
+
import clip
|
18 |
+
from einops import rearrange
|
19 |
+
import random
|
20 |
+
|
21 |
+
|
22 |
+
class VGGPerceptualLoss(torch.nn.Module):
|
23 |
+
def __init__(self, resize=True):
|
24 |
+
super(VGGPerceptualLoss, self).__init__()
|
25 |
+
blocks = []
|
26 |
+
blocks.append(torchvision.models.vgg16(pretrained=True).features[:4].eval())
|
27 |
+
blocks.append(torchvision.models.vgg16(pretrained=True).features[4:9].eval())
|
28 |
+
blocks.append(torchvision.models.vgg16(pretrained=True).features[9:16].eval())
|
29 |
+
blocks.append(torchvision.models.vgg16(pretrained=True).features[16:23].eval())
|
30 |
+
for bl in blocks:
|
31 |
+
for p in bl.parameters():
|
32 |
+
p.requires_grad = False
|
33 |
+
self.blocks = torch.nn.ModuleList(blocks)
|
34 |
+
self.transform = torch.nn.functional.interpolate
|
35 |
+
self.resize = resize
|
36 |
+
self.register_buffer(
|
37 |
+
"mean", torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)
|
38 |
+
)
|
39 |
+
self.register_buffer(
|
40 |
+
"std", torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)
|
41 |
+
)
|
42 |
+
|
43 |
+
def forward(self, input, target, feature_layers=[0, 1, 2, 3], style_layers=[]):
|
44 |
+
input = (input - self.mean) / self.std
|
45 |
+
target = (target - self.mean) / self.std
|
46 |
+
if self.resize:
|
47 |
+
input = self.transform(
|
48 |
+
input, mode="bilinear", size=(224, 224), align_corners=False
|
49 |
+
)
|
50 |
+
target = self.transform(
|
51 |
+
target, mode="bilinear", size=(224, 224), align_corners=False
|
52 |
+
)
|
53 |
+
loss = 0.0
|
54 |
+
x = input
|
55 |
+
y = target
|
56 |
+
for i, block in enumerate(self.blocks):
|
57 |
+
x = block(x)
|
58 |
+
y = block(y)
|
59 |
+
if i in feature_layers:
|
60 |
+
loss += torch.nn.functional.l1_loss(x, y)
|
61 |
+
if i in style_layers:
|
62 |
+
act_x = x.reshape(x.shape[0], x.shape[1], -1)
|
63 |
+
act_y = y.reshape(y.shape[0], y.shape[1], -1)
|
64 |
+
gram_x = act_x @ act_x.permute(0, 2, 1)
|
65 |
+
gram_y = act_y @ act_y.permute(0, 2, 1)
|
66 |
+
loss += torch.nn.functional.l1_loss(gram_x, gram_y)
|
67 |
+
return loss
|
68 |
+
|
69 |
+
|
70 |
+
class DCLIPLoss(torch.nn.Module):
|
71 |
+
def __init__(self):
|
72 |
+
super(DCLIPLoss, self).__init__()
|
73 |
+
self.model, self.preprocess = clip.load("ViT-B/32", device="cuda")
|
74 |
+
self.upsample = torch.nn.Upsample(scale_factor=7)
|
75 |
+
self.avg_pool = torch.nn.AvgPool2d(kernel_size=16)
|
76 |
+
|
77 |
+
def forward(self, image1, image2, text1, text2):
|
78 |
+
text1 = clip.tokenize([text1]).to("cuda")
|
79 |
+
text2 = clip.tokenize([text2]).to("cuda")
|
80 |
+
image1 = image1.unsqueeze(0).cuda()
|
81 |
+
image2 = image2.unsqueeze(0)
|
82 |
+
image1 = self.avg_pool(self.upsample(image1))
|
83 |
+
image2 = self.avg_pool(self.upsample(image2))
|
84 |
+
image1_feat = self.model.encode_image(image1)
|
85 |
+
image2_feat = self.model.encode_image(image2)
|
86 |
+
text1_feat = self.model.encode_text(text1)
|
87 |
+
text2_feat = self.model.encode_text(text2)
|
88 |
+
d_image_feat = image1_feat - image2_feat
|
89 |
+
d_text_feat = text1_feat - text2_feat
|
90 |
+
similarity = torch.nn.CosineSimilarity()(d_image_feat, d_text_feat)
|
91 |
+
return 1 - similarity
|
92 |
+
|
93 |
+
|
94 |
+
class PLMSSampler(object):
|
95 |
+
def __init__(self, model, schedule="linear", **kwargs):
|
96 |
+
super().__init__()
|
97 |
+
self.model = model
|
98 |
+
self.ddpm_num_timesteps = model.num_timesteps
|
99 |
+
self.schedule = schedule
|
100 |
+
|
101 |
+
def register_buffer(self, name, attr):
|
102 |
+
if type(attr) == torch.Tensor:
|
103 |
+
if attr.device != torch.device("cuda"):
|
104 |
+
attr = attr.to(torch.device("cuda"))
|
105 |
+
setattr(self, name, attr)
|
106 |
+
|
107 |
+
def make_schedule(
|
108 |
+
self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0.0, verbose=True
|
109 |
+
):
|
110 |
+
if ddim_eta != 0:
|
111 |
+
raise ValueError("ddim_eta must be 0 for PLMS")
|
112 |
+
self.ddim_timesteps = make_ddim_timesteps(
|
113 |
+
ddim_discr_method=ddim_discretize,
|
114 |
+
num_ddim_timesteps=ddim_num_steps,
|
115 |
+
num_ddpm_timesteps=self.ddpm_num_timesteps,
|
116 |
+
verbose=verbose,
|
117 |
+
)
|
118 |
+
alphas_cumprod = self.model.alphas_cumprod
|
119 |
+
assert (
|
120 |
+
alphas_cumprod.shape[0] == self.ddpm_num_timesteps
|
121 |
+
), "alphas have to be defined for each timestep"
|
122 |
+
to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)
|
123 |
+
|
124 |
+
self.register_buffer("betas", to_torch(self.model.betas))
|
125 |
+
self.register_buffer("alphas_cumprod", to_torch(alphas_cumprod))
|
126 |
+
self.register_buffer(
|
127 |
+
"alphas_cumprod_prev", to_torch(self.model.alphas_cumprod_prev)
|
128 |
+
)
|
129 |
+
|
130 |
+
# calculations for diffusion q(x_t | x_{t-1}) and others
|
131 |
+
self.register_buffer(
|
132 |
+
"sqrt_alphas_cumprod", to_torch(np.sqrt(alphas_cumprod.cpu()))
|
133 |
+
)
|
134 |
+
self.register_buffer(
|
135 |
+
"sqrt_one_minus_alphas_cumprod",
|
136 |
+
to_torch(np.sqrt(1.0 - alphas_cumprod.cpu())),
|
137 |
+
)
|
138 |
+
self.register_buffer(
|
139 |
+
"log_one_minus_alphas_cumprod", to_torch(np.log(1.0 - alphas_cumprod.cpu()))
|
140 |
+
)
|
141 |
+
self.register_buffer(
|
142 |
+
"sqrt_recip_alphas_cumprod", to_torch(np.sqrt(1.0 / alphas_cumprod.cpu()))
|
143 |
+
)
|
144 |
+
self.register_buffer(
|
145 |
+
"sqrt_recipm1_alphas_cumprod",
|
146 |
+
to_torch(np.sqrt(1.0 / alphas_cumprod.cpu() - 1)),
|
147 |
+
)
|
148 |
+
|
149 |
+
# ddim sampling parameters
|
150 |
+
ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(
|
151 |
+
alphacums=alphas_cumprod.cpu(),
|
152 |
+
ddim_timesteps=self.ddim_timesteps,
|
153 |
+
eta=0.0,
|
154 |
+
verbose=verbose,
|
155 |
+
)
|
156 |
+
self.register_buffer("ddim_sigmas", ddim_sigmas)
|
157 |
+
self.register_buffer("ddim_alphas", ddim_alphas)
|
158 |
+
self.register_buffer("ddim_alphas_prev", ddim_alphas_prev)
|
159 |
+
self.register_buffer("ddim_sqrt_one_minus_alphas", np.sqrt(1.0 - ddim_alphas))
|
160 |
+
sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
|
161 |
+
(1 - self.alphas_cumprod_prev)
|
162 |
+
/ (1 - self.alphas_cumprod)
|
163 |
+
* (1 - self.alphas_cumprod / self.alphas_cumprod_prev)
|
164 |
+
)
|
165 |
+
self.register_buffer(
|
166 |
+
"ddim_sigmas_for_original_num_steps", sigmas_for_original_sampling_steps
|
167 |
+
)
|
168 |
+
|
169 |
+
@torch.no_grad()
|
170 |
+
def sample(self,
|
171 |
+
S,
|
172 |
+
batch_size,
|
173 |
+
shape,
|
174 |
+
conditioning=None,
|
175 |
+
callback=None,
|
176 |
+
normals_sequence=None,
|
177 |
+
img_callback=None,
|
178 |
+
quantize_x0=False,
|
179 |
+
eta=0.,
|
180 |
+
mask=None,
|
181 |
+
x0=None,
|
182 |
+
temperature=1.,
|
183 |
+
noise_dropout=0.,
|
184 |
+
score_corrector=None,
|
185 |
+
corrector_kwargs=None,
|
186 |
+
verbose=True,
|
187 |
+
x_T=None,
|
188 |
+
log_every_t=100,
|
189 |
+
unconditional_guidance_scale=1.,
|
190 |
+
unconditional_conditioning=None,
|
191 |
+
# this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
|
192 |
+
dynamic_threshold=None,
|
193 |
+
**kwargs
|
194 |
+
):
|
195 |
+
if conditioning is not None:
|
196 |
+
if isinstance(conditioning, dict):
|
197 |
+
cbs = conditioning[list(conditioning.keys())[0]][0].shape[0]
|
198 |
+
if cbs != batch_size:
|
199 |
+
print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
|
200 |
+
else:
|
201 |
+
if conditioning.shape[0] != batch_size:
|
202 |
+
print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")
|
203 |
+
|
204 |
+
self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)
|
205 |
+
# sampling
|
206 |
+
C, H, W = shape
|
207 |
+
size = (batch_size, C, H, W)
|
208 |
+
print(f'Data shape for PLMS sampling is {size}')
|
209 |
+
|
210 |
+
samples, intermediates = self.plms_sampling(conditioning, size,
|
211 |
+
callback=callback,
|
212 |
+
img_callback=img_callback,
|
213 |
+
quantize_denoised=quantize_x0,
|
214 |
+
mask=mask, x0=x0,
|
215 |
+
ddim_use_original_steps=False,
|
216 |
+
noise_dropout=noise_dropout,
|
217 |
+
temperature=temperature,
|
218 |
+
score_corrector=score_corrector,
|
219 |
+
corrector_kwargs=corrector_kwargs,
|
220 |
+
x_T=x_T,
|
221 |
+
log_every_t=log_every_t,
|
222 |
+
unconditional_guidance_scale=unconditional_guidance_scale,
|
223 |
+
unconditional_conditioning=unconditional_conditioning,
|
224 |
+
)
|
225 |
+
return samples, intermediates
|
226 |
+
|
227 |
+
@torch.no_grad()
|
228 |
+
def plms_sampling(
|
229 |
+
self,
|
230 |
+
cond,
|
231 |
+
shape,
|
232 |
+
x_T=None,
|
233 |
+
ddim_use_original_steps=False,
|
234 |
+
callback=None,
|
235 |
+
timesteps=None,
|
236 |
+
quantize_denoised=False,
|
237 |
+
mask=None,
|
238 |
+
x0=None,
|
239 |
+
img_callback=None,
|
240 |
+
log_every_t=100,
|
241 |
+
temperature=1.0,
|
242 |
+
noise_dropout=0.0,
|
243 |
+
score_corrector=None,
|
244 |
+
corrector_kwargs=None,
|
245 |
+
unconditional_guidance_scale=1.0,
|
246 |
+
unconditional_conditioning=None,
|
247 |
+
):
|
248 |
+
device = self.model.betas.device
|
249 |
+
b = shape[0]
|
250 |
+
if x_T is None:
|
251 |
+
img = torch.randn(shape, device=device)
|
252 |
+
else:
|
253 |
+
img = x_T
|
254 |
+
|
255 |
+
if timesteps is None:
|
256 |
+
timesteps = (
|
257 |
+
self.ddpm_num_timesteps
|
258 |
+
if ddim_use_original_steps
|
259 |
+
else self.ddim_timesteps
|
260 |
+
)
|
261 |
+
elif timesteps is not None and not ddim_use_original_steps:
|
262 |
+
subset_end = (
|
263 |
+
int(
|
264 |
+
min(timesteps / self.ddim_timesteps.shape[0], 1)
|
265 |
+
* self.ddim_timesteps.shape[0]
|
266 |
+
)
|
267 |
+
- 1
|
268 |
+
)
|
269 |
+
timesteps = self.ddim_timesteps[:subset_end]
|
270 |
+
|
271 |
+
intermediates = {"x_inter": [img], "pred_x0": [img]}
|
272 |
+
time_range = (
|
273 |
+
list(reversed(range(0, timesteps)))
|
274 |
+
if ddim_use_original_steps
|
275 |
+
else np.flip(timesteps)
|
276 |
+
)
|
277 |
+
total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
|
278 |
+
print(f"Running PLMS Sampling with {total_steps} timesteps")
|
279 |
+
|
280 |
+
iterator = tqdm(time_range, desc="PLMS Sampler", total=total_steps)
|
281 |
+
old_eps = []
|
282 |
+
|
283 |
+
for i, step in enumerate(iterator):
|
284 |
+
index = total_steps - i - 1
|
285 |
+
ts = torch.full((b,), step, device=device, dtype=torch.long)
|
286 |
+
ts_next = torch.full(
|
287 |
+
(b,),
|
288 |
+
time_range[min(i + 1, len(time_range) - 1)],
|
289 |
+
device=device,
|
290 |
+
dtype=torch.long,
|
291 |
+
)
|
292 |
+
|
293 |
+
if mask is not None:
|
294 |
+
assert x0 is not None
|
295 |
+
# import ipdb; ipdb.set_trace()
|
296 |
+
img_orig = self.model.q_sample(
|
297 |
+
x0, ts
|
298 |
+
) # TODO: deterministic forward pass?
|
299 |
+
img = img_orig * mask + (1.0 - mask) * img
|
300 |
+
|
301 |
+
outs = self.p_sample_plms(
|
302 |
+
img,
|
303 |
+
cond,
|
304 |
+
ts,
|
305 |
+
index=index,
|
306 |
+
use_original_steps=ddim_use_original_steps,
|
307 |
+
quantize_denoised=quantize_denoised,
|
308 |
+
temperature=temperature,
|
309 |
+
noise_dropout=noise_dropout,
|
310 |
+
score_corrector=score_corrector,
|
311 |
+
corrector_kwargs=corrector_kwargs,
|
312 |
+
unconditional_guidance_scale=unconditional_guidance_scale,
|
313 |
+
unconditional_conditioning=unconditional_conditioning,
|
314 |
+
old_eps=old_eps,
|
315 |
+
t_next=ts_next,
|
316 |
+
)
|
317 |
+
img, pred_x0, e_t = outs
|
318 |
+
old_eps.append(e_t)
|
319 |
+
if len(old_eps) >= 4:
|
320 |
+
old_eps.pop(0)
|
321 |
+
if callback:
|
322 |
+
callback(i)
|
323 |
+
if img_callback:
|
324 |
+
img_callback(pred_x0, i)
|
325 |
+
|
326 |
+
if index % 1 == 0 or index == total_steps - 1:
|
327 |
+
intermediates["x_inter"].append(img)
|
328 |
+
intermediates["pred_x0"].append(pred_x0)
|
329 |
+
|
330 |
+
return img, intermediates
|
331 |
+
|
332 |
+
@torch.no_grad()
|
333 |
+
def p_sample_plms(
|
334 |
+
self,
|
335 |
+
x,
|
336 |
+
c,
|
337 |
+
t,
|
338 |
+
index,
|
339 |
+
repeat_noise=False,
|
340 |
+
use_original_steps=False,
|
341 |
+
quantize_denoised=False,
|
342 |
+
temperature=1.0,
|
343 |
+
noise_dropout=0.0,
|
344 |
+
score_corrector=None,
|
345 |
+
corrector_kwargs=None,
|
346 |
+
unconditional_guidance_scale=1.0,
|
347 |
+
unconditional_conditioning=None,
|
348 |
+
old_eps=None,
|
349 |
+
t_next=None,
|
350 |
+
):
|
351 |
+
b, *_, device = *x.shape, x.device
|
352 |
+
|
353 |
+
def get_model_output(x, t):
|
354 |
+
if (
|
355 |
+
unconditional_conditioning is None
|
356 |
+
or unconditional_guidance_scale == 1.0
|
357 |
+
):
|
358 |
+
e_t = self.model.apply_model(x, t, c)
|
359 |
+
else:
|
360 |
+
x_in = torch.cat([x] * 2)
|
361 |
+
t_in = torch.cat([t] * 2)
|
362 |
+
if isinstance(c, dict):
|
363 |
+
c_in = {key: [torch.cat([unconditional_conditioning[key][0], c[key][0]])] for key in c}
|
364 |
+
else:
|
365 |
+
c_in = torch.cat([unconditional_conditioning, c])
|
366 |
+
e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
|
367 |
+
e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
|
368 |
+
|
369 |
+
if score_corrector is not None:
|
370 |
+
assert self.model.parameterization == "eps"
|
371 |
+
e_t = score_corrector.modify_score(
|
372 |
+
self.model, e_t, x, t, c, **corrector_kwargs
|
373 |
+
)
|
374 |
+
|
375 |
+
return e_t
|
376 |
+
|
377 |
+
alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
|
378 |
+
alphas_prev = (
|
379 |
+
self.model.alphas_cumprod_prev
|
380 |
+
if use_original_steps
|
381 |
+
else self.ddim_alphas_prev
|
382 |
+
)
|
383 |
+
sqrt_one_minus_alphas = (
|
384 |
+
self.model.sqrt_one_minus_alphas_cumprod
|
385 |
+
if use_original_steps
|
386 |
+
else self.ddim_sqrt_one_minus_alphas
|
387 |
+
)
|
388 |
+
sigmas = (
|
389 |
+
self.model.ddim_sigmas_for_original_num_steps
|
390 |
+
if use_original_steps
|
391 |
+
else self.ddim_sigmas
|
392 |
+
)
|
393 |
+
|
394 |
+
def get_x_prev_and_pred_x0(e_t, index):
|
395 |
+
# select parameters corresponding to the currently considered timestep
|
396 |
+
a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
|
397 |
+
a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
|
398 |
+
sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
|
399 |
+
sqrt_one_minus_at = torch.full(
|
400 |
+
(b, 1, 1, 1), sqrt_one_minus_alphas[index], device=device
|
401 |
+
)
|
402 |
+
|
403 |
+
# current prediction for x_0
|
404 |
+
pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
|
405 |
+
if quantize_denoised:
|
406 |
+
pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
|
407 |
+
# direction pointing to x_t
|
408 |
+
dir_xt = (1.0 - a_prev - sigma_t ** 2).sqrt() * e_t
|
409 |
+
noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
|
410 |
+
if noise_dropout > 0.0:
|
411 |
+
noise = torch.nn.functional.dropout(noise, p=noise_dropout)
|
412 |
+
x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
|
413 |
+
return x_prev, pred_x0
|
414 |
+
|
415 |
+
e_t = get_model_output(x, t)
|
416 |
+
if len(old_eps) == 0:
|
417 |
+
# Pseudo Improved Euler (2nd order)
|
418 |
+
x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t, index)
|
419 |
+
e_t_next = get_model_output(x_prev, t_next)
|
420 |
+
e_t_prime = (e_t + e_t_next) / 2
|
421 |
+
elif len(old_eps) == 1:
|
422 |
+
# 2nd order Pseudo Linear Multistep (Adams-Bashforth)
|
423 |
+
e_t_prime = (3 * e_t - old_eps[-1]) / 2
|
424 |
+
elif len(old_eps) == 2:
|
425 |
+
# 3rd order Pseudo Linear Multistep (Adams-Bashforth)
|
426 |
+
e_t_prime = (23 * e_t - 16 * old_eps[-1] + 5 * old_eps[-2]) / 12
|
427 |
+
elif len(old_eps) >= 3:
|
428 |
+
# 4th order Pseudo Linear Multistep (Adams-Bashforth)
|
429 |
+
e_t_prime = (
|
430 |
+
55 * e_t - 59 * old_eps[-1] + 37 * old_eps[-2] - 9 * old_eps[-3]
|
431 |
+
) / 24
|
432 |
+
|
433 |
+
x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t_prime, index)
|
434 |
+
|
435 |
+
return x_prev, pred_x0, e_t
|
436 |
+
|
437 |
+
###### Above are original stable-diffusion code ############
|
438 |
+
|
439 |
+
###### Encode Image ########################################
|
440 |
+
|
441 |
+
@torch.no_grad()
|
442 |
+
def sample_encode_save_noise(
|
443 |
+
self,
|
444 |
+
S,
|
445 |
+
batch_size,
|
446 |
+
shape,
|
447 |
+
conditioning=None,
|
448 |
+
callback=None,
|
449 |
+
normals_sequence=None,
|
450 |
+
img_callback=None,
|
451 |
+
quantize_x0=False,
|
452 |
+
eta=0.0,
|
453 |
+
mask=None,
|
454 |
+
x0=None,
|
455 |
+
temperature=1.0,
|
456 |
+
noise_dropout=0.0,
|
457 |
+
score_corrector=None,
|
458 |
+
corrector_kwargs=None,
|
459 |
+
verbose=True,
|
460 |
+
x_T=None,
|
461 |
+
log_every_t=100,
|
462 |
+
unconditional_guidance_scale=1.0,
|
463 |
+
unconditional_conditioning=None,
|
464 |
+
input_image=None,
|
465 |
+
noise_save_path=None,
|
466 |
+
# this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
|
467 |
+
**kwargs,
|
468 |
+
):
|
469 |
+
assert conditioning is not None
|
470 |
+
# assert not isinstance(conditioning, dict)
|
471 |
+
|
472 |
+
self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)
|
473 |
+
# sampling
|
474 |
+
C, H, W = shape
|
475 |
+
size = (batch_size, C, H, W)
|
476 |
+
if verbose:
|
477 |
+
print(f"Data shape for PLMS sampling is {size}")
|
478 |
+
|
479 |
+
samples, intermediates, x0_loop = self.plms_sampling_enc_save_noise(
|
480 |
+
conditioning,
|
481 |
+
size,
|
482 |
+
callback=callback,
|
483 |
+
img_callback=img_callback,
|
484 |
+
quantize_denoised=quantize_x0,
|
485 |
+
mask=mask,
|
486 |
+
x0=x0,
|
487 |
+
ddim_use_original_steps=False,
|
488 |
+
noise_dropout=noise_dropout,
|
489 |
+
temperature=temperature,
|
490 |
+
score_corrector=score_corrector,
|
491 |
+
corrector_kwargs=corrector_kwargs,
|
492 |
+
x_T=x_T,
|
493 |
+
log_every_t=log_every_t,
|
494 |
+
unconditional_guidance_scale=unconditional_guidance_scale,
|
495 |
+
unconditional_conditioning=unconditional_conditioning,
|
496 |
+
input_image=input_image,
|
497 |
+
noise_save_path=noise_save_path,
|
498 |
+
verbose=verbose
|
499 |
+
)
|
500 |
+
return samples, intermediates, x0_loop
|
501 |
+
|
502 |
+
@torch.no_grad()
|
503 |
+
def plms_sampling_enc_save_noise(
|
504 |
+
self,
|
505 |
+
cond,
|
506 |
+
shape,
|
507 |
+
x_T=None,
|
508 |
+
ddim_use_original_steps=False,
|
509 |
+
callback=None,
|
510 |
+
timesteps=None,
|
511 |
+
quantize_denoised=False,
|
512 |
+
mask=None,
|
513 |
+
x0=None,
|
514 |
+
img_callback=None,
|
515 |
+
log_every_t=100,
|
516 |
+
temperature=1.0,
|
517 |
+
noise_dropout=0.0,
|
518 |
+
score_corrector=None,
|
519 |
+
corrector_kwargs=None,
|
520 |
+
unconditional_guidance_scale=1.0,
|
521 |
+
unconditional_conditioning=None,
|
522 |
+
input_image=None,
|
523 |
+
noise_save_path=None,
|
524 |
+
verbose=True,
|
525 |
+
):
|
526 |
+
device = self.model.betas.device
|
527 |
+
|
528 |
+
b = shape[0]
|
529 |
+
if x_T is None:
|
530 |
+
img = torch.randn(shape, device=device)
|
531 |
+
else:
|
532 |
+
img = x_T
|
533 |
+
|
534 |
+
if timesteps is None:
|
535 |
+
timesteps = (
|
536 |
+
self.ddpm_num_timesteps
|
537 |
+
if ddim_use_original_steps
|
538 |
+
else self.ddim_timesteps
|
539 |
+
)
|
540 |
+
elif timesteps is not None and not ddim_use_original_steps:
|
541 |
+
subset_end = (
|
542 |
+
int(
|
543 |
+
min(timesteps / self.ddim_timesteps.shape[0], 1)
|
544 |
+
* self.ddim_timesteps.shape[0]
|
545 |
+
)
|
546 |
+
- 1
|
547 |
+
)
|
548 |
+
timesteps = self.ddim_timesteps[:subset_end]
|
549 |
+
|
550 |
+
intermediates = {"x_inter": [img], "pred_x0": [img]}
|
551 |
+
time_range = (
|
552 |
+
list(reversed(range(0, timesteps)))
|
553 |
+
if ddim_use_original_steps
|
554 |
+
else np.flip(timesteps)
|
555 |
+
)
|
556 |
+
time_range = list(range(0, timesteps)) if ddim_use_original_steps else timesteps
|
557 |
+
total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
|
558 |
+
if verbose:
|
559 |
+
print(f"Running PLMS Sampling with {total_steps} timesteps")
|
560 |
+
iterator = tqdm(time_range[:-1], desc='PLMS Sampler', total=total_steps)
|
561 |
+
else:
|
562 |
+
iterator = time_range[:-1]
|
563 |
+
old_eps = []
|
564 |
+
noise_images = []
|
565 |
+
for each_time in time_range:
|
566 |
+
noised_image = self.model.q_sample(
|
567 |
+
input_image, torch.tensor([each_time]).to(device)
|
568 |
+
)
|
569 |
+
noise_images.append(noised_image)
|
570 |
+
# torch.save(noised_image, noise_save_path + "_image_time%d.pt" % (each_time))
|
571 |
+
# import pudb; pudb.set_trace()
|
572 |
+
x0_loop = input_image.clone()
|
573 |
+
alphas = (
|
574 |
+
self.model.alphas_cumprod if ddim_use_original_steps else self.ddim_alphas
|
575 |
+
)
|
576 |
+
alphas_prev = (
|
577 |
+
self.model.alphas_cumprod_prev
|
578 |
+
if ddim_use_original_steps
|
579 |
+
else self.ddim_alphas_prev
|
580 |
+
)
|
581 |
+
sqrt_one_minus_alphas = (
|
582 |
+
self.model.sqrt_one_minus_alphas_cumprod
|
583 |
+
if ddim_use_original_steps
|
584 |
+
else self.ddim_sqrt_one_minus_alphas
|
585 |
+
)
|
586 |
+
sigmas = (
|
587 |
+
self.model.ddim_sigmas_for_original_num_steps
|
588 |
+
if ddim_use_original_steps
|
589 |
+
else self.ddim_sigmas
|
590 |
+
)
|
591 |
+
|
592 |
+
def get_model_output(x, t):
|
593 |
+
x_in = torch.cat([x] * 2)
|
594 |
+
t_in = torch.cat([t] * 2)
|
595 |
+
if isinstance(cond, dict):
|
596 |
+
c_in = {key: [torch.cat([unconditional_conditioning[key][0], cond[key][0]])] for key in cond}
|
597 |
+
else:
|
598 |
+
c_in = torch.cat([unconditional_conditioning, cond])
|
599 |
+
e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
|
600 |
+
e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
|
601 |
+
return e_t
|
602 |
+
|
603 |
+
def get_x_prev_and_pred_x0(e_t, index, curr_x0):
|
604 |
+
# select parameters corresponding to the currently considered timestep
|
605 |
+
a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
|
606 |
+
a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
|
607 |
+
sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
|
608 |
+
sqrt_one_minus_at = torch.full(
|
609 |
+
(b, 1, 1, 1), sqrt_one_minus_alphas[index], device=device
|
610 |
+
)
|
611 |
+
|
612 |
+
# current prediction for x_0
|
613 |
+
pred_x0 = (curr_x0 - sqrt_one_minus_at * e_t) / a_t.sqrt()
|
614 |
+
|
615 |
+
a_t = torch.full((b, 1, 1, 1), alphas[index + 1], device=device)
|
616 |
+
a_prev = torch.full((b, 1, 1, 1), alphas_prev[index + 1], device=device)
|
617 |
+
sigma_t = torch.full((b, 1, 1, 1), sigmas[index + 1], device=device)
|
618 |
+
sqrt_one_minus_at = torch.full(
|
619 |
+
(b, 1, 1, 1), sqrt_one_minus_alphas[index + 1], device=device
|
620 |
+
)
|
621 |
+
|
622 |
+
dir_xt = (1.0 - a_t - sigma_t ** 2).sqrt() * e_t
|
623 |
+
|
624 |
+
x_prev = a_t.sqrt() * pred_x0 + dir_xt
|
625 |
+
|
626 |
+
return x_prev, pred_x0
|
627 |
+
|
628 |
+
for i, step in enumerate(iterator):
|
629 |
+
index = i
|
630 |
+
ts = torch.full((b,), step, device=device, dtype=torch.long)
|
631 |
+
ts_next = torch.full(
|
632 |
+
(b,),
|
633 |
+
time_range[min(i + 1, len(time_range) - 1)],
|
634 |
+
device=device,
|
635 |
+
dtype=torch.long,
|
636 |
+
)
|
637 |
+
e_t = get_model_output(x0_loop, ts)
|
638 |
+
x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t, index, x0_loop)
|
639 |
+
x0_loop = x_prev
|
640 |
+
# torch.save(x0_loop, noise_save_path + "_final_latent.pt")
|
641 |
+
|
642 |
+
# Reconstruction
|
643 |
+
img = x0_loop.clone()
|
644 |
+
time_range = (
|
645 |
+
list(reversed(range(0, timesteps)))
|
646 |
+
if ddim_use_original_steps
|
647 |
+
else np.flip(timesteps)
|
648 |
+
)
|
649 |
+
total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
|
650 |
+
if verbose:
|
651 |
+
print(f"Running PLMS Sampling with {total_steps} timesteps")
|
652 |
+
iterator = tqdm(time_range, desc="PLMS Sampler", total=total_steps, miniters=total_steps+1, mininterval=600)
|
653 |
+
else:
|
654 |
+
iterator = time_range
|
655 |
+
old_eps = []
|
656 |
+
for i, step in enumerate(iterator):
|
657 |
+
index = total_steps - i - 1
|
658 |
+
ts = torch.full((b,), step, device=device, dtype=torch.long)
|
659 |
+
ts_next = torch.full(
|
660 |
+
(b,),
|
661 |
+
time_range[min(i + 1, len(time_range) - 1)],
|
662 |
+
device=device,
|
663 |
+
dtype=torch.long,
|
664 |
+
)
|
665 |
+
|
666 |
+
if mask is not None:
|
667 |
+
assert x0 is not None
|
668 |
+
img_orig = self.model.q_sample(
|
669 |
+
x0, ts
|
670 |
+
) # TODO: deterministic forward pass?
|
671 |
+
img = img_orig * mask + (1.0 - mask) * img
|
672 |
+
|
673 |
+
outs = self.p_sample_plms_dec_save_noise(
|
674 |
+
img,
|
675 |
+
cond,
|
676 |
+
ts,
|
677 |
+
index=index,
|
678 |
+
use_original_steps=ddim_use_original_steps,
|
679 |
+
quantize_denoised=quantize_denoised,
|
680 |
+
temperature=temperature,
|
681 |
+
noise_dropout=noise_dropout,
|
682 |
+
score_corrector=score_corrector,
|
683 |
+
corrector_kwargs=corrector_kwargs,
|
684 |
+
unconditional_guidance_scale=unconditional_guidance_scale,
|
685 |
+
unconditional_conditioning=unconditional_conditioning,
|
686 |
+
old_eps=old_eps,
|
687 |
+
t_next=ts_next,
|
688 |
+
input_image=input_image,
|
689 |
+
noise_save_path=noise_save_path,
|
690 |
+
noise_image=noise_images.pop(),
|
691 |
+
)
|
692 |
+
img, pred_x0, e_t = outs
|
693 |
+
|
694 |
+
old_eps.append(e_t)
|
695 |
+
if len(old_eps) >= 4:
|
696 |
+
old_eps.pop(0)
|
697 |
+
if callback:
|
698 |
+
callback(i)
|
699 |
+
if img_callback:
|
700 |
+
img_callback(pred_x0, i)
|
701 |
+
|
702 |
+
if index % log_every_t == 0 or index == total_steps - 1:
|
703 |
+
intermediates["x_inter"].append(img)
|
704 |
+
intermediates["pred_x0"].append(pred_x0)
|
705 |
+
|
706 |
+
return img, intermediates, x0_loop
|
707 |
+
|
708 |
+
@torch.no_grad()
|
709 |
+
def p_sample_plms_dec_save_noise(
|
710 |
+
self,
|
711 |
+
x,
|
712 |
+
c1,
|
713 |
+
t,
|
714 |
+
index,
|
715 |
+
repeat_noise=False,
|
716 |
+
use_original_steps=False,
|
717 |
+
quantize_denoised=False,
|
718 |
+
temperature=1.0,
|
719 |
+
noise_dropout=0.0,
|
720 |
+
score_corrector=None,
|
721 |
+
corrector_kwargs=None,
|
722 |
+
unconditional_guidance_scale=1.0,
|
723 |
+
unconditional_conditioning=None,
|
724 |
+
old_eps=None,
|
725 |
+
t_next=None,
|
726 |
+
input_image=None,
|
727 |
+
noise_save_path=None,
|
728 |
+
noise_image=None,
|
729 |
+
):
|
730 |
+
b, *_, device = *x.shape, x.device
|
731 |
+
|
732 |
+
def get_model_output(x, t):
|
733 |
+
if (
|
734 |
+
unconditional_conditioning is None
|
735 |
+
or unconditional_guidance_scale == 1.0
|
736 |
+
):
|
737 |
+
e_t = self.model.apply_model(x, t, c1)
|
738 |
+
else:
|
739 |
+
x_in = torch.cat([x] * 2)
|
740 |
+
t_in = torch.cat([t] * 2)
|
741 |
+
if isinstance(c1, dict):
|
742 |
+
c_in = {key: [torch.cat([unconditional_conditioning[key][0], c1[key][0]])] for key in c1}
|
743 |
+
else:
|
744 |
+
c_in = torch.cat([unconditional_conditioning, c1])
|
745 |
+
e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
|
746 |
+
e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
|
747 |
+
return e_t
|
748 |
+
|
749 |
+
alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
|
750 |
+
alphas_prev = (
|
751 |
+
self.model.alphas_cumprod_prev
|
752 |
+
if use_original_steps
|
753 |
+
else self.ddim_alphas_prev
|
754 |
+
)
|
755 |
+
sqrt_one_minus_alphas = (
|
756 |
+
self.model.sqrt_one_minus_alphas_cumprod
|
757 |
+
if use_original_steps
|
758 |
+
else self.ddim_sqrt_one_minus_alphas
|
759 |
+
)
|
760 |
+
sigmas = (
|
761 |
+
self.model.ddim_sigmas_for_original_num_steps
|
762 |
+
if use_original_steps
|
763 |
+
else self.ddim_sigmas
|
764 |
+
)
|
765 |
+
|
766 |
+
def get_x_prev_and_pred_x0(e_t, index):
|
767 |
+
# select parameters corresponding to the currently considered timestep
|
768 |
+
a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
|
769 |
+
a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
|
770 |
+
sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
|
771 |
+
sqrt_one_minus_at = torch.full(
|
772 |
+
(b, 1, 1, 1), sqrt_one_minus_alphas[index], device=device
|
773 |
+
)
|
774 |
+
|
775 |
+
# current prediction for x_0
|
776 |
+
pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
|
777 |
+
if quantize_denoised:
|
778 |
+
pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
|
779 |
+
# direction pointing to x_t
|
780 |
+
dir_xt = (1.0 - a_prev - sigma_t ** 2).sqrt() * e_t
|
781 |
+
time_curr = index * 20 + 1
|
782 |
+
# img_prev = torch.load(noise_save_path + "_image_time%d.pt" % (time_curr))
|
783 |
+
img_prev = noise_image
|
784 |
+
noise = img_prev - a_prev.sqrt() * pred_x0 - dir_xt
|
785 |
+
# torch.save(noise, noise_save_path + "_time%d.pt" % (time_curr))
|
786 |
+
|
787 |
+
x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
|
788 |
+
return x_prev, pred_x0
|
789 |
+
|
790 |
+
e_t = get_model_output(x, t)
|
791 |
+
if len(old_eps) == 0:
|
792 |
+
# Pseudo Improved Euler (2nd order)
|
793 |
+
x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t, index)
|
794 |
+
e_t_next = get_model_output(x_prev, t_next)
|
795 |
+
e_t_prime = (e_t + e_t_next) / 2
|
796 |
+
elif len(old_eps) == 1:
|
797 |
+
# 2nd order Pseudo Linear Multistep (Adams-Bashforth)
|
798 |
+
e_t_prime = (3 * e_t - old_eps[-1]) / 2
|
799 |
+
elif len(old_eps) == 2:
|
800 |
+
# 3rd order Pseudo Linear Multistep (Adams-Bashforth)
|
801 |
+
e_t_prime = (23 * e_t - 16 * old_eps[-1] + 5 * old_eps[-2]) / 12
|
802 |
+
elif len(old_eps) >= 3:
|
803 |
+
# 4th order Pseudo Linear Multistep (Adams-Bashforth)
|
804 |
+
e_t_prime = (
|
805 |
+
55 * e_t - 59 * old_eps[-1] + 37 * old_eps[-2] - 9 * old_eps[-3]
|
806 |
+
) / 24
|
807 |
+
|
808 |
+
x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t_prime, index)
|
809 |
+
|
810 |
+
return x_prev, pred_x0, e_t
|
811 |
+
|
812 |
+
################## Encode Image End ###############################
|
813 |
+
|
814 |
+
def p_sample_plms_sampling(
|
815 |
+
self,
|
816 |
+
x,
|
817 |
+
c1,
|
818 |
+
c2,
|
819 |
+
t,
|
820 |
+
index,
|
821 |
+
repeat_noise=False,
|
822 |
+
use_original_steps=False,
|
823 |
+
quantize_denoised=False,
|
824 |
+
temperature=1.0,
|
825 |
+
noise_dropout=0.0,
|
826 |
+
score_corrector=None,
|
827 |
+
corrector_kwargs=None,
|
828 |
+
unconditional_guidance_scale=1.0,
|
829 |
+
unconditional_conditioning=None,
|
830 |
+
old_eps=None,
|
831 |
+
t_next=None,
|
832 |
+
input_image=None,
|
833 |
+
optimizing_weight=None,
|
834 |
+
noise_save_path=None,
|
835 |
+
):
|
836 |
+
b, *_, device = *x.shape, x.device
|
837 |
+
|
838 |
+
def optimize_model_output(x, t):
|
839 |
+
# weight_for_pencil = torch.nn.Sigmoid()(optimizing_weight)
|
840 |
+
# condition = weight_for_pencil * c1 + (1 - weight_for_pencil) * c2
|
841 |
+
condition = optimizing_weight * c1 + (1 - optimizing_weight) * c2
|
842 |
+
if (
|
843 |
+
unconditional_conditioning is None
|
844 |
+
or unconditional_guidance_scale == 1.0
|
845 |
+
):
|
846 |
+
e_t = self.model.apply_model(x, t, condition)
|
847 |
+
else:
|
848 |
+
x_in = torch.cat([x] * 2)
|
849 |
+
t_in = torch.cat([t] * 2)
|
850 |
+
if isinstance(condition, dict):
|
851 |
+
c_in = {key: [torch.cat([unconditional_conditioning[key][0], condition[key][0]])] for key in condition}
|
852 |
+
else:
|
853 |
+
c_in = torch.cat([unconditional_conditioning, condition])
|
854 |
+
e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
|
855 |
+
e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
|
856 |
+
return e_t
|
857 |
+
|
858 |
+
alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
|
859 |
+
alphas_prev = (
|
860 |
+
self.model.alphas_cumprod_prev
|
861 |
+
if use_original_steps
|
862 |
+
else self.ddim_alphas_prev
|
863 |
+
)
|
864 |
+
sqrt_one_minus_alphas = (
|
865 |
+
self.model.sqrt_one_minus_alphas_cumprod
|
866 |
+
if use_original_steps
|
867 |
+
else self.ddim_sqrt_one_minus_alphas
|
868 |
+
)
|
869 |
+
sigmas = (
|
870 |
+
self.model.ddim_sigmas_for_original_num_steps
|
871 |
+
if use_original_steps
|
872 |
+
else self.ddim_sigmas
|
873 |
+
)
|
874 |
+
|
875 |
+
def get_x_prev_and_pred_x0(e_t, index):
|
876 |
+
# select parameters corresponding to the currently considered timestep
|
877 |
+
a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
|
878 |
+
a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
|
879 |
+
sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
|
880 |
+
sqrt_one_minus_at = torch.full(
|
881 |
+
(b, 1, 1, 1), sqrt_one_minus_alphas[index], device=device
|
882 |
+
)
|
883 |
+
|
884 |
+
# current prediction for x_0
|
885 |
+
pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
|
886 |
+
if quantize_denoised:
|
887 |
+
pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
|
888 |
+
# direction pointing to x_t
|
889 |
+
dir_xt = (1.0 - a_prev - sigma_t ** 2).sqrt() * e_t
|
890 |
+
time_curr = index * 20 + 1
|
891 |
+
if noise_save_path and index > 16:
|
892 |
+
noise = torch.load(noise_save_path + "_time%d.pt" % (time_curr))[:1]
|
893 |
+
else:
|
894 |
+
noise = torch.zeros_like(dir_xt)
|
895 |
+
x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
|
896 |
+
return x_prev, pred_x0
|
897 |
+
|
898 |
+
e_t = optimize_model_output(x, t)
|
899 |
+
if len(old_eps) == 0:
|
900 |
+
# Pseudo Improved Euler (2nd order)
|
901 |
+
x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t, index)
|
902 |
+
# e_t_next = get_model_output(x_prev, t_next)
|
903 |
+
e_t_next = optimize_model_output(x_prev, t_next)
|
904 |
+
e_t_prime = (e_t + e_t_next) / 2
|
905 |
+
elif len(old_eps) == 1:
|
906 |
+
# 2nd order Pseudo Linear Multistep (Adams-Bashforth)
|
907 |
+
e_t_prime = (3 * e_t - old_eps[-1]) / 2
|
908 |
+
elif len(old_eps) == 2:
|
909 |
+
# 3rd order Pseudo Linear Multistep (Adams-Bashforth)
|
910 |
+
e_t_prime = (23 * e_t - 16 * old_eps[-1] + 5 * old_eps[-2]) / 12
|
911 |
+
elif len(old_eps) >= 3:
|
912 |
+
# 4th order Pseudo Linear Multistep (Adams-Bashforth)
|
913 |
+
e_t_prime = (
|
914 |
+
55 * e_t - 59 * old_eps[-1] + 37 * old_eps[-2] - 9 * old_eps[-3]
|
915 |
+
) / 24
|
916 |
+
|
917 |
+
x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t_prime, index)
|
918 |
+
|
919 |
+
return x_prev, pred_x0, e_t
|
920 |
+
|
921 |
+
################## Edit Input Image ###############################
|
922 |
+
|
923 |
+
def sample_optimize_intrinsic_edit(
|
924 |
+
self,
|
925 |
+
S,
|
926 |
+
batch_size,
|
927 |
+
shape,
|
928 |
+
conditioning1=None,
|
929 |
+
conditioning2=None,
|
930 |
+
callback=None,
|
931 |
+
normals_sequence=None,
|
932 |
+
        img_callback=None,
        quantize_x0=False,
        eta=0.0,
        mask=None,
        x0=None,
        temperature=1.0,
        noise_dropout=0.0,
        score_corrector=None,
        corrector_kwargs=None,
        verbose=True,
        x_T=None,
        log_every_t=100,
        unconditional_guidance_scale=1.0,
        unconditional_conditioning=None,
        input_image=None,
        noise_save_path=None,
        lambda_t=None,
        lambda_save_path=None,
        image_save_path=None,
        original_text=None,
        new_text=None,
        otext=None,
        noise_saved_path=None,
        # this has to come in the same format as the conditioning, e.g. as encoded tokens, ...
        **kwargs,
    ):
        assert conditioning1 is not None
        assert conditioning2 is not None

        self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)
        # sampling
        C, H, W = shape
        size = (batch_size, C, H, W)
        print(f"Data shape for PLMS sampling is {size}")

        self.plms_sampling_optimize_intrinsic_edit(
            conditioning1,
            conditioning2,
            size,
            callback=callback,
            img_callback=img_callback,
            quantize_denoised=quantize_x0,
            mask=mask,
            x0=x0,
            ddim_use_original_steps=False,
            noise_dropout=noise_dropout,
            temperature=temperature,
            score_corrector=score_corrector,
            corrector_kwargs=corrector_kwargs,
            x_T=x_T,
            log_every_t=log_every_t,
            unconditional_guidance_scale=unconditional_guidance_scale,
            unconditional_conditioning=unconditional_conditioning,
            input_image=input_image,
            noise_save_path=noise_save_path,
            lambda_t=lambda_t,
            lambda_save_path=lambda_save_path,
            image_save_path=image_save_path,
            original_text=original_text,
            new_text=new_text,
            otext=otext,
            noise_saved_path=noise_saved_path,
        )
        return None

    def plms_sampling_optimize_intrinsic_edit(
        self,
        cond1,
        cond2,
        shape,
        x_T=None,
        ddim_use_original_steps=False,
        callback=None,
        timesteps=None,
        quantize_denoised=False,
        mask=None,
        x0=None,
        img_callback=None,
        log_every_t=100,
        temperature=1.0,
        noise_dropout=0.0,
        score_corrector=None,
        corrector_kwargs=None,
        unconditional_guidance_scale=1.0,
        unconditional_conditioning=None,
        input_image=None,
        noise_save_path=None,
        lambda_t=None,
        lambda_save_path=None,
        image_save_path=None,
        original_text=None,
        new_text=None,
        otext=None,
        noise_saved_path=None,
    ):
        # Different from above, the intrinsic edit version needs ...
        device = self.model.betas.device

        b = shape[0]
        if x_T is None:
            img = torch.randn(shape, device=device)
        else:
            img = x_T
        img_clone = img.clone()

        if timesteps is None:
            timesteps = (
                self.ddpm_num_timesteps
                if ddim_use_original_steps
                else self.ddim_timesteps
            )
        elif timesteps is not None and not ddim_use_original_steps:
            subset_end = (
                int(
                    min(timesteps / self.ddim_timesteps.shape[0], 1)
                    * self.ddim_timesteps.shape[0]
                )
                - 1
            )
            timesteps = self.ddim_timesteps[:subset_end]

        intermediates = {"x_inter": [img], "pred_x0": [img]}
        time_range = (
            list(reversed(range(0, timesteps)))
            if ddim_use_original_steps
            else np.flip(timesteps)
        )

        weighting_parameter = lambda_t
        weighting_parameter.requires_grad = True
        from torch import optim

        optimizer = optim.Adam([weighting_parameter], lr=0.05)

        print("Original image")
        with torch.no_grad():
            img = img_clone.clone()
            total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
            iterator = time_range
            old_eps = []

            for i, step in enumerate(iterator):
                index = total_steps - i - 1
                ts = torch.full((b,), step, device=device, dtype=torch.long)
                ts_next = torch.full(
                    (b,),
                    time_range[min(i + 1, len(time_range) - 1)],
                    device=device,
                    dtype=torch.long,
                )

                outs = self.p_sample_plms_sampling(
                    img,
                    cond1,
                    cond2,
                    ts,
                    index=index,
                    use_original_steps=ddim_use_original_steps,
                    quantize_denoised=quantize_denoised,
                    temperature=temperature,
                    noise_dropout=noise_dropout,
                    score_corrector=score_corrector,
                    corrector_kwargs=corrector_kwargs,
                    unconditional_guidance_scale=unconditional_guidance_scale,
                    unconditional_conditioning=unconditional_conditioning,
                    old_eps=old_eps,
                    t_next=ts_next,
                    input_image=input_image,
                    optimizing_weight=torch.ones(50)[i],
                    noise_save_path=noise_saved_path,
                )
                img, pred_x0, e_t = outs
                old_eps.append(e_t)
                if len(old_eps) >= 4:
                    old_eps.pop(0)
            img_temp = self.model.decode_first_stage(img)
            img_temp_ddim = torch.clamp((img_temp + 1.0) / 2.0, min=0.0, max=1.0)
            img_temp_ddim = img_temp_ddim.cpu().permute(0, 2, 3, 1).permute(0, 3, 1, 2)
            # save image
            with torch.no_grad():
                x_sample = 255.0 * rearrange(
                    img_temp_ddim[0].detach().cpu().numpy(), "c h w -> h w c"
                )
                imgsave = Image.fromarray(x_sample.astype(np.uint8))
                imgsave.save(image_save_path + "original.png")
        readed_image = (
            torchvision.io.read_image(image_save_path + "original.png").float()
            / 255
        )
        print("Optimizing start")
        for epoch in tqdm(range(10)):
            img = img_clone.clone()
            total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
            iterator = time_range
            old_eps = []

            for i, step in enumerate(iterator):
                index = total_steps - i - 1
                ts = torch.full((b,), step, device=device, dtype=torch.long)
                ts_next = torch.full(
                    (b,),
                    time_range[min(i + 1, len(time_range) - 1)],
                    device=device,
                    dtype=torch.long,
                )

                outs = self.p_sample_plms_sampling(
                    img,
                    cond1,
                    cond2,
                    ts,
                    index=index,
                    use_original_steps=ddim_use_original_steps,
                    quantize_denoised=quantize_denoised,
                    temperature=temperature,
                    noise_dropout=noise_dropout,
                    score_corrector=score_corrector,
                    corrector_kwargs=corrector_kwargs,
                    unconditional_guidance_scale=unconditional_guidance_scale,
                    unconditional_conditioning=unconditional_conditioning,
                    old_eps=old_eps,
                    t_next=ts_next,
                    input_image=input_image,
                    optimizing_weight=weighting_parameter[i],
                    noise_save_path=noise_saved_path,
                )
                img, pred_x0, e_t = outs
                old_eps.append(e_t)
                if len(old_eps) >= 4:
                    old_eps.pop(0)
            img_temp = self.model.decode_first_stage(img)
            img_temp_ddim = torch.clamp((img_temp + 1.0) / 2.0, min=0.0, max=1.0)
            img_temp_ddim = img_temp_ddim.cpu()

            # save image
            # with torch.no_grad():
            #     x_sample = 255.0 * rearrange(
            #         img_temp_ddim[0].detach().cpu().numpy(), "c h w -> h w c"
            #     )
            #     imgsave = Image.fromarray(x_sample.astype(np.uint8))
            #     imgsave.save(image_save_path + "/%d.png" % (epoch))

            loss1 = VGGPerceptualLoss()(img_temp_ddim[0], readed_image)
            loss2 = DCLIPLoss()(
                readed_image, img_temp_ddim[0].float().cuda(), otext, new_text
            )
            loss = 0.05 * loss1 + loss2
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # torch.save(
            #     weighting_parameter, lambda_save_path + "/weightingParam%d.pt" % (epoch)
            # )
            if epoch < 9:
                del img
            else:
                # save image
                with torch.no_grad():
                    x_sample = 255.0 * rearrange(
                        img_temp_ddim[0].detach().cpu().numpy(), "c h w -> h w c"
                    )
                    imgsave = Image.fromarray(x_sample.astype(np.uint8))
                    imgsave.save(image_save_path + "/final.png")
                    torch.save(
                        weighting_parameter, lambda_save_path + "/weightingParam_final.pt"
                    )

        torch.cuda.empty_cache()
        # shutil.rmtree("noise")
        return None

    ################ Edit Image End ######################

    ################ Disentangle #########################

    def sample_optimize_intrinsic(
        self,
        S,
        batch_size,
        shape,
        conditioning1=None,
        conditioning2=None,
        callback=None,
        normals_sequence=None,
        img_callback=None,
        quantize_x0=False,
        eta=0.0,
        mask=None,
        x0=None,
        temperature=1.0,
        noise_dropout=0.0,
        score_corrector=None,
        corrector_kwargs=None,
        verbose=True,
        x_T=None,
        log_every_t=100,
        unconditional_guidance_scale=1.0,
        unconditional_conditioning=None,
        input_image=None,
        noise_save_path=None,
        lambda_t=None,
        lambda_save_path=None,
        image_save_path=None,
        original_text=None,
        new_text=None,
        otext=None,
        # this has to come in the same format as the conditioning, e.g. as encoded tokens, ...
        **kwargs,
    ):
        assert conditioning1 is not None
        assert conditioning2 is not None

        self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)
        # sampling
        C, H, W = shape
        size = (batch_size, C, H, W)
        print(f"Data shape for PLMS sampling is {size}")

        self.plms_sampling_optimize_intrinsic(
            conditioning1,
            conditioning2,
            size,
            callback=callback,
            img_callback=img_callback,
            quantize_denoised=quantize_x0,
            mask=mask,
            x0=x0,
            ddim_use_original_steps=False,
            noise_dropout=noise_dropout,
            temperature=temperature,
            score_corrector=score_corrector,
            corrector_kwargs=corrector_kwargs,
            x_T=x_T,
            log_every_t=log_every_t,
            unconditional_guidance_scale=unconditional_guidance_scale,
            unconditional_conditioning=unconditional_conditioning,
            input_image=input_image,
            noise_save_path=noise_save_path,
            lambda_t=lambda_t,
            lambda_save_path=lambda_save_path,
            image_save_path=image_save_path,
            original_text=original_text,
            new_text=new_text,
            otext=otext,
        )
        return None

    def plms_sampling_optimize_intrinsic(
        self,
        cond1,
        cond2,
        shape,
        x_T=None,
        ddim_use_original_steps=False,
        callback=None,
        timesteps=None,
        quantize_denoised=False,
        mask=None,
        x0=None,
        img_callback=None,
        log_every_t=100,
        temperature=1.0,
        noise_dropout=0.0,
        score_corrector=None,
        corrector_kwargs=None,
        unconditional_guidance_scale=1.0,
        unconditional_conditioning=None,
        input_image=None,
        noise_save_path=None,
        lambda_t=None,
        lambda_save_path=None,
        image_save_path=None,
        original_text=None,
        new_text=None,
        otext=None,
    ):
        device = self.model.betas.device

        b = shape[0]
        if x_T is None:
            img = torch.randn(shape, device=device)
        else:
            img = x_T
        img_clone = img.clone()

        if timesteps is None:
            timesteps = (
                self.ddpm_num_timesteps
                if ddim_use_original_steps
                else self.ddim_timesteps
            )
        elif timesteps is not None and not ddim_use_original_steps:
            subset_end = (
                int(
                    min(timesteps / self.ddim_timesteps.shape[0], 1)
                    * self.ddim_timesteps.shape[0]
                )
                - 1
            )
            timesteps = self.ddim_timesteps[:subset_end]

        time_range = (
            list(reversed(range(0, timesteps)))
            if ddim_use_original_steps
            else np.flip(timesteps)
        )
        weighting_parameter = lambda_t
        weighting_parameter.requires_grad = True
        from torch import optim

        optimizer = optim.Adam([weighting_parameter], lr=0.05)

        print("Original image")
        with torch.no_grad():
            img = img_clone.clone()
            total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
            iterator = time_range
            old_eps = []

            for i, step in enumerate(iterator):
                index = total_steps - i - 1
                ts = torch.full((b,), step, device=device, dtype=torch.long)
                ts_next = torch.full(
                    (b,),
                    time_range[min(i + 1, len(time_range) - 1)],
                    device=device,
                    dtype=torch.long,
                )

                outs = self.p_sample_plms_sampling(
                    img,
                    cond1,
                    cond2,
                    ts,
                    index=index,
                    use_original_steps=ddim_use_original_steps,
                    quantize_denoised=quantize_denoised,
                    temperature=temperature,
                    noise_dropout=noise_dropout,
                    score_corrector=score_corrector,
                    corrector_kwargs=corrector_kwargs,
                    unconditional_guidance_scale=unconditional_guidance_scale,
                    unconditional_conditioning=unconditional_conditioning,
                    old_eps=old_eps,
                    t_next=ts_next,
                    input_image=input_image,
                    optimizing_weight=torch.ones(50)[i],
                    noise_save_path=noise_save_path,
                )
                img, pred_x0, e_t = outs
                old_eps.append(e_t)
                if len(old_eps) >= 4:
                    old_eps.pop(0)
            img_temp = self.model.decode_first_stage(img)
            del img
            img_temp_ddim = torch.clamp((img_temp + 1.0) / 2.0, min=0.0, max=1.0)
            img_temp_ddim = img_temp_ddim.cpu().permute(0, 2, 3, 1).permute(0, 3, 1, 2)
            # save image
            with torch.no_grad():
                x_sample = 255.0 * rearrange(
                    img_temp_ddim[0].detach().cpu().numpy(), "c h w -> h w c"
                )
                imgsave = Image.fromarray(x_sample.astype(np.uint8))
                imgsave.save(image_save_path + "original.png")

        readed_image = (
            torchvision.io.read_image(image_save_path + "original.png").float()
            / 255
        )

        print("Optimizing start")
        for epoch in tqdm(range(10)):
            img = img_clone.clone()
            total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
            iterator = time_range
            old_eps = []

            for i, step in enumerate(iterator):
                index = total_steps - i - 1
                ts = torch.full((b,), step, device=device, dtype=torch.long)
                ts_next = torch.full(
                    (b,),
                    time_range[min(i + 1, len(time_range) - 1)],
                    device=device,
                    dtype=torch.long,
                )

                outs = self.p_sample_plms_sampling(
                    img,
                    cond1,
                    cond2,
                    ts,
                    index=index,
                    use_original_steps=ddim_use_original_steps,
                    quantize_denoised=quantize_denoised,
                    temperature=temperature,
                    noise_dropout=noise_dropout,
                    score_corrector=score_corrector,
                    corrector_kwargs=corrector_kwargs,
                    unconditional_guidance_scale=unconditional_guidance_scale,
                    unconditional_conditioning=unconditional_conditioning,
                    old_eps=old_eps,
                    t_next=ts_next,
                    input_image=input_image,
                    optimizing_weight=weighting_parameter[i],
                    noise_save_path=noise_save_path,
                )
                img, _, e_t = outs
                old_eps.append(e_t)
                if len(old_eps) >= 4:
                    old_eps.pop(0)
            img_temp = self.model.decode_first_stage(img)
            del img
            img_temp_ddim = torch.clamp((img_temp + 1.0) / 2.0, min=0.0, max=1.0)
            img_temp_ddim = img_temp_ddim.cpu()

            # # save image
            # with torch.no_grad():
            #     x_sample = 255. * rearrange(img_temp_ddim[0].detach().cpu().numpy(), 'c h w -> h w c')
            #     imgsave = Image.fromarray(x_sample.astype(np.uint8))
            #     imgsave.save(image_save_path + "/%d.png" % (epoch))

            loss1 = VGGPerceptualLoss()(img_temp_ddim[0], readed_image)
            loss2 = DCLIPLoss()(
                readed_image, img_temp_ddim[0].float().cuda(), otext, new_text
            )
            loss = (
                0.05 * loss1 + loss2
            )  # 0.05 or 0.03. Adjust according to attributes on scenes or people.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # torch.save(weighting_parameter, lambda_save_path + "/weightingParam%d.pt" % (epoch))
            with torch.no_grad():
                if epoch == 9:
                    # save image
                    x_sample = 255.0 * rearrange(
                        img_temp_ddim[0].detach().cpu().numpy(), "c h w -> h w c"
                    )
                    imgsave = Image.fromarray(x_sample.astype(np.uint8))
                    imgsave.save(image_save_path + "/final.png")
                    torch.save(
                        weighting_parameter,
                        lambda_save_path + "/weightingParam_final.pt",
                    )
        torch.cuda.empty_cache()
        return None


    ################ Disentangle End #########################
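Both loops above follow the same outer pattern: a per-step weighting vector (`lambda_t`) is the only trainable parameter, and it is refined for 10 epochs against a weighted sum of a VGG perceptual term and a directional CLIP term computed on the decoded sample. The following is a minimal, self-contained sketch of just that outer optimisation pattern; `render` and `target` are hypothetical stand-ins for the PLMS sampling loop and its loss terms, not functions from this repository.

# toy sketch of optimising a per-step weight vector with Adam (stand-in functions only)
import torch
from torch import optim

def render(weights):
    # stand-in for running the sampler with per-step weights lambda_t
    steps = torch.linspace(0, 1, weights.numel())
    return (weights * steps).sum()

lambda_t = torch.zeros(50, requires_grad=True)   # one weight per PLMS/DDIM step
optimizer = optim.Adam([lambda_t], lr=0.05)      # same optimiser and lr as above
target = torch.tensor(1.0)

for epoch in range(10):                          # same 10-epoch budget as the sampler
    out = render(lambda_t)
    loss = (out - target) ** 2                   # stands in for 0.05 * VGG + directional CLIP
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()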
cldm/tmp.py
ADDED
@@ -0,0 +1,340 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
use kornia and albumentations for transformations
@author: Tu Bui @University of Surrey
"""
import os
from . import utils
import torch
import numpy as np
from torch import nn
import torch.nn.functional as F
from PIL import Image
import kornia as ko
import albumentations as ab


class IdentityAugment(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x, **kwargs):
        return x


class RandomCompress(nn.Module):
    def __init__(self, severity='medium', p=0.5):
        super().__init__()
        self.p = p
        if severity == 'low':
            self.jpeg_quality = 70
        elif severity == 'medium':
            self.jpeg_quality = 50
        elif severity == 'high':
            self.jpeg_quality = 40

    def forward(self, x, ramp=1.):
        # x (B, C, H, W) in range [0, 1]
        # ramp: adjust the ramping of the compression, 1.0 means min quality = self.jpeg_quality
        if torch.rand(1)[0] >= self.p:
            return x
        jpeg_quality = 100. - torch.rand(1)[0] * ramp * (100. - self.jpeg_quality)
        x = utils.jpeg_compress_decompress(x, rounding=utils.round_only_at_0, quality=jpeg_quality)
        return x


class RandomBoxBlur(nn.Module):
    def __init__(self, severity='medium', border_type='reflect', normalize=True, p=0.5):
        super().__init__()
        self.p = p
        if severity == 'low':
            kernel_size = 3
        elif severity == 'medium':
            kernel_size = 5
        elif severity == 'high':
            kernel_size = 7

        self.tform = ko.augmentation.RandomBoxBlur(kernel_size=(kernel_size, kernel_size), border_type=border_type, normalize=normalize, p=self.p)

    def forward(self, x, **kwargs):
        return self.tform(x)


class RandomMedianBlur(nn.Module):
    def __init__(self, severity='medium', p=0.5):
        super().__init__()
        self.p = p
        self.tform = ko.augmentation.RandomMedianBlur(kernel_size=(3,3), p=p)

    def forward(self, x, **kwargs):
        return self.tform(x)


class RandomBrightness(nn.Module):
    def __init__(self, severity='medium', p=0.5):
        super().__init__()
        self.p = p
        if severity == 'low':
            brightness = (0.9, 1.1)
        elif severity == 'medium':
            brightness = (0.75, 1.25)
        elif severity == 'high':
            brightness = (0.5, 1.5)
        self.tform = ko.augmentation.RandomBrightness(brightness=brightness, p=p)

    def forward(self, x, **kwargs):
        return self.tform(x)


class RandomContrast(nn.Module):
    def __init__(self, severity='medium', p=0.5):
        super().__init__()
        self.p = p
        if severity == 'low':
            contrast = (0.9, 1.1)
        elif severity == 'medium':
            contrast = (0.75, 1.25)
        elif severity == 'high':
            contrast = (0.5, 1.5)
        self.tform = ko.augmentation.RandomContrast(contrast=contrast, p=p)

    def forward(self, x, **kwargs):
        return self.tform(x)


class RandomSaturation(nn.Module):
    def __init__(self, severity='medium', p=0.5):
        super().__init__()
        self.p = p
        if severity == 'low':
            sat = (0.9, 1.1)
        elif severity == 'medium':
            sat = (0.75, 1.25)
        elif severity == 'high':
            sat = (0.5, 1.5)
        self.tform = ko.augmentation.RandomSaturation(saturation=sat, p=p)

    def forward(self, x, **kwargs):
        return self.tform(x)


class RandomSharpness(nn.Module):
    def __init__(self, severity='medium', p=0.5):
        super().__init__()
        self.p = p
        if severity == 'low':
            sharpness = 0.5
        elif severity == 'medium':
            sharpness = 1.0
        elif severity == 'high':
            sharpness = 2.5
        self.tform = ko.augmentation.RandomSharpness(sharpness=sharpness, p=p)

    def forward(self, x, **kwargs):
        return self.tform(x)


class RandomColorJiggle(nn.Module):
    def __init__(self, severity='medium', p=0.5):
        super().__init__()
        self.p = p
        if severity == 'low':
            factor = (0.05, 0.05, 0.05, 0.01)
        elif severity == 'medium':
            factor = (0.1, 0.1, 0.1, 0.02)
        elif severity == 'high':
            factor = (0.1, 0.1, 0.1, 0.05)
        self.tform = ko.augmentation.ColorJiggle(*factor, p=p)

    def forward(self, x, **kwargs):
        return self.tform(x)


class RandomHue(nn.Module):
    def __init__(self, severity='medium', p=0.5):
        super().__init__()
        self.p = p
        if severity == 'low':
            hue = 0.01
        elif severity == 'medium':
            hue = 0.02
        elif severity == 'high':
            hue = 0.05
        self.tform = ko.augmentation.RandomHue(hue=(-hue, hue), p=p)

    def forward(self, x, **kwargs):
        return self.tform(x)


class RandomGamma(nn.Module):
    def __init__(self, severity='medium', p=0.5):
        super().__init__()
        self.p = p
        if severity == 'low':
            gamma, gain = (0.9, 1.1), (0.9, 1.1)
        elif severity == 'medium':
            gamma, gain = (0.75, 1.25), (0.75, 1.25)
        elif severity == 'high':
            gamma, gain = (0.5, 1.5), (0.5, 1.5)
        self.tform = ko.augmentation.RandomGamma(gamma, gain, p=p)

    def forward(self, x, **kwargs):
        return self.tform(x)


class RandomGaussianBlur(nn.Module):
    def __init__(self, severity='medium', p=0.5):
        super().__init__()
        self.p = p
        if severity == 'low':
            kernel_size, sigma = 3, (0.1, 1.0)
        elif severity == 'medium':
            kernel_size, sigma = 5, (0.1, 1.5)
        elif severity == 'high':
            kernel_size, sigma = 7, (0.1, 2.0)
        self.tform = ko.augmentation.RandomGaussianBlur(kernel_size=(kernel_size, kernel_size), sigma=sigma, p=self.p)

    def forward(self, x, **kwargs):
        return self.tform(x)


class RandomGaussianNoise(nn.Module):
    def __init__(self, severity='medium', p=0.5):
        super().__init__()
        self.p = p
        if severity == 'low':
            std = 0.02
        elif severity == 'medium':
            std = 0.04
        elif severity == 'high':
            std = 0.08
        self.tform = ko.augmentation.RandomGaussianNoise(mean=0., std=std, p=p)

    def forward(self, x, **kwargs):
        return self.tform(x)


class RandomMotionBlur(nn.Module):
    def __init__(self, severity='medium', p=0.5):
        super().__init__()
        self.p = p
        if severity == 'low':
            kernel_size, angle, direction = (3, 5), (-25, 25), (-0.25, 0.25)
        elif severity == 'medium':
            kernel_size, angle, direction = (3, 7), (-45, 45), (-0.5, 0.5)
        elif severity == 'high':
            kernel_size, angle, direction = (3, 9), (-90, 90), (-1.0, 1.0)
        self.tform = ko.augmentation.RandomMotionBlur(kernel_size, angle, direction, p=p)

    def forward(self, x, **kwargs):
        return self.tform(x)


class RandomPosterize(nn.Module):
    def __init__(self, severity='medium', p=0.5):
        super().__init__()
        self.p = p
        if severity == 'low':
            bits = 5
        elif severity == 'medium':
            bits = 4
        elif severity == 'high':
            bits = 3
        self.tform = ko.augmentation.RandomPosterize(bits=bits, p=p)

    def forward(self, x, **kwargs):
        return self.tform(x)


class RandomRGBShift(nn.Module):
    def __init__(self, severity='medium', p=0.5):
        super().__init__()
        self.p = p
        if severity == 'low':
            rgb = 0.02
        elif severity == 'medium':
            rgb = 0.05
        elif severity == 'high':
            rgb = 0.1
        self.tform = ko.augmentation.RandomRGBShift(r_shift_limit=rgb, g_shift_limit=rgb, b_shift_limit=rgb, p=p)

    def forward(self, x, **kwargs):
        return self.tform(x)


class TransformNet(nn.Module):
    def __init__(self, flip=True, crop_mode='random_crop', compress=True, brightness=True, contrast=True, color_jiggle=True, gamma=True, grayscale=True, gaussian_blur=True, gaussian_noise=True, hue=True, motion_blur=True, posterize=True, rgb_shift=True, saturation=True, sharpness=True, median_blur=True, severity='medium', n_optional=2, ramp=1000, p=0.5):
        super().__init__()
        self.n_optional = n_optional
        self.p = p
        p_flip = 0.5 if flip else 0
        rnd_flip_layer = ko.augmentation.RandomHorizontalFlip(p_flip)
        self.ramp = ramp
        self.register_buffer('step0', torch.tensor(0))

        assert crop_mode in ['random_crop', 'resized_crop']
        if crop_mode == 'random_crop':
            rnd_crop_layer = ko.augmentation.RandomCrop((224,224), cropping_mode="resample")
        elif crop_mode == 'resized_crop':
            rnd_crop_layer = ko.augmentation.RandomResizedCrop(size=(224,224), scale=(0.7, 1.0), ratio=(3.0/4, 4.0/3), cropping_mode='resample')

        self.fixed_transforms = [rnd_flip_layer, rnd_crop_layer]
        self.optional_transforms = []
        if compress:
            self.optional_transforms.append(RandomCompress(severity, p=p))
        if brightness:
            self.optional_transforms.append(RandomBrightness(severity, p=p))
        if contrast:
            self.optional_transforms.append(RandomContrast(severity, p=p))
        if color_jiggle:
            self.optional_transforms.append(RandomColorJiggle(severity, p=p))
        if gamma:
            self.optional_transforms.append(RandomGamma(severity, p=p))
        if grayscale:
            self.optional_transforms.append(ko.augmentation.RandomGrayscale(p=p/4))
        if gaussian_blur:
            self.optional_transforms.append(RandomGaussianBlur(severity, p=p))
        if gaussian_noise:
            self.optional_transforms.append(RandomGaussianNoise(severity, p=p))
        if hue:
            self.optional_transforms.append(RandomHue(severity, p=p))
        if motion_blur:
            self.optional_transforms.append(RandomMotionBlur(severity, p=p))
        if posterize:
            self.optional_transforms.append(RandomPosterize(severity, p=p))
        if rgb_shift:
            self.optional_transforms.append(RandomRGBShift(severity, p=p))
        if saturation:
            self.optional_transforms.append(RandomSaturation(severity, p=p))
        if sharpness:
            self.optional_transforms.append(RandomSharpness(severity, p=p))
        if median_blur:
            self.optional_transforms.append(RandomMedianBlur(severity, p=p))

    def activate(self, global_step):
        if self.step0 == 0:
            print(f'[TRAINING] Activating TransformNet at step {global_step}')
            self.step0 = torch.tensor(global_step)

    def is_activated(self):
        return self.step0 > 0

    def forward(self, x, global_step, p=0.9):
        # x: [batch_size, 3, H, W] in range [-1, 1]
        x = x * 0.5 + 0.5  # [-1, 1] -> [0, 1]
        # fixed transforms
        for tform in self.fixed_transforms:
            x = tform(x)
            if isinstance(x, tuple):
                x = x[0]

        # optional transforms
        ramp = np.min([(global_step - self.step0.cpu().item()) / self.ramp, 1.])
        try:
            if len(self.optional_transforms) > 0:
                tform_ids = torch.randint(len(self.optional_transforms), (self.n_optional,)).numpy()
                for tform_id in tform_ids:
                    tform = self.optional_transforms[tform_id]
                    x = tform(x, ramp=ramp)
                    if isinstance(x, tuple):
                        x = x[0]
        except Exception as e:
            print(tform_id, ramp)
            import pdb; pdb.set_trace()
        return x * 2 - 1  # [0, 1] -> [-1, 1]


if __name__ == '__main__':
    pass
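A hedged usage sketch for the TransformNet defined above, assuming it is importable as cldm.tmp.TransformNet, kornia and albumentations are installed, and inputs follow the [-1, 1] convention documented in forward(); the batch size and step values are arbitrary illustration values.

# minimal usage sketch (assumed import path; not part of the committed code)
import torch
from cldm.tmp import TransformNet

net = TransformNet(severity='medium', n_optional=2, ramp=1000, p=0.5)
net.activate(global_step=1)                 # marks the noise layer as active

x = torch.rand(4, 3, 256, 256) * 2 - 1      # batch in [-1, 1], as forward() expects
y = net(x, global_step=500)                 # augmented batch, back in [-1, 1]
print(y.shape)                              # (4, 3, 224, 224) after the 224x224 crop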
cldm/transformations.py
ADDED
@@ -0,0 +1,127 @@
import os
from . import utils
import torch
import numpy as np
from torch import nn
import torch.nn.functional as F
from tools.augment_imagenetc import RandomImagenetC
from PIL import Image
import kornia as ko
# from kornia.augmentation import RandomHorizontalFlip, RandomCrop


class TransformNet(nn.Module):
    def __init__(self, rnd_bri=0.3, rnd_hue=0.1, do_jpeg=False, jpeg_quality=50, rnd_noise=0.02, rnd_sat=1.0, rnd_trans=0.1, contrast=[0.5, 1.5], rnd_flip=False, ramp=1000, imagenetc_level=0, crop_mode='crop') -> None:
        super().__init__()
        self.rnd_bri = rnd_bri
        self.rnd_hue = rnd_hue
        self.jpeg_quality = jpeg_quality
        self.rnd_noise = rnd_noise
        self.rnd_sat = rnd_sat
        self.rnd_trans = rnd_trans
        self.contrast_low, self.contrast_high = contrast
        self.do_jpeg = do_jpeg
        p_flip = 0.5 if rnd_flip else 0
        self.rnd_flip = ko.augmentation.RandomHorizontalFlip(p_flip)
        self.ramp = ramp
        self.register_buffer('step0', torch.tensor(0))  # large number
        assert crop_mode in ['crop', 'resized_crop']
        if crop_mode == 'crop':
            self.rnd_crop = ko.augmentation.RandomCrop((224,224), cropping_mode="resample")
        elif crop_mode == 'resized_crop':
            self.rnd_crop = ko.augmentation.RandomResizedCrop(size=(224,224), scale=(0.7, 1.0), ratio=(3.0/4, 4.0/3), cropping_mode='resample')
        if imagenetc_level > 0:
            self.imagenetc = ImagenetCTransform(max_severity=imagenetc_level)

    def activate(self, global_step):
        if self.step0 == 0:
            print(f'[TRAINING] Activating TransformNet at step {global_step}')
            self.step0 = torch.tensor(global_step)

    def is_activated(self):
        return self.step0 > 0

    def forward(self, x, global_step, p=0.9):
        # x: [batch_size, 3, H, W] in range [-1, 1]
        x = x * 0.5 + 0.5  # [-1, 1] -> [0, 1]

        # flip
        x = self.rnd_flip(x)
        # random crop
        x = self.rnd_crop(x)
        if isinstance(x, tuple):
            x = x[0]  # weird bug in kornia 0.6.0 that returns transform matrix occasionally

        if torch.rand(1)[0] >= p:
            return x * 2 - 1  # [0, 1] -> [-1, 1]
        if hasattr(self, 'imagenetc') and torch.rand(1)[0] < 0.5:
            x = self.imagenetc(x * 2 - 1)  # [0, 1] -> [-1, 1]
            return x

        batch_size, sh, device = x.shape[0], x.size(), x.device
        # x0 = x.clone().detach()
        ramp_fn = lambda ramp: np.min([(global_step - self.step0.cpu().item()) / ramp, 1.])

        rnd_bri = ramp_fn(self.ramp) * self.rnd_bri
        rnd_hue = ramp_fn(self.ramp) * self.rnd_hue
        rnd_brightness = utils.get_rnd_brightness_torch(rnd_bri, rnd_hue, batch_size).to(device)  # [batch_size, 3, 1, 1]
        rnd_noise = torch.rand(1)[0] * ramp_fn(self.ramp) * self.rnd_noise

        contrast_low = 1. - (1. - self.contrast_low) * ramp_fn(self.ramp)
        contrast_high = 1. + (self.contrast_high - 1.) * ramp_fn(self.ramp)
        contrast_params = [contrast_low, contrast_high]

        # blur
        N_blur = 7
        f = utils.random_blur_kernel(probs=[.25, .25], N_blur=N_blur, sigrange_gauss=[1., 3.], sigrange_line=[.25, 1.],
                                     wmin_line=3).to(device)
        x = F.conv2d(x, f, bias=None, padding=int((N_blur - 1) / 2))

        # noise
        noise = torch.normal(mean=0, std=rnd_noise, size=x.size(), dtype=torch.float32).to(device)
        x = x + noise
        x = torch.clamp(x, 0, 1)

        # contrast & brightness
        contrast_scale = torch.Tensor(x.size()[0]).uniform_(contrast_params[0], contrast_params[1])
        contrast_scale = contrast_scale.reshape(x.size()[0], 1, 1, 1).to(device)
        x = x * contrast_scale
        x = x + rnd_brightness
        x = torch.clamp(x, 0, 1)

        # saturation
        # rnd_sat = torch.rand(1)[0] * ramp_fn(self.ramp) * self.rnd_sat
        # sat_weight = torch.FloatTensor([.3, .6, .1]).reshape(1, 3, 1, 1).to(device)
        # encoded_image_lum = torch.mean(x * sat_weight, dim=1).unsqueeze_(1)
        # x = (1 - rnd_sat) * x + rnd_sat * encoded_image_lum
        rnd_sat = (torch.rand(1)[0] * 2.0 - 1.0) * ramp_fn(self.ramp) * self.rnd_sat + 1.0
        x = ko.enhance.adjust.adjust_saturation(x, rnd_sat)

        # jpeg
        x = x.reshape(sh)
        if self.do_jpeg:
            jpeg_quality = 100. - torch.rand(1)[0] * ramp_fn(self.ramp) * (100. - self.jpeg_quality)
            x = utils.jpeg_compress_decompress(x, rounding=utils.round_only_at_0, quality=jpeg_quality)

        x = x * 2 - 1  # [0, 1] -> [-1, 1]
        return x


class ImagenetCTransform(nn.Module):
    def __init__(self, max_severity=5) -> None:
        super().__init__()
        self.max_severity = max_severity
        self.tform = RandomImagenetC(max_severity=max_severity, phase='train')

    def forward(self, x):
        # x: [batch_size, 3, H, W] in range [-1, 1]
        img0 = x.detach().cpu().numpy()
        img = img0 * 127.5 + 127.5  # [-1, 1] -> [0, 255]
        img = img.transpose(0, 2, 3, 1).astype(np.uint8)
        img = [Image.fromarray(i) for i in img]
        img = [self.tform(i) for i in img]
        img = np.array([np.array(i) for i in img], dtype=np.float32)
        img = img.transpose(0, 3, 1, 2) / 127.5 - 1.  # [0, 255] -> [-1, 1]
        residual = torch.from_numpy(img - img0).to(x.device)
        x = x + residual
        return x
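ImagenetCTransform above applies a non-differentiable PIL-based corruption outside the autograd graph and re-attaches the result as a constant residual, so the forward pass sees the corrupted pixels while gradients flow through `x` unchanged. Below is a minimal, self-contained sketch of that straight-through pattern in isolation; the toy quantisation function is only a stand-in for the ImageNet-C pipeline.

# straight-through residual trick, toy version
import torch

def non_differentiable_corruption(arr):
    # stand-in for the PIL / ImageNet-C corruption chain
    return (arr * 4).round() / 4

x = torch.rand(1, 3, 8, 8, requires_grad=True)
with torch.no_grad():
    corrupted = non_differentiable_corruption(x.detach())
residual = corrupted - x.detach()     # constant with respect to the graph
y = x + residual                      # forward: corrupted values; backward: identity
y.sum().backward()
print(x.grad.abs().sum())             # all-ones gradient, i.e. passed straight through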
cldm/transformations2.py
ADDED
@@ -0,0 +1,415 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
use kornia and albumentations for transformations
@author: Tu Bui @University of Surrey
"""
import os
from . import utils
import torch
import numpy as np
from torch import nn
import torch.nn.functional as thf
from PIL import Image
import kornia as ko
import albumentations as ab
from torchvision import transforms


class IdentityAugment(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x, **kwargs):
        return x


class RandomCompress(nn.Module):
    def __init__(self, severity='medium', p=0.5):
        super().__init__()
        self.p = p
        if severity == 'low':
            self.jpeg_quality = 70
        elif severity == 'medium':
            self.jpeg_quality = 50
        elif severity == 'high':
            self.jpeg_quality = 40

    def forward(self, x, ramp=1.):
        # x (B, C, H, W) in range [0, 1]
        # ramp: adjust the ramping of the compression, 1.0 means min quality = self.jpeg_quality
        if torch.rand(1)[0] >= self.p:
            return x
        jpeg_quality = 100. - torch.rand(1)[0] * ramp * (100. - self.jpeg_quality)
        x = utils.jpeg_compress_decompress(x, rounding=utils.round_only_at_0, quality=jpeg_quality)
        return x


class RandomBoxBlur(nn.Module):
    def __init__(self, severity='medium', border_type='reflect', normalized=True, p=0.5):
        super().__init__()
        self.p = p
        if severity == 'low':
            kernel_size = 3
        elif severity == 'medium':
            kernel_size = 5
        elif severity == 'high':
            kernel_size = 7

        self.tform = ko.augmentation.RandomBoxBlur(kernel_size=(kernel_size, kernel_size), border_type=border_type, normalized=normalized, p=self.p)

    def forward(self, x, **kwargs):
        return self.tform(x)


class RandomMedianBlur(nn.Module):
    def __init__(self, severity='medium', p=0.5):
        super().__init__()
        self.p = p
        self.tform = ko.augmentation.RandomMedianBlur(kernel_size=(3,3), p=p)

    def forward(self, x, **kwargs):
        return self.tform(x)


class RandomBrightness(nn.Module):
    def __init__(self, severity='medium', p=0.5):
        super().__init__()
        self.p = p
        if severity == 'low':
            brightness = (0.9, 1.1)
        elif severity == 'medium':
            brightness = (0.75, 1.25)
        elif severity == 'high':
            brightness = (0.5, 1.5)
        self.tform = ko.augmentation.RandomBrightness(brightness=brightness, p=p)

    def forward(self, x, **kwargs):
        return self.tform(x)


class RandomContrast(nn.Module):
    def __init__(self, severity='medium', p=0.5):
        super().__init__()
        self.p = p
        if severity == 'low':
            contrast = (0.9, 1.1)
        elif severity == 'medium':
            contrast = (0.75, 1.25)
        elif severity == 'high':
            contrast = (0.5, 1.5)
        self.tform = ko.augmentation.RandomContrast(contrast=contrast, p=p)

    def forward(self, x, **kwargs):
        return self.tform(x)


class RandomSaturation(nn.Module):
    def __init__(self, severity='medium', p=0.5):
        super().__init__()
        self.p = p
        if severity == 'low':
            sat = (0.9, 1.1)
        elif severity == 'medium':
            sat = (0.75, 1.25)
        elif severity == 'high':
            sat = (0.5, 1.5)
        self.tform = ko.augmentation.RandomSaturation(saturation=sat, p=p)

    def forward(self, x, **kwargs):
        return self.tform(x)


class RandomSharpness(nn.Module):
    def __init__(self, severity='medium', p=0.5):
        super().__init__()
        self.p = p
        if severity == 'low':
            sharpness = 0.5
        elif severity == 'medium':
            sharpness = 1.0
        elif severity == 'high':
            sharpness = 2.5
        self.tform = ko.augmentation.RandomSharpness(sharpness=sharpness, p=p)

    def forward(self, x, **kwargs):
        return self.tform(x)


class RandomColorJiggle(nn.Module):
    def __init__(self, severity='medium', p=0.5):
        super().__init__()
        self.p = p
        if severity == 'low':
            factor = (0.05, 0.05, 0.05, 0.01)
        elif severity == 'medium':
            factor = (0.1, 0.1, 0.1, 0.02)
        elif severity == 'high':
            factor = (0.1, 0.1, 0.1, 0.05)
        self.tform = ko.augmentation.ColorJiggle(*factor, p=p)

    def forward(self, x, **kwargs):
        return self.tform(x)


class RandomHue(nn.Module):
    def __init__(self, severity='medium', p=0.5):
        super().__init__()
        self.p = p
        if severity == 'low':
            hue = 0.01
        elif severity == 'medium':
            hue = 0.02
        elif severity == 'high':
            hue = 0.05
        self.tform = ko.augmentation.RandomHue(hue=(-hue, hue), p=p)

    def forward(self, x, **kwargs):
        return self.tform(x)


class RandomGamma(nn.Module):
    def __init__(self, severity='medium', p=0.5):
        super().__init__()
        self.p = p
        if severity == 'low':
            gamma, gain = (0.9, 1.1), (0.9, 1.1)
        elif severity == 'medium':
            gamma, gain = (0.75, 1.25), (0.75, 1.25)
        elif severity == 'high':
            gamma, gain = (0.5, 1.5), (0.5, 1.5)
        self.tform = ko.augmentation.RandomGamma(gamma, gain, p=p)

    def forward(self, x, **kwargs):
        return self.tform(x)


class RandomGaussianBlur(nn.Module):
    def __init__(self, severity='medium', p=0.5):
        super().__init__()
        self.p = p
        if severity == 'low':
            kernel_size, sigma = 3, (0.1, 1.0)
        elif severity == 'medium':
            kernel_size, sigma = 5, (0.1, 1.5)
        elif severity == 'high':
            kernel_size, sigma = 7, (0.1, 2.0)
        self.tform = ko.augmentation.RandomGaussianBlur(kernel_size=(kernel_size, kernel_size), sigma=sigma, p=self.p)

    def forward(self, x, **kwargs):
        return self.tform(x)


class RandomGaussianNoise(nn.Module):
    def __init__(self, severity='medium', p=0.5):
        super().__init__()
        self.p = p
        if severity == 'low':
            std = 0.02
        elif severity == 'medium':
            std = 0.04
        elif severity == 'high':
            std = 0.08
        self.tform = ko.augmentation.RandomGaussianNoise(mean=0., std=std, p=p)

    def forward(self, x, **kwargs):
        return self.tform(x)


class RandomMotionBlur(nn.Module):
    def __init__(self, severity='medium', p=0.5):
        super().__init__()
        self.p = p
        if severity == 'low':
            kernel_size, angle, direction = (3, 5), (-25, 25), (-0.25, 0.25)
        elif severity == 'medium':
            kernel_size, angle, direction = (3, 7), (-45, 45), (-0.5, 0.5)
        elif severity == 'high':
            kernel_size, angle, direction = (3, 9), (-90, 90), (-1.0, 1.0)
        self.tform = ko.augmentation.RandomMotionBlur(kernel_size, angle, direction, p=p)

    def forward(self, x, **kwargs):
        return self.tform(x)


class RandomPosterize(nn.Module):
    def __init__(self, severity='medium', p=0.5):
        super().__init__()
        self.p = p
        if severity == 'low':
            bits = 5
        elif severity == 'medium':
            bits = 4
        elif severity == 'high':
            bits = 3
        self.tform = ko.augmentation.RandomPosterize(bits=bits, p=p)

    def forward(self, x, **kwargs):
        return self.tform(x)


class RandomRGBShift(nn.Module):
    def __init__(self, severity='medium', p=0.5):
        super().__init__()
        self.p = p
        if severity == 'low':
            rgb = 0.02
        elif severity == 'medium':
            rgb = 0.05
        elif severity == 'high':
            rgb = 0.1
        self.tform = ko.augmentation.RandomRGBShift(r_shift_limit=rgb, g_shift_limit=rgb, b_shift_limit=rgb, p=p)

    def forward(self, x, **kwargs):
        return self.tform(x)


class TransformNet(nn.Module):
    def __init__(self, flip=True, crop_mode='random_crop', compress=True, brightness=True, contrast=True, color_jiggle=True, gamma=False, grayscale=True, gaussian_blur=True, gaussian_noise=True, hue=True, motion_blur=True, posterize=True, rgb_shift=True, saturation=True, sharpness=True, median_blur=True, box_blur=True, severity='medium', n_optional=2, ramp=1000, p=0.5):
        super().__init__()
        self.n_optional = n_optional
        self.p = p
        p_flip = 0.5 if flip else 0
        rnd_flip_layer = ko.augmentation.RandomHorizontalFlip(p_flip)
        self.ramp = ramp
        self.register_buffer('step0', torch.tensor(0))

        self.crop_mode = crop_mode
        assert crop_mode in ['random_crop', 'resized_crop']
        if crop_mode == 'random_crop':
            rnd_crop_layer = ko.augmentation.RandomCrop((224,224), cropping_mode="resample")
        elif crop_mode == 'resized_crop':
            rnd_crop_layer = ko.augmentation.RandomResizedCrop(size=(224,224), scale=(0.7, 1.0), ratio=(3.0/4, 4.0/3), cropping_mode='resample')

        self.fixed_transforms = [rnd_flip_layer, rnd_crop_layer]
        if compress:
            self.register(RandomCompress(severity, p=p), 'Random Compress')
        if brightness:
            self.register(RandomBrightness(severity, p=p), 'Random Brightness')
        if contrast:
            self.register(RandomContrast(severity, p=p), 'Random Contrast')
        if color_jiggle:
            self.register(RandomColorJiggle(severity, p=p), 'Random Color')
        if gamma:
            self.register(RandomGamma(severity, p=p), 'Random Gamma')
        if grayscale:
            self.register(ko.augmentation.RandomGrayscale(p=p), 'Grayscale')
        if gaussian_blur:
            self.register(RandomGaussianBlur(severity, p=p), 'Random Gaussian Blur')
        if gaussian_noise:
            self.register(RandomGaussianNoise(severity, p=p), 'Random Gaussian Noise')
        if hue:
            self.register(RandomHue(severity, p=p), 'Random Hue')
        if motion_blur:
            self.register(RandomMotionBlur(severity, p=p), 'Random Motion Blur')
        if posterize:
            self.register(RandomPosterize(severity, p=p), 'Random Posterize')
        if rgb_shift:
            self.register(RandomRGBShift(severity, p=p), 'Random RGB Shift')
        if saturation:
            self.register(RandomSaturation(severity, p=p), 'Random Saturation')
        if sharpness:
            self.register(RandomSharpness(severity, p=p), 'Random Sharpness')
        if median_blur:
            self.register(RandomMedianBlur(severity, p=p), 'Random Median Blur')
        if box_blur:
            self.register(RandomBoxBlur(severity, p=p), 'Random Box Blur')

    def register(self, tform, name):
        # register a new (optional) transform
        if not hasattr(self, 'optional_transforms'):
            self.optional_transforms = []
            self.optional_names = []
        self.optional_transforms.append(tform)
        self.optional_names.append(name)

    def activate(self, global_step):
        if self.step0 == 0:
            print(f'[TRAINING] Activating TransformNet at step {global_step}')
            self.step0 = torch.tensor(global_step)

    def is_activated(self):
        return self.step0 > 0

    def forward(self, x, global_step, p=0.9):
        # x: [batch_size, 3, H, W] in range [-1, 1]
        x = x * 0.5 + 0.5  # [-1, 1] -> [0, 1]
        # fixed transforms
        for tform in self.fixed_transforms:
            x = tform(x)
            if isinstance(x, tuple):
                x = x[0]

        # optional transforms
        ramp = np.min([(global_step - self.step0.cpu().item()) / self.ramp, 1.])
        if len(self.optional_transforms) > 0:
            tform_ids = torch.randint(len(self.optional_transforms), (self.n_optional,)).numpy()
            for tform_id in tform_ids:
                tform = self.optional_transforms[tform_id]
                x = tform(x, ramp=ramp)
                if isinstance(x, tuple):
                    x = x[0]

        return x * 2 - 1  # [0, 1] -> [-1, 1]

    def transform_by_id(self, x, tform_id):
        # x: [batch_size, 3, H, W] in range [-1, 1]
        x = x * 0.5 + 0.5  # [-1, 1] -> [0, 1]
        # fixed transforms
        for tform in self.fixed_transforms:
            x = tform(x)
            if isinstance(x, tuple):
                x = x[0]

        # optional transforms
        tform = self.optional_transforms[tform_id]
        x = tform(x)
        if isinstance(x, tuple):
            x = x[0]
        return x * 2 - 1  # [0, 1] -> [-1, 1]

    def transform_by_name(self, x, tform_name):
        assert tform_name in self.optional_names
        tform_id = self.optional_names.index(tform_name)
        return self.transform_by_id(x, tform_id)

    def apply_transform_on_pil_image(self, x, tform_name):
        # x: PIL image
        # return: PIL image
        assert tform_name in self.optional_names + ['Fixed Augment']
        # if tform_name == 'Random Crop':  # the only transform dependent on image size
        #     # crop equivalent to 224/256
        #     w, h = x.size
        #     new_w, new_h = int(224 / 256 * w), int(224 / 256 * h)
        #     x = transforms.RandomCrop((new_h, new_w))(x)
        #     return x

        # x = np.array(x).astype(np.float32) / 255.  # [0, 255] -> [0, 1]
        # x = torch.from_numpy(x).permute(2, 0, 1).unsqueeze(0)  # [1, 3, H, W]
        # if tform_name == 'Random Flip':
        #     x = self.fixed_transforms[0](x)
        # else:
        #     tform_id = self.optional_names.index(tform_name)
        #     tform = self.optional_transforms[tform_id]
        #     x = tform(x)
        #     if isinstance(x, tuple):
        #         x = x[0]
        # x = x.detach().squeeze(0).permute(1, 2, 0).numpy() * 255  # [0, 1] -> [0, 255]
        # return Image.fromarray(x.astype(np.uint8))

        w, h = x.size
        x = x.resize((256, 256), Image.BILINEAR)
        x = np.array(x).astype(np.float32) / 255.  # [0, 255] -> [0, 1]
        x = torch.from_numpy(x).permute(2, 0, 1).unsqueeze(0)  # [1, 3, H, W]
        if tform_name == 'Fixed Augment':
            for tform in self.fixed_transforms:
                x = tform(x)
                if isinstance(x, tuple):
                    x = x[0]
        else:
            tform_id = self.optional_names.index(tform_name)
            tform = self.optional_transforms[tform_id]
            x = tform(x)
            if isinstance(x, tuple):
                x = x[0]
        x = x.detach().squeeze(0).permute(1, 2, 0).numpy() * 255  # [0, 1] -> [0, 255]
        x = Image.fromarray(x.astype(np.uint8))
        if (tform_name == 'Random Crop') and (self.crop_mode == 'random_crop'):
            w, h = int(224 / 256 * w), int(224 / 256 * h)
            x = x.resize((w, h), Image.BILINEAR)
        return x


if __name__ == '__main__':
    pass
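A hedged usage sketch for this second TransformNet variant, assuming it is importable as cldm.transformations2.TransformNet and that the transform names match those registered in __init__ above; input sizes and step values are illustrative only.

# minimal usage sketch (assumed import path; not part of the committed code)
import torch
from cldm.transformations2 import TransformNet

net = TransformNet(severity='medium', n_optional=2)
x = torch.rand(2, 3, 256, 256) * 2 - 1           # batch in [-1, 1]

# apply one named optional transform after the fixed flip/crop
y = net.transform_by_name(x, 'Random Gaussian Blur')

# or sample n_optional random transforms as done during training
y_train = net(x, global_step=500)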
cldm/utils.py
ADDED
@@ -0,0 +1,539 @@
import cv2
import itertools
import numpy as np
import random
import torch
import torch.nn.functional as F
import torch.nn as nn

from PIL import Image, ImageOps
import matplotlib.pyplot as plt

def random_blur_kernel(probs, N_blur, sigrange_gauss, sigrange_line, wmin_line):
    N = N_blur
    coords = torch.from_numpy(np.stack(np.meshgrid(range(N_blur), range(N_blur), indexing='ij'), axis=-1)) - (0.5 * (N-1))  # (7,7,2)
    manhat = torch.sum(torch.abs(coords), dim=-1)  # (7, 7)

    # nothing, default
    vals_nothing = (manhat < 0.5).float()  # (7, 7)

    # gauss
    sig_gauss = torch.rand(1)[0] * (sigrange_gauss[1] - sigrange_gauss[0]) + sigrange_gauss[0]
    vals_gauss = torch.exp(-torch.sum(coords ** 2, dim=-1) / 2. / sig_gauss ** 2)

    # line
    theta = torch.rand(1)[0] * 2. * np.pi
    v = torch.FloatTensor([torch.cos(theta), torch.sin(theta)])  # (2)
    dists = torch.sum(coords * v, dim=-1)  # (7, 7)

    sig_line = torch.rand(1)[0] * (sigrange_line[1] - sigrange_line[0]) + sigrange_line[0]
    w_line = torch.rand(1)[0] * (0.5 * (N-1) + 0.1 - wmin_line) + wmin_line

    vals_line = torch.exp(-dists ** 2 / 2. / sig_line ** 2) * (manhat < w_line)  # (7, 7)

    t = torch.rand(1)[0]
    vals = vals_nothing
    if t < (probs[0] + probs[1]):
        vals = vals_line
    else:
        vals = vals
    if t < probs[0]:
        vals = vals_gauss
    else:
        vals = vals

    v = vals / torch.sum(vals)  # normalise (7, 7)
    z = torch.zeros_like(v)
    f = torch.stack([v, z, z, z, v, z, z, z, v], dim=0).reshape([3, 3, N, N])
    return f


def get_rand_transform_matrix(image_size, d, batch_size):
    Ms = np.zeros((batch_size, 2, 3, 3))
    for i in range(batch_size):
        tl_x = random.uniform(-d, d)     # Top left corner, top
        tl_y = random.uniform(-d, d)     # Top left corner, left
        bl_x = random.uniform(-d, d)     # Bot left corner, bot
        bl_y = random.uniform(-d, d)     # Bot left corner, left
        tr_x = random.uniform(-d, d)     # Top right corner, top
        tr_y = random.uniform(-d, d)     # Top right corner, right
        br_x = random.uniform(-d, d)     # Bot right corner, bot
        br_y = random.uniform(-d, d)     # Bot right corner, right

        rect = np.array([
            [tl_x, tl_y],
            [tr_x + image_size, tr_y],
            [br_x + image_size, br_y + image_size],
            [bl_x, bl_y + image_size]], dtype="float32")

        dst = np.array([
            [0, 0],
            [image_size, 0],
            [image_size, image_size],
            [0, image_size]], dtype="float32")

        M = cv2.getPerspectiveTransform(rect, dst)
        M_inv = np.linalg.inv(M)
        Ms[i, 0, :, :] = M_inv
        Ms[i, 1, :, :] = M
    Ms = torch.from_numpy(Ms).float()

    return Ms


def get_rnd_brightness_torch(rnd_bri, rnd_hue, batch_size):
    rnd_hue = torch.FloatTensor(batch_size, 3, 1, 1).uniform_(-rnd_hue, rnd_hue)
    rnd_brightness = torch.FloatTensor(batch_size, 1, 1, 1).uniform_(-rnd_bri, rnd_bri)
    return rnd_hue + rnd_brightness


# reference: https://github.com/mlomnitz/DiffJPEG.git
y_table = np.array(
    [[16, 11, 10, 16, 24, 40, 51, 61],
     [12, 12, 14, 19, 26, 58, 60, 55],
     [14, 13, 16, 24, 40, 57, 69, 56],
     [14, 17, 22, 29, 51, 87, 80, 62],
     [18, 22, 37, 56, 68, 109, 103, 77],
     [24, 35, 55, 64, 81, 104, 113, 92],
     [49, 64, 78, 87, 103, 121, 120, 101],
     [72, 92, 95, 98, 112, 100, 103, 99]],
    dtype=np.float32).T

y_table = nn.Parameter(torch.from_numpy(y_table))
c_table = np.empty((8, 8), dtype=np.float32)
c_table.fill(99)
c_table[:4, :4] = np.array([[17, 18, 24, 47], [18, 21, 26, 66],
                            [24, 26, 56, 99], [47, 66, 99, 99]]).T
c_table = nn.Parameter(torch.from_numpy(c_table))

# 1. RGB -> YCbCr
class rgb_to_ycbcr_jpeg(nn.Module):
    """ Converts RGB image to YCbCr
    Input:
        image(tensor): batch x 3 x height x width
    Output:
        result(tensor): batch x height x width x 3
    """
    def __init__(self):
        super(rgb_to_ycbcr_jpeg, self).__init__()
        matrix = np.array(
            [[0.299, 0.587, 0.114], [-0.168736, -0.331264, 0.5],
             [0.5, -0.418688, -0.081312]], dtype=np.float32).T
        self.shift = nn.Parameter(torch.tensor([0., 128., 128.]))
        self.matrix = nn.Parameter(torch.from_numpy(matrix))

    def forward(self, image):
        image = image.permute(0, 2, 3, 1)
        result = torch.tensordot(image, self.matrix, dims=1) + self.shift
        result.view(image.shape)
        return result

# 2. Chroma subsampling
class chroma_subsampling(nn.Module):
    """ Chroma subsampling on CbCr channels
    Input:
        image(tensor): batch x height x width x 3
    Output:
        y(tensor): batch x height x width
        cb(tensor): batch x height/2 x width/2
        cr(tensor): batch x height/2 x width/2
    """
    def __init__(self):
        super(chroma_subsampling, self).__init__()

    def forward(self, image):
        image_2 = image.permute(0, 3, 1, 2).clone()
        avg_pool = nn.AvgPool2d(kernel_size=2, stride=(2, 2),
                                count_include_pad=False)
        cb = avg_pool(image_2[:, 1, :, :].unsqueeze(1))
        cr = avg_pool(image_2[:, 2, :, :].unsqueeze(1))
        cb = cb.permute(0, 2, 3, 1)
        cr = cr.permute(0, 2, 3, 1)
        return image[:, :, :, 0], cb.squeeze(3), cr.squeeze(3)

# 3. Block splitting
class block_splitting(nn.Module):
    """ Splitting image into patches
    Input:
        image(tensor): batch x height x width
    Output:
        patch(tensor): batch x h*w/64 x h x w
    """
    def __init__(self):
        super(block_splitting, self).__init__()
        self.k = 8

    def forward(self, image):
        height, width = image.shape[1:3]
        batch_size = image.shape[0]
        image_reshaped = image.view(batch_size, height // self.k, self.k, -1, self.k)
        image_transposed = image_reshaped.permute(0, 1, 3, 2, 4)
        return image_transposed.contiguous().view(batch_size, -1, self.k, self.k)

# 4. DCT
class dct_8x8(nn.Module):
    """ Discrete Cosine Transformation
    Input:
        image(tensor): batch x height x width
    Output:
        dcp(tensor): batch x height x width
    """
    def __init__(self):
        super(dct_8x8, self).__init__()
        tensor = np.zeros((8, 8, 8, 8), dtype=np.float32)
        for x, y, u, v in itertools.product(range(8), repeat=4):
            tensor[x, y, u, v] = np.cos((2 * x + 1) * u * np.pi / 16) * np.cos(
                (2 * y + 1) * v * np.pi / 16)
        alpha = np.array([1. / np.sqrt(2)] + [1] * 7)
        #
        self.tensor = nn.Parameter(torch.from_numpy(tensor).float())
        self.scale = nn.Parameter(torch.from_numpy(np.outer(alpha, alpha) * 0.25).float())

    def forward(self, image):
        image = image - 128
        result = self.scale * torch.tensordot(image, self.tensor, dims=2)
        result.view(image.shape)
        return result

# 5. Quantization
class y_quantize(nn.Module):
    """ JPEG Quantization for Y channel
    Input:
        image(tensor): batch x height x width
        rounding(function): rounding function to use
        factor(float): Degree of compression
    Output:
        image(tensor): batch x height x width
    """
    def __init__(self, rounding, factor=1):
        super(y_quantize, self).__init__()
        self.rounding = rounding
        self.factor = factor
        self.y_table = y_table

    def forward(self, image):
        image = image.float() / (self.y_table * self.factor)
        image = self.rounding(image)
        return image


class c_quantize(nn.Module):
    """ JPEG Quantization for CbCr channels
    Input:
        image(tensor): batch x height x width
        rounding(function): rounding function to use
        factor(float): Degree of compression
    Output:
        image(tensor): batch x height x width
    """
    def __init__(self, rounding, factor=1):
        super(c_quantize, self).__init__()
        self.rounding = rounding
        self.factor = factor
        self.c_table = c_table

    def forward(self, image):
        image = image.float() / (self.c_table * self.factor)
        image = self.rounding(image)
        return image


class compress_jpeg(nn.Module):
    """ Full JPEG compression algorithm
    Input:
        imgs(tensor): batch x 3 x height x width
        rounding(function): rounding function to use
        factor(float): Compression factor
    Output:
        compressed(dict(tensor)): batch x h*w/64 x 8 x 8
    """
    def __init__(self, rounding=torch.round, factor=1):
        super(compress_jpeg, self).__init__()
        self.l1 = nn.Sequential(
            rgb_to_ycbcr_jpeg(),
            chroma_subsampling()
        )
        self.l2 = nn.Sequential(
            block_splitting(),
            dct_8x8()
        )
        self.c_quantize = c_quantize(rounding=rounding, factor=factor)
        self.y_quantize = y_quantize(rounding=rounding, factor=factor)

    def forward(self, image):
        y, cb, cr = self.l1(image*255)
        components = {'y': y, 'cb': cb, 'cr': cr}
        for k in components.keys():
            comp = self.l2(components[k])
            if k in ('cb', 'cr'):
                comp = self.c_quantize(comp)
            else:
                comp = self.y_quantize(comp)

            components[k] = comp

        return components['y'], components['cb'], components['cr']

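compress_jpeg above returns one tensor of quantized 8x8 DCT blocks per component; because of the 2x chroma subsampling, a 256x256 input should yield 1024 Y blocks and 256 blocks for each chroma channel. A minimal shape-check sketch of that reading of the code (not part of the repository):

import torch

compress = compress_jpeg(rounding=torch.round, factor=1)
img = torch.rand(1, 3, 256, 256)   # values in [0, 1]; the module scales by 255 internally
y, cb, cr = compress(img)
print(y.shape)    # expected: torch.Size([1, 1024, 8, 8])  -> 256*256 / 64 blocks
print(cb.shape)   # expected: torch.Size([1, 256, 8, 8])   -> 128*128 / 64 blocks
print(cr.shape)   # expected: torch.Size([1, 256, 8, 8])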
# -5. Dequantization
class y_dequantize(nn.Module):
    """ Dequantize Y channel
    Inputs:
        image(tensor): batch x height x width
        factor(float): compression factor
    Outputs:
        image(tensor): batch x height x width
    """
    def __init__(self, factor=1):
        super(y_dequantize, self).__init__()
        self.y_table = y_table
        self.factor = factor

    def forward(self, image):
        return image * (self.y_table * self.factor)


class c_dequantize(nn.Module):
    """ Dequantize CbCr channel
    Inputs:
        image(tensor): batch x height x width
        factor(float): compression factor
    Outputs:
        image(tensor): batch x height x width
    """
    def __init__(self, factor=1):
        super(c_dequantize, self).__init__()
        self.factor = factor
        self.c_table = c_table

    def forward(self, image):
        return image * (self.c_table * self.factor)

# -4. Inverse DCT
class idct_8x8(nn.Module):
    """ Inverse discrete Cosine Transformation
    Input:
        dcp(tensor): batch x height x width
    Output:
        image(tensor): batch x height x width
    """
    def __init__(self):
        super(idct_8x8, self).__init__()
        alpha = np.array([1. / np.sqrt(2)] + [1] * 7)
        self.alpha = nn.Parameter(torch.from_numpy(np.outer(alpha, alpha)).float())
        tensor = np.zeros((8, 8, 8, 8), dtype=np.float32)
        for x, y, u, v in itertools.product(range(8), repeat=4):
            tensor[x, y, u, v] = np.cos((2 * u + 1) * x * np.pi / 16) * np.cos(
                (2 * v + 1) * y * np.pi / 16)
        self.tensor = nn.Parameter(torch.from_numpy(tensor).float())

    def forward(self, image):
        image = image * self.alpha
        result = 0.25 * torch.tensordot(image, self.tensor, dims=2) + 128
        result.view(image.shape)
        return result

# -3. Block joining
class block_merging(nn.Module):
    """ Merge patches into image
    Inputs:
        patches(tensor): batch x height*width/64, height x width
        height(int)
        width(int)
    Output:
        image(tensor): batch x height x width
    """
    def __init__(self):
        super(block_merging, self).__init__()

    def forward(self, patches, height, width):
        k = 8
        batch_size = patches.shape[0]
        image_reshaped = patches.view(batch_size, height//k, width//k, k, k)
        image_transposed = image_reshaped.permute(0, 1, 3, 2, 4)
        return image_transposed.contiguous().view(batch_size, height, width)

# -2. Chroma upsampling
class chroma_upsampling(nn.Module):
    """ Upsample chroma layers
    Input:
        y(tensor): y channel image
        cb(tensor): cb channel
        cr(tensor): cr channel
    Output:
        image(tensor): batch x height x width x 3
    """
    def __init__(self):
        super(chroma_upsampling, self).__init__()

    def forward(self, y, cb, cr):
        def repeat(x, k=2):
            height, width = x.shape[1:3]
            x = x.unsqueeze(-1)
            x = x.repeat(1, 1, k, k)
            x = x.view(-1, height * k, width * k)
            return x

        cb = repeat(cb)
        cr = repeat(cr)

        return torch.cat([y.unsqueeze(3), cb.unsqueeze(3), cr.unsqueeze(3)], dim=3)

# -1: YCbCr -> RGB
class ycbcr_to_rgb_jpeg(nn.Module):
    """ Converts YCbCr image to RGB JPEG
    Input:
        image(tensor): batch x height x width x 3
    Output:
        result(tensor): batch x 3 x height x width
    """
    def __init__(self):
        super(ycbcr_to_rgb_jpeg, self).__init__()

        matrix = np.array(
            [[1., 0., 1.402], [1, -0.344136, -0.714136], [1, 1.772, 0]],
            dtype=np.float32).T
        self.shift = nn.Parameter(torch.tensor([0, -128., -128.]))
        self.matrix = nn.Parameter(torch.from_numpy(matrix))

    def forward(self, image):
        result = torch.tensordot(image + self.shift, self.matrix, dims=1)
        result.view(image.shape)
        return result.permute(0, 3, 1, 2)


class decompress_jpeg(nn.Module):
    """ Full JPEG decompression algorithm
    Input:
        compressed(dict(tensor)): batch x h*w/64 x 8 x 8
        rounding(function): rounding function to use
        factor(float): Compression factor
    Output:
        image(tensor): batch x 3 x height x width
    """
    def __init__(self, height, width, rounding=torch.round, factor=1):
        super(decompress_jpeg, self).__init__()
        self.c_dequantize = c_dequantize(factor=factor)
        self.y_dequantize = y_dequantize(factor=factor)
        self.idct = idct_8x8()
        self.merging = block_merging()
        self.chroma = chroma_upsampling()
        self.colors = ycbcr_to_rgb_jpeg()

        self.height, self.width = height, width

    def forward(self, y, cb, cr):
        components = {'y': y, 'cb': cb, 'cr': cr}
        for k in components.keys():
            if k in ('cb', 'cr'):
                comp = self.c_dequantize(components[k])
                height, width = int(self.height/2), int(self.width/2)
            else:
                comp = self.y_dequantize(components[k])
                height, width = self.height, self.width
            comp = self.idct(comp)
            components[k] = self.merging(comp, height, width)
        #
        image = self.chroma(components['y'], components['cb'], components['cr'])
        image = self.colors(image)

        image = torch.min(255*torch.ones_like(image),
                          torch.max(torch.zeros_like(image), image))
        return image/255

def diff_round(x):
    """ Differentiable rounding function
    Input:
        x(tensor)
    Output:
        x(tensor)
    """
    return torch.round(x) + (x - torch.round(x))**3

def round_only_at_0(x):
    cond = (torch.abs(x) < 0.5).float()
    return cond * (x ** 3) + (1 - cond) * x

def quality_to_factor(quality):
    """ Calculate factor corresponding to quality
    Input:
        quality(float): Quality for jpeg compression
    Output:
        factor(float): Compression factor
    """
    if quality < 50:
        quality = 5000. / quality
    else:
        quality = 200. - quality*2
    return quality / 100.

def jpeg_compress_decompress(image,
                             # downsample_c=True,
                             rounding=round_only_at_0,
                             quality=80):
    # image_r = image * 255
    height, width = image.shape[2:4]
    # orig_height, orig_width = height, width
    # if height % 16 != 0 or width % 16 != 0:
    #     # Round up to next multiple of 16
    #     height = ((height - 1) // 16 + 1) * 16
    #     width = ((width - 1) // 16 + 1) * 16

    #     vpad = height - orig_height
    #     wpad = width - orig_width
    #     top = vpad // 2
    #     bottom = vpad - top
    #     left = wpad // 2
    #     right = wpad - left
    #     #image = tf.pad(image, [[0, 0], [top, bottom], [left, right], [0, 0]], 'SYMMETRIC')
    #     image = torch.pad(image, [[0, 0], [0, vpad], [0, wpad], [0, 0]], 'reflect')

    factor = quality_to_factor(quality)

    compress = compress_jpeg(rounding=rounding, factor=factor).to(image.device)
    decompress = decompress_jpeg(height, width, rounding=rounding, factor=factor).to(image.device)

    y, cb, cr = compress(image)
    recovered = decompress(y, cb, cr)

    return recovered.contiguous()


if __name__ == '__main__':
    ''' test JPEG compress and decompress'''
    # img = Image.open('house.jpg')
    # img = np.array(img) / 255.
    # img_r = np.transpose(img, [2, 0, 1])
    # img_tensor = torch.from_numpy(img_r).unsqueeze(0).float()

    # recover = jpeg_compress_decompress(img_tensor)

    # recover_arr = recover.detach().squeeze(0).numpy()
    # recover_arr = np.transpose(recover_arr, [1, 2, 0])

    # plt.subplot(121)
    # plt.imshow(img)
    # plt.subplot(122)
    # plt.imshow(recover_arr)
    # plt.show()

    ''' test blur '''
    # blur

    img = Image.open('house.jpg')
    img = np.array(img) / 255.
    img_r = np.transpose(img, [2, 0, 1])
    img_tensor = torch.from_numpy(img_r).unsqueeze(0).float()
    print(img_tensor.shape)

    N_blur = 7
    f = random_blur_kernel(probs=[.25, .25], N_blur=N_blur, sigrange_gauss=[1., 3.], sigrange_line=[.25, 1.], wmin_line=3)
    # print(f.shape)
    # print(type(f))
    encoded_image = F.conv2d(img_tensor, f, bias=None, padding=int((N_blur-1)/2))

    encoded_image = encoded_image.detach().squeeze(0).numpy()
    encoded_image = np.transpose(encoded_image, [1, 2, 0])

    plt.subplot(121)
    plt.imshow(img)
    plt.subplot(122)
    plt.imshow(encoded_image)
    plt.show()
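quality_to_factor maps the usual JPEG quality setting onto the multiplier applied to the quantization tables: quality 80 gives (200 - 2*80)/100 = 0.4, while quality 25 gives 5000/25/100 = 2.0, so lower quality means coarser quantization. A minimal round-trip sketch using the functions above (inputs and sizes are illustrative; height and width are kept at multiples of 16 as the commented-out padding suggests):

import torch

x = torch.rand(1, 3, 256, 256)                        # image batch in [0, 1]
x_jpeg = jpeg_compress_decompress(x, quality=50)      # differentiable JPEG approximation at quality 50
assert x_jpeg.shape == x.shape
print(quality_to_factor(80), quality_to_factor(25))   # 0.4, 2.0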
flae/models.py
ADDED
@@ -0,0 +1,325 @@
import math
import torch
from torch import nn
from torch.nn import functional as thf
import pytorch_lightning as pl
from ldm.util import instantiate_from_config
import einops
import kornia
import numpy as np
import torchvision
from contextlib import contextmanager
from ldm.modules.ema import LitEma


class FlAE(pl.LightningModule):
    def __init__(self,
                 cover_key,
                 secret_key,
                 secret_len,
                 resolution,
                 secret_encoder_config,
                 secret_decoder_config,
                 loss_config,
                 noise_config='__none__',
                 ckpt_path="__none__",
                 use_ema=False
                 ):
        super().__init__()
        self.cover_key = cover_key
        self.secret_key = secret_key
        secret_encoder_config.params.secret_len = secret_len
        secret_decoder_config.params.secret_len = secret_len
        secret_encoder_config.params.resolution = resolution
        secret_decoder_config.params.resolution = 224
        self.encoder = instantiate_from_config(secret_encoder_config)
        self.decoder = instantiate_from_config(secret_decoder_config)
        self.loss_layer = instantiate_from_config(loss_config)
        if noise_config != '__none__':
            print('Using noise')
            self.noise = instantiate_from_config(noise_config)

        self.use_ema = use_ema
        if self.use_ema:
            print('Using EMA')
            self.encoder_ema = LitEma(self.encoder)
            self.decoder_ema = LitEma(self.decoder)
            print(f"Keeping EMAs of {len(list(self.encoder_ema.buffers()) + list(self.decoder_ema.buffers()))}.")

        if ckpt_path != "__none__":
            self.init_from_ckpt(ckpt_path, ignore_keys=[])

        # early training phase
        self.fixed_img = None
        self.fixed_secret = None
        self.register_buffer("fixed_input", torch.tensor(True))
        self.crop = kornia.augmentation.CenterCrop((224, 224), cropping_mode="resample")  # early training phase

    def init_from_ckpt(self, path, ignore_keys=list()):
        sd = torch.load(path, map_location="cpu")["state_dict"]
        keys = list(sd.keys())
        for k in keys:
            for ik in ignore_keys:
                if k.startswith(ik):
                    print("Deleting key {} from state_dict.".format(k))
                    del sd[k]
        self.load_state_dict(sd, strict=False)
        print(f"Restored from {path}")

    @contextmanager
    def ema_scope(self, context=None):
        if self.use_ema:
            self.encoder_ema.store(self.encoder.parameters())
            self.decoder_ema.store(self.decoder.parameters())
            self.encoder_ema.copy_to(self.encoder)
            self.decoder_ema.copy_to(self.decoder)
            if context is not None:
                print(f"{context}: Switched to EMA weights")
        try:
            yield None
        finally:
            if self.use_ema:
                self.encoder_ema.restore(self.encoder.parameters())
                self.decoder_ema.restore(self.decoder.parameters())
                if context is not None:
                    print(f"{context}: Restored training weights")

    def on_train_batch_end(self, *args, **kwargs):
        if self.use_ema:
            self.encoder_ema(self.encoder)
            self.decoder_ema(self.decoder)

    @torch.no_grad()
    def get_input(self, batch, bs=None):
        image = batch[self.cover_key]
        secret = batch[self.secret_key]
        if bs is not None:
            image = image[:bs]
            secret = secret[:bs]
        else:
            bs = image.shape[0]
        # encode image 1st stage
        image = einops.rearrange(image, "b h w c -> b c h w").contiguous()

        # check if using fixed input (early training phase)
        # if self.training and self.fixed_input:
        if self.fixed_input:
            if self.fixed_img is None:  # first iteration
                print('[TRAINING] Warmup - using fixed input image for now!')
                self.fixed_img = image.detach().clone()[:bs]
                self.fixed_secret = secret.detach().clone()[:bs]  # use for log_images with fixed_input option only
            image = self.fixed_img
            new_bs = min(secret.shape[0], image.shape[0])
            image, secret = image[:new_bs], secret[:new_bs]

        out = [image, secret]
        return out

    def forward(self, cover, secret):
        # return a tuple (stego, residual)
        enc_out = self.encoder(cover, secret)
        if self.encoder.return_residual:
            return cover + enc_out, enc_out
        else:
            return enc_out, enc_out - cover

    def shared_step(self, batch):
        x, s = self.get_input(batch)
        stego, residual = self(x, s)
        if hasattr(self, "noise") and self.noise.is_activated():
            stego_noised = self.noise(stego, self.global_step, p=0.9)
        else:
            stego_noised = self.crop(stego)
        stego_noised = torch.clamp(stego_noised, -1, 1)
        spred = self.decoder(stego_noised)

        loss, loss_dict = self.loss_layer(x, stego, None, s, spred, self.global_step)
        bit_acc = loss_dict["bit_acc"]

        bit_acc_ = bit_acc.item()

        if (bit_acc_ > 0.98) and (not self.fixed_input) and self.noise.is_activated():
            self.loss_layer.activate_ramp(self.global_step)

        if (bit_acc_ > 0.95) and (not self.fixed_input):  # ramp up image loss at late training stage
            if hasattr(self, 'noise') and (not self.noise.is_activated()):
                self.noise.activate(self.global_step)

        if (bit_acc_ > 0.9) and self.fixed_input:  # execute only once
            print(f'[TRAINING] High bit acc ({bit_acc_}) achieved, switch to full image dataset training.')
            self.fixed_input = ~self.fixed_input
        return loss, loss_dict

    def training_step(self, batch, batch_idx):
        loss, loss_dict = self.shared_step(batch)
        loss_dict = {f"train/{key}": val for key, val in loss_dict.items()}
        self.log_dict(loss_dict, prog_bar=True,
                      logger=True, on_step=True, on_epoch=True)

        self.log("global_step", self.global_step,
                 prog_bar=True, logger=True, on_step=True, on_epoch=False)
        # if self.use_scheduler:
        #     lr = self.optimizers().param_groups[0]['lr']
        #     self.log('lr_abs', lr, prog_bar=True, logger=True, on_step=True, on_epoch=False)

        return loss

    @torch.no_grad()
    def validation_step(self, batch, batch_idx):
        _, loss_dict_no_ema = self.shared_step(batch)
        loss_dict_no_ema = {f"val/{key}": val for key, val in loss_dict_no_ema.items() if key != 'img_lw'}
        with self.ema_scope():
            _, loss_dict_ema = self.shared_step(batch)
            loss_dict_ema = {'val/' + key + '_ema': loss_dict_ema[key] for key in loss_dict_ema}
        self.log_dict(loss_dict_no_ema, prog_bar=False, logger=True, on_step=False, on_epoch=True)
        self.log_dict(loss_dict_ema, prog_bar=False, logger=True, on_step=False, on_epoch=True)

    @torch.no_grad()
    def log_images(self, batch, fixed_input=False, **kwargs):
        log = dict()
        if fixed_input and self.fixed_img is not None:
            x, s = self.fixed_img, self.fixed_secret
        else:
            x, s = self.get_input(batch)
        stego, residual = self(x, s)
        if hasattr(self, 'noise') and self.noise.is_activated():
            img_noise = self.noise(stego, self.global_step, p=1.0)
            log['noised'] = img_noise
        log['input'] = x
        log['stego'] = stego
        log['residual'] = (residual - residual.min()) / (residual.max() - residual.min() + 1e-8)*2 - 1
        return log

    def configure_optimizers(self):
        lr = self.learning_rate
        params = list(self.encoder.parameters()) + list(self.decoder.parameters())
        optimizer = torch.optim.AdamW(params, lr=lr)
        return optimizer

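shared_step above gates three training-phase switches on the decoded bit accuracy: above 0.9 on the fixed warm-up image it leaves the warm-up and trains on the full dataset, above 0.95 it activates the noise model, and above 0.98 it ramps up the image loss. The condensed, stand-alone sketch below restates those thresholds; the function and variable names are ours, for illustration only:

def curriculum_flags(bit_acc: float, fixed_input: bool, noise_active: bool):
    """Condensed view of the bit-accuracy thresholds used in FlAE.shared_step (illustrative)."""
    leave_warmup = bit_acc > 0.9 and fixed_input            # switch from fixed image to full dataset
    activate_noise = bit_acc > 0.95 and not fixed_input and not noise_active
    ramp_image_loss = bit_acc > 0.98 and not fixed_input and noise_active
    return leave_warmup, activate_noise, ramp_image_loss

print(curriculum_flags(0.93, True, False))   # (True, False, False)
print(curriculum_flags(0.96, False, False))  # (False, True, False)
print(curriculum_flags(0.99, False, True))   # (False, False, True)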
class SecretEncoder(nn.Module):
    def __init__(self, resolution=256, secret_len=100, return_residual=False, act='tanh') -> None:
        super().__init__()
        self.secret_len = secret_len
        self.return_residual = return_residual
        self.act_fn = lambda x: torch.tanh(x) if act == 'tanh' else thf.sigmoid(x) * 2.0 - 1.0
        self.secret_dense = nn.Linear(secret_len, 16*16*3)
        log_resolution = int(math.log(resolution, 2))
        assert resolution == 2 ** log_resolution, f"Image resolution must be a power of 2, got {resolution}."
        self.secret_upsample = nn.Upsample(scale_factor=(2**(log_resolution-4), 2**(log_resolution-4)))
        self.conv1 = nn.Conv2d(2 * 3, 32, 3, 1, 1)
        self.conv2 = nn.Conv2d(32, 32, 3, 2, 1)
        self.conv3 = nn.Conv2d(32, 64, 3, 2, 1)
        self.conv4 = nn.Conv2d(64, 128, 3, 2, 1)
        self.conv5 = nn.Conv2d(128, 256, 3, 2, 1)
        self.pad6 = nn.ZeroPad2d((0, 1, 0, 1))
        self.up6 = nn.Conv2d(256, 128, 2, 1)
        self.upsample6 = nn.Upsample(scale_factor=(2, 2))
        self.conv6 = nn.Conv2d(128 + 128, 128, 3, 1, 1)
        self.pad7 = nn.ZeroPad2d((0, 1, 0, 1))
        self.up7 = nn.Conv2d(128, 64, 2, 1)
        self.upsample7 = nn.Upsample(scale_factor=(2, 2))
        self.conv7 = nn.Conv2d(64 + 64, 64, 3, 1, 1)
        self.pad8 = nn.ZeroPad2d((0, 1, 0, 1))
        self.up8 = nn.Conv2d(64, 32, 2, 1)
        self.upsample8 = nn.Upsample(scale_factor=(2, 2))
        self.conv8 = nn.Conv2d(32 + 32, 32, 3, 1, 1)
        self.pad9 = nn.ZeroPad2d((0, 1, 0, 1))
        self.up9 = nn.Conv2d(32, 32, 2, 1)
        self.upsample9 = nn.Upsample(scale_factor=(2, 2))
        self.conv9 = nn.Conv2d(32 + 32 + 2 * 3, 32, 3, 1, 1)
        self.conv10 = nn.Conv2d(32, 32, 3, 1, 1)
        self.residual = nn.Conv2d(32, 3, 1)

    def forward(self, image, secret):
        fingerprint = thf.relu(self.secret_dense(secret))
        fingerprint = fingerprint.view((-1, 3, 16, 16))
        fingerprint_enlarged = self.secret_upsample(fingerprint)
        # try:
        inputs = torch.cat([fingerprint_enlarged, image], dim=1)
        # except:
        #     print(fingerprint_enlarged.shape, image.shape, fingerprint.shape)
        #     import pdb; pdb.set_trace()
        conv1 = thf.relu(self.conv1(inputs))
        conv2 = thf.relu(self.conv2(conv1))
        conv3 = thf.relu(self.conv3(conv2))
        conv4 = thf.relu(self.conv4(conv3))
        conv5 = thf.relu(self.conv5(conv4))
        up6 = thf.relu(self.up6(self.pad6(self.upsample6(conv5))))
        merge6 = torch.cat([conv4, up6], dim=1)
        conv6 = thf.relu(self.conv6(merge6))
        up7 = thf.relu(self.up7(self.pad7(self.upsample7(conv6))))
        merge7 = torch.cat([conv3, up7], dim=1)
        conv7 = thf.relu(self.conv7(merge7))
        up8 = thf.relu(self.up8(self.pad8(self.upsample8(conv7))))
        merge8 = torch.cat([conv2, up8], dim=1)
        conv8 = thf.relu(self.conv8(merge8))
        up9 = thf.relu(self.up9(self.pad9(self.upsample9(conv8))))
        merge9 = torch.cat([conv1, up9, inputs], dim=1)
        conv9 = thf.relu(self.conv9(merge9))
        conv10 = thf.relu(self.conv10(conv9))
        residual = self.residual(conv10)
        residual = self.act_fn(residual)
        return residual


class SecretEncoder1(nn.Module):
    def __init__(self, resolution=256, secret_len=100) -> None:
        pass

class SecretDecoder(nn.Module):
    def __init__(self, arch='resnet18', resolution=224, secret_len=100):
        super().__init__()
        self.resolution = resolution
        self.arch = arch
        if arch == 'resnet18':
            self.decoder = torchvision.models.resnet18(pretrained=True, progress=False)
            self.decoder.fc = nn.Linear(self.decoder.fc.in_features, secret_len)
        elif arch == 'resnet50':
            self.decoder = torchvision.models.resnet50(pretrained=True, progress=False)
            self.decoder.fc = nn.Linear(self.decoder.fc.in_features, secret_len)
        elif arch == 'simple':
            self.decoder = SimpleCNN(resolution, secret_len)
        else:
            raise ValueError('Unknown architecture')

    def forward(self, image):
        if self.arch in ['resnet50', 'resnet18'] and image.shape[-1] > self.resolution:
            image = thf.interpolate(image, size=(self.resolution, self.resolution), mode='bilinear', align_corners=False)
        x = self.decoder(image)
        return x


class SimpleCNN(nn.Module):
    def __init__(self, resolution=224, secret_len=100):
        super().__init__()
        self.resolution = resolution
        self.IMAGE_CHANNELS = 3
        self.decoder = nn.Sequential(
            nn.Conv2d(self.IMAGE_CHANNELS, 32, (3, 3), 2, 1),  # resolution / 2
            nn.ReLU(),
            nn.Conv2d(32, 32, 3, 1, 1),
            nn.ReLU(),
            nn.Conv2d(32, 64, 3, 2, 1),  # resolution / 4
            nn.ReLU(),
            nn.Conv2d(64, 64, 3, 1, 1),
            nn.ReLU(),
            nn.Conv2d(64, 64, 3, 2, 1),  # resolution / 8
            nn.ReLU(),
            nn.Conv2d(64, 128, 3, 2, 1),  # resolution / 16
            nn.ReLU(),
            nn.Conv2d(128, 128, (3, 3), 2, 1),  # resolution / 32
            nn.ReLU(),
        )
        self.dense = nn.Sequential(
            nn.Linear(resolution * resolution * 128 // 32 // 32, 512),
            nn.ReLU(),
            nn.Linear(512, secret_len),
        )

    def forward(self, image):
        x = self.decoder(image)
        x = x.view(-1, self.resolution * self.resolution * 128 // 32 // 32)
        return self.dense(x)
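A quick stand-alone shape check of the encoder/decoder pair defined above; this is a sketch based on the constructor defaults (random weights, illustrative sizes), not part of the repository:

import torch

enc = SecretEncoder(resolution=256, secret_len=100, return_residual=True)
dec = SecretDecoder(arch='simple', resolution=224, secret_len=100)

cover = torch.rand(2, 3, 256, 256) * 2 - 1        # cover images in [-1, 1]
secret = torch.randint(0, 2, (2, 100)).float()    # 100-bit secrets

residual = enc(cover, secret)                     # (2, 3, 256, 256) residual in [-1, 1]
stego = cover + residual                          # as in FlAE.forward when return_residual=True
logits = dec(torch.nn.functional.interpolate(stego, size=224))  # (2, 100) secret logits
print(residual.shape, logits.shape)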
flae/munit.py
ADDED
@@ -0,0 +1,576 @@
"""
Copyright (C) 2018 NVIDIA Corporation. All rights reserved.
Licensed under the CC BY-NC-SA 4.0 license (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode).
"""
from torch import nn
from torch.autograd import Variable
import torch
import torch.nn.functional as F
try:
    from itertools import izip as zip
except ImportError:  # will be 3.x series
    pass

##################################################################################
# Discriminator
##################################################################################

class MsImageDis(nn.Module):
    # Multi-scale discriminator architecture
    def __init__(self, input_dim, params):
        super(MsImageDis, self).__init__()
        self.n_layer = params['n_layer']
        self.gan_type = params['gan_type']
        self.dim = params['dim']
        self.norm = params['norm']
        self.activ = params['activ']
        self.num_scales = params['num_scales']
        self.pad_type = params['pad_type']
        self.input_dim = input_dim
        self.downsample = nn.AvgPool2d(3, stride=2, padding=[1, 1], count_include_pad=False)
        self.cnns = nn.ModuleList()
        for _ in range(self.num_scales):
            self.cnns.append(self._make_net())

    def _make_net(self):
        dim = self.dim
        cnn_x = []
        cnn_x += [Conv2dBlock(self.input_dim, dim, 4, 2, 1, norm='none', activation=self.activ, pad_type=self.pad_type)]
        for i in range(self.n_layer - 1):
            cnn_x += [Conv2dBlock(dim, dim * 2, 4, 2, 1, norm=self.norm, activation=self.activ, pad_type=self.pad_type)]
            dim *= 2
        cnn_x += [nn.Conv2d(dim, 1, 1, 1, 0)]
        cnn_x = nn.Sequential(*cnn_x)
        return cnn_x

    def forward(self, x):
        outputs = []
        for model in self.cnns:
            outputs.append(model(x))
            x = self.downsample(x)
        return outputs

    def calc_dis_loss(self, input_fake, input_real):
        # calculate the loss to train D
        outs0 = self.forward(input_fake)
        outs1 = self.forward(input_real)
        loss = 0

        for it, (out0, out1) in enumerate(zip(outs0, outs1)):
            if self.gan_type == 'lsgan':
                loss += torch.mean((out0 - 0)**2) + torch.mean((out1 - 1)**2)
            elif self.gan_type == 'nsgan':
                all0 = Variable(torch.zeros_like(out0.data).cuda(), requires_grad=False)
                all1 = Variable(torch.ones_like(out1.data).cuda(), requires_grad=False)
                loss += torch.mean(F.binary_cross_entropy(F.sigmoid(out0), all0) +
                                   F.binary_cross_entropy(F.sigmoid(out1), all1))
            else:
                assert 0, "Unsupported GAN type: {}".format(self.gan_type)
        return loss

    def calc_gen_loss(self, input_fake):
        # calculate the loss to train G
        outs0 = self.forward(input_fake)
        loss = 0
        for it, (out0) in enumerate(outs0):
            if self.gan_type == 'lsgan':
                loss += torch.mean((out0 - 1)**2)  # LSGAN
            elif self.gan_type == 'nsgan':
                all1 = Variable(torch.ones_like(out0.data).cuda(), requires_grad=False)
                loss += torch.mean(F.binary_cross_entropy(F.sigmoid(out0), all1))
            else:
                assert 0, "Unsupported GAN type: {}".format(self.gan_type)
        return loss

##################################################################################
# Generator
##################################################################################

class AdaINGen(nn.Module):
    # AdaIN auto-encoder architecture
    def __init__(self, input_dim, params):
        super(AdaINGen, self).__init__()
        dim = params['dim']
        style_dim = params['style_dim']
        n_downsample = params['n_downsample']
        n_res = params['n_res']
        activ = params['activ']
        pad_type = params['pad_type']
        mlp_dim = params['mlp_dim']

        # style encoder
        self.enc_style = StyleEncoder(4, input_dim, dim, style_dim, norm='none', activ=activ, pad_type=pad_type)

        # content encoder
        self.enc_content = ContentEncoder(n_downsample, n_res, input_dim, dim, 'in', activ, pad_type=pad_type)
        self.dec = Decoder(n_downsample, n_res, self.enc_content.output_dim, input_dim, res_norm='adain', activ=activ, pad_type=pad_type)

        # MLP to generate AdaIN parameters
        self.mlp = MLP(style_dim, self.get_num_adain_params(self.dec), mlp_dim, 3, norm='none', activ=activ)

    def forward(self, images):
        # reconstruct an image
        content, style_fake = self.encode(images)
        images_recon = self.decode(content, style_fake)
        return images_recon

    def encode(self, images):
        # encode an image to its content and style codes
        style_fake = self.enc_style(images)
        content = self.enc_content(images)
        return content, style_fake

    def decode(self, content, style):
        # decode content and style codes to an image
        adain_params = self.mlp(style)
        self.assign_adain_params(adain_params, self.dec)
        images = self.dec(content)
        return images

    def assign_adain_params(self, adain_params, model):
        # assign the adain_params to the AdaIN layers in model
        for m in model.modules():
            if m.__class__.__name__ == "AdaptiveInstanceNorm2d":
                mean = adain_params[:, :m.num_features]
                std = adain_params[:, m.num_features:2*m.num_features]
                m.bias = mean.contiguous().view(-1)
                m.weight = std.contiguous().view(-1)
                if adain_params.size(1) > 2*m.num_features:
                    adain_params = adain_params[:, 2*m.num_features:]

    def get_num_adain_params(self, model):
        # return the number of AdaIN parameters needed by the model
        num_adain_params = 0
        for m in model.modules():
            if m.__class__.__name__ == "AdaptiveInstanceNorm2d":
                num_adain_params += 2*m.num_features
        return num_adain_params


class VAEGen(nn.Module):
    # VAE architecture
    def __init__(self, input_dim, params):
        super(VAEGen, self).__init__()
        dim = params['dim']
        n_downsample = params['n_downsample']
        n_res = params['n_res']
        activ = params['activ']
        pad_type = params['pad_type']

        # content encoder
        self.enc = ContentEncoder(n_downsample, n_res, input_dim, dim, 'in', activ, pad_type=pad_type)
        self.dec = Decoder(n_downsample, n_res, self.enc.output_dim, input_dim, res_norm='in', activ=activ, pad_type=pad_type)

    def forward(self, images):
        # This is a reduced VAE implementation where we assume the outputs are multivariate Gaussian distribution with mean = hiddens and std_dev = all ones.
        hiddens = self.encode(images)
        if self.training == True:
            noise = Variable(torch.randn(hiddens.size()).cuda(hiddens.data.get_device()))
            images_recon = self.decode(hiddens + noise)
        else:
            images_recon = self.decode(hiddens)
        return images_recon, hiddens

    def encode(self, images):
        hiddens = self.enc(images)
        noise = Variable(torch.randn(hiddens.size()).cuda(hiddens.data.get_device()))
        return hiddens, noise

    def decode(self, hiddens):
        images = self.dec(hiddens)
        return images


##################################################################################
# Encoder and Decoders
##################################################################################

class StyleEncoder(nn.Module):
    def __init__(self, n_downsample, input_dim, dim, style_dim, norm, activ, pad_type):
        super(StyleEncoder, self).__init__()
        self.model = []
        self.model += [Conv2dBlock(input_dim, dim, 7, 1, 3, norm=norm, activation=activ, pad_type=pad_type)]
        for i in range(2):
            self.model += [Conv2dBlock(dim, 2 * dim, 4, 2, 1, norm=norm, activation=activ, pad_type=pad_type)]
            dim *= 2
        for i in range(n_downsample - 2):
            self.model += [Conv2dBlock(dim, dim, 4, 2, 1, norm=norm, activation=activ, pad_type=pad_type)]
        self.model += [nn.AdaptiveAvgPool2d(1)]  # global average pooling
        self.model += [nn.Conv2d(dim, style_dim, 1, 1, 0)]
        self.model = nn.Sequential(*self.model)
        self.output_dim = dim

    def forward(self, x):
        return self.model(x)

class ContentEncoder(nn.Module):
    def __init__(self, n_downsample, n_res, input_dim, dim, norm, activ, pad_type):
        super(ContentEncoder, self).__init__()
        self.model = []
        self.model += [Conv2dBlock(input_dim, dim, 7, 1, 3, norm=norm, activation=activ, pad_type=pad_type)]
        # downsampling blocks
        for i in range(n_downsample):
            self.model += [Conv2dBlock(dim, 2 * dim, 4, 2, 1, norm=norm, activation=activ, pad_type=pad_type)]
            dim *= 2
        # residual blocks
        self.model += [ResBlocks(n_res, dim, norm=norm, activation=activ, pad_type=pad_type)]
        self.model = nn.Sequential(*self.model)
        self.output_dim = dim

    def forward(self, x):
        return self.model(x)

class Decoder(nn.Module):
    def __init__(self, n_upsample, n_res, dim, output_dim, res_norm='adain', activ='relu', pad_type='zero'):
        super(Decoder, self).__init__()

        self.model = []
        # AdaIN residual blocks
        self.model += [ResBlocks(n_res, dim, res_norm, activ, pad_type=pad_type)]
        # upsampling blocks
        for i in range(n_upsample):
            self.model += [nn.Upsample(scale_factor=2),
                           Conv2dBlock(dim, dim // 2, 5, 1, 2, norm='ln', activation=activ, pad_type=pad_type)]
            dim //= 2
        # use reflection padding in the last conv layer
        self.model += [Conv2dBlock(dim, output_dim, 7, 1, 3, norm='none', activation='tanh', pad_type=pad_type)]
        self.model = nn.Sequential(*self.model)

    def forward(self, x):
        return self.model(x)

##################################################################################
# Sequential Models
##################################################################################
class ResBlocks(nn.Module):
    def __init__(self, num_blocks, dim, norm='in', activation='relu', pad_type='zero'):
        super(ResBlocks, self).__init__()
        self.model = []
        for i in range(num_blocks):
            self.model += [ResBlock(dim, norm=norm, activation=activation, pad_type=pad_type)]
        self.model = nn.Sequential(*self.model)

    def forward(self, x):
        return self.model(x)

class MLP(nn.Module):
    def __init__(self, input_dim, output_dim, dim, n_blk, norm='none', activ='relu'):

        super(MLP, self).__init__()
        self.model = []
        self.model += [LinearBlock(input_dim, dim, norm=norm, activation=activ)]
        for i in range(n_blk - 2):
            self.model += [LinearBlock(dim, dim, norm=norm, activation=activ)]
        self.model += [LinearBlock(dim, output_dim, norm='none', activation='none')]  # no output activations
        self.model = nn.Sequential(*self.model)

    def forward(self, x):
        return self.model(x.view(x.size(0), -1))

##################################################################################
# Basic Blocks
##################################################################################
class ResBlock(nn.Module):
    def __init__(self, dim, norm='in', activation='relu', pad_type='zero'):
        super(ResBlock, self).__init__()

        model = []
        model += [Conv2dBlock(dim, dim, 3, 1, 1, norm=norm, activation=activation, pad_type=pad_type)]
        model += [Conv2dBlock(dim, dim, 3, 1, 1, norm=norm, activation='none', pad_type=pad_type)]
        self.model = nn.Sequential(*model)

    def forward(self, x):
        residual = x
        out = self.model(x)
        out += residual
        return out

class Conv2dBlock(nn.Module):
    def __init__(self, input_dim, output_dim, kernel_size, stride,
                 padding=0, norm='none', activation='relu', pad_type='zero'):
        super(Conv2dBlock, self).__init__()
        self.use_bias = True
        # initialize padding
        if pad_type == 'reflect':
            self.pad = nn.ReflectionPad2d(padding)
        elif pad_type == 'replicate':
            self.pad = nn.ReplicationPad2d(padding)
        elif pad_type == 'zero':
            self.pad = nn.ZeroPad2d(padding)
        else:
            assert 0, "Unsupported padding type: {}".format(pad_type)

        # initialize normalization
        norm_dim = output_dim
        if norm == 'bn':
            self.norm = nn.BatchNorm2d(norm_dim)
        elif norm == 'in':
            # self.norm = nn.InstanceNorm2d(norm_dim, track_running_stats=True)
            self.norm = nn.InstanceNorm2d(norm_dim)
        elif norm == 'ln':
            self.norm = LayerNorm(norm_dim)
        elif norm == 'adain':
            self.norm = AdaptiveInstanceNorm2d(norm_dim)
        elif norm == 'none' or norm == 'sn':
            self.norm = None
        else:
            assert 0, "Unsupported normalization: {}".format(norm)

        # initialize activation
        if activation == 'relu':
            self.activation = nn.ReLU(inplace=True)
        elif activation == 'lrelu':
            self.activation = nn.LeakyReLU(0.2, inplace=True)
        elif activation == 'prelu':
            self.activation = nn.PReLU()
        elif activation == 'selu':
            self.activation = nn.SELU(inplace=True)
        elif activation == 'tanh':
            self.activation = nn.Tanh()
        elif activation == 'none':
            self.activation = None
        else:
            assert 0, "Unsupported activation: {}".format(activation)

        # initialize convolution
        if norm == 'sn':
            self.conv = SpectralNorm(nn.Conv2d(input_dim, output_dim, kernel_size, stride, bias=self.use_bias))
        else:
            self.conv = nn.Conv2d(input_dim, output_dim, kernel_size, stride, bias=self.use_bias)

    def forward(self, x):
        x = self.conv(self.pad(x))
        if self.norm:
            x = self.norm(x)
        if self.activation:
            x = self.activation(x)
        return x

class LinearBlock(nn.Module):
    def __init__(self, input_dim, output_dim, norm='none', activation='relu'):
        super(LinearBlock, self).__init__()
        use_bias = True
        # initialize fully connected layer
        if norm == 'sn':
            self.fc = SpectralNorm(nn.Linear(input_dim, output_dim, bias=use_bias))
        else:
            self.fc = nn.Linear(input_dim, output_dim, bias=use_bias)

        # initialize normalization
        norm_dim = output_dim
        if norm == 'bn':
            self.norm = nn.BatchNorm1d(norm_dim)
        elif norm == 'in':
            self.norm = nn.InstanceNorm1d(norm_dim)
        elif norm == 'ln':
            self.norm = LayerNorm(norm_dim)
        elif norm == 'none' or norm == 'sn':
            self.norm = None
        else:
            assert 0, "Unsupported normalization: {}".format(norm)

        # initialize activation
        if activation == 'relu':
            self.activation = nn.ReLU(inplace=True)
        elif activation == 'lrelu':
            self.activation = nn.LeakyReLU(0.2, inplace=True)
        elif activation == 'prelu':
            self.activation = nn.PReLU()
        elif activation == 'selu':
            self.activation = nn.SELU(inplace=True)
        elif activation == 'tanh':
            self.activation = nn.Tanh()
        elif activation == 'none':
            self.activation = None
        else:
            assert 0, "Unsupported activation: {}".format(activation)

    def forward(self, x):
        out = self.fc(x)
        if self.norm:
            out = self.norm(out)
        if self.activation:
            out = self.activation(out)
        return out

##################################################################################
# VGG network definition
##################################################################################
class Vgg16(nn.Module):
    def __init__(self):
        super(Vgg16, self).__init__()
        self.conv1_1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1)
        self.conv1_2 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)

        self.conv2_1 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.conv2_2 = nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1)

        self.conv3_1 = nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1)
        self.conv3_2 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
        self.conv3_3 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)

        self.conv4_1 = nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1)
        self.conv4_2 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1)
        self.conv4_3 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1)

        self.conv5_1 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1)
        self.conv5_2 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1)
        self.conv5_3 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1)

    def forward(self, X):
        h = F.relu(self.conv1_1(X), inplace=True)
        h = F.relu(self.conv1_2(h), inplace=True)
        # relu1_2 = h
        h = F.max_pool2d(h, kernel_size=2, stride=2)

        h = F.relu(self.conv2_1(h), inplace=True)
        h = F.relu(self.conv2_2(h), inplace=True)
        # relu2_2 = h
        h = F.max_pool2d(h, kernel_size=2, stride=2)

        h = F.relu(self.conv3_1(h), inplace=True)
        h = F.relu(self.conv3_2(h), inplace=True)
        h = F.relu(self.conv3_3(h), inplace=True)
        # relu3_3 = h
        h = F.max_pool2d(h, kernel_size=2, stride=2)

        h = F.relu(self.conv4_1(h), inplace=True)
        h = F.relu(self.conv4_2(h), inplace=True)
        h = F.relu(self.conv4_3(h), inplace=True)
        # relu4_3 = h

        h = F.relu(self.conv5_1(h), inplace=True)
        h = F.relu(self.conv5_2(h), inplace=True)
        h = F.relu(self.conv5_3(h), inplace=True)
        relu5_3 = h

        return relu5_3
        # return [relu1_2, relu2_2, relu3_3, relu4_3]

##################################################################################
# Normalization layers
##################################################################################
class AdaptiveInstanceNorm2d(nn.Module):
    def __init__(self, num_features, eps=1e-5, momentum=0.1):
        super(AdaptiveInstanceNorm2d, self).__init__()
        self.num_features = num_features
        self.eps = eps
        self.momentum = momentum
        # weight and bias are dynamically assigned
        self.weight = None
        self.bias = None
        # just dummy buffers, not used
        self.register_buffer('running_mean', torch.zeros(num_features))
        self.register_buffer('running_var', torch.ones(num_features))

    def forward(self, x):
        assert self.weight is not None and self.bias is not None, "Please assign weight and bias before calling AdaIN!"
        b, c = x.size(0), x.size(1)
        running_mean = self.running_mean.repeat(b)
        running_var = self.running_var.repeat(b)

        # Apply instance norm
        x_reshaped = x.contiguous().view(1, b * c, *x.size()[2:])

        out = F.batch_norm(
            x_reshaped, running_mean, running_var, self.weight, self.bias,
            True, self.momentum, self.eps)

        return out.view(b, c, *x.size()[2:])

    def __repr__(self):
        return self.__class__.__name__ + '(' + str(self.num_features) + ')'


class LayerNorm(nn.Module):
    def __init__(self, num_features, eps=1e-5, affine=True):
        super(LayerNorm, self).__init__()
        self.num_features = num_features
        self.affine = affine
        self.eps = eps

        if self.affine:
            self.gamma = nn.Parameter(torch.Tensor(num_features).uniform_())
            self.beta = nn.Parameter(torch.zeros(num_features))

    def forward(self, x):
        shape = [-1] + [1] * (x.dim() - 1)
        # print(x.size())
        if x.size(0) == 1:
            # These two lines run much faster in pytorch 0.4 than the two lines listed below.
            mean = x.view(-1).mean().view(*shape)
            std = x.view(-1).std().view(*shape)
        else:
            mean = x.view(x.size(0), -1).mean(1).view(*shape)
            std = x.view(x.size(0), -1).std(1).view(*shape)

        x = (x - mean) / (std + self.eps)

        if self.affine:
            shape = [1, -1] + [1] * (x.dim() - 2)
            x = x * self.gamma.view(*shape) + self.beta.view(*shape)
        return x

def l2normalize(v, eps=1e-12):
    return v / (v.norm() + eps)


class SpectralNorm(nn.Module):
    """
    Based on the paper "Spectral Normalization for Generative Adversarial Networks" by Takeru Miyato, Toshiki Kataoka, Masanori Koyama, Yuichi Yoshida
    and the Pytorch implementation https://github.com/christiancosgrove/pytorch-spectral-normalization-gan
    """
    def __init__(self, module, name='weight', power_iterations=1):
        super(SpectralNorm, self).__init__()
        self.module = module
        self.name = name
        self.power_iterations = power_iterations
        if not self._made_params():
            self._make_params()

    def _update_u_v(self):
        u = getattr(self.module, self.name + "_u")
        v = getattr(self.module, self.name + "_v")
        w = getattr(self.module, self.name + "_bar")

        height = w.data.shape[0]
        for _ in range(self.power_iterations):
            v.data = l2normalize(torch.mv(torch.t(w.view(height, -1).data), u.data))
|
539 |
+
u.data = l2normalize(torch.mv(w.view(height,-1).data, v.data))
|
540 |
+
|
541 |
+
# sigma = torch.dot(u.data, torch.mv(w.view(height,-1).data, v.data))
|
542 |
+
sigma = u.dot(w.view(height, -1).mv(v))
|
543 |
+
setattr(self.module, self.name, w / sigma.expand_as(w))
|
544 |
+
|
545 |
+
def _made_params(self):
|
546 |
+
try:
|
547 |
+
u = getattr(self.module, self.name + "_u")
|
548 |
+
v = getattr(self.module, self.name + "_v")
|
549 |
+
w = getattr(self.module, self.name + "_bar")
|
550 |
+
return True
|
551 |
+
except AttributeError:
|
552 |
+
return False
|
553 |
+
|
554 |
+
|
555 |
+
def _make_params(self):
|
556 |
+
w = getattr(self.module, self.name)
|
557 |
+
|
558 |
+
height = w.data.shape[0]
|
559 |
+
width = w.view(height, -1).data.shape[1]
|
560 |
+
|
561 |
+
u = nn.Parameter(w.data.new(height).normal_(0, 1), requires_grad=False)
|
562 |
+
v = nn.Parameter(w.data.new(width).normal_(0, 1), requires_grad=False)
|
563 |
+
u.data = l2normalize(u.data)
|
564 |
+
v.data = l2normalize(v.data)
|
565 |
+
w_bar = nn.Parameter(w.data)
|
566 |
+
|
567 |
+
del self.module._parameters[self.name]
|
568 |
+
|
569 |
+
self.module.register_parameter(self.name + "_u", u)
|
570 |
+
self.module.register_parameter(self.name + "_v", v)
|
571 |
+
self.module.register_parameter(self.name + "_bar", w_bar)
|
572 |
+
|
573 |
+
|
574 |
+
def forward(self, *args):
|
575 |
+
self._update_u_v()
|
576 |
+
return self.module.forward(*args)
|
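A minimal usage sketch for the SpectralNorm wrapper above, assuming the class is importable as flae.munit.SpectralNorm (the layer sizes below are illustrative only). It wraps an existing layer and re-estimates the largest singular value of its weight with one power iteration on every forward call:

import torch
from torch import nn
from flae.munit import SpectralNorm  # assumed import path

# wrap a conv layer; weight_u, weight_v and weight_bar are registered internally
conv = SpectralNorm(nn.Conv2d(64, 128, kernel_size=3, padding=1))

x = torch.randn(2, 64, 32, 32)
y = conv(x)     # _update_u_v() renormalises the weight, then the wrapped forward runs
print(y.shape)  # torch.Size([2, 128, 32, 32])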
flae/unet.py
ADDED
@@ -0,0 +1,123 @@
1 |
+
from torch import nn
|
2 |
+
from torch.autograd import Variable
|
3 |
+
import torch
|
4 |
+
import torch.nn.functional as F
|
5 |
+
from .munit import ResBlocks, Conv2dBlock
|
6 |
+
import math
|
7 |
+
|
8 |
+
|
9 |
+
class Unet(nn.Module):
|
10 |
+
def __init__(self, resolution=256, secret_len=100, return_residual=False) -> None:
|
11 |
+
super().__init__()
|
12 |
+
self.secret_len = secret_len
|
13 |
+
self.return_residual = return_residual
|
14 |
+
self.secret_dense = nn.Linear(secret_len, 16*16*3)
|
15 |
+
log_resolution = int(math.log(resolution, 2))
|
16 |
+
assert resolution == 2 ** log_resolution, f"Image resolution must be a power of 2, got {resolution}."
|
17 |
+
self.secret_upsample = nn.Upsample(scale_factor=(2**(log_resolution-4), 2**(log_resolution-4)))
|
18 |
+
|
19 |
+
self.enc = Encoder(2, 4, 6, 64, 'bn' , 'relu', 'reflect')
|
20 |
+
self.dec = Decoder(2, 4, self.enc.output_dim, 3, 'bn', 'relu', 'reflect')
|
21 |
+
|
22 |
+
def forward(self, image, secret):
|
23 |
+
# import pdb; pdb.set_trace()
|
24 |
+
fingerprint = F.relu(self.secret_dense(secret))
|
25 |
+
fingerprint = fingerprint.view((-1, 3, 16, 16))
|
26 |
+
fingerprint_enlarged = self.secret_upsample(fingerprint)
|
27 |
+
inputs = torch.cat([fingerprint_enlarged, image], dim=1)
|
28 |
+
emb = self.enc(inputs)
|
29 |
+
# import pdb; pdb.set_trace()
|
30 |
+
out = self.dec(emb)
|
31 |
+
return out
|
32 |
+
|
33 |
+
class Encoder(nn.Module):
|
34 |
+
def __init__(self, n_downsample, n_res, input_dim, dim, norm, activ, pad_type):
|
35 |
+
super().__init__()
|
36 |
+
self.model = []
|
37 |
+
self.model += [Conv2dBlock(input_dim, dim, 7, 1, 3, norm=norm, activation=activ, pad_type=pad_type)]
|
38 |
+
# downsampling blocks
|
39 |
+
for i in range(n_downsample):
|
40 |
+
self.model += [Conv2dBlock(dim, 2 * dim, 4, 2, 1, norm=norm, activation=activ, pad_type=pad_type)]
|
41 |
+
dim *= 2
|
42 |
+
# residual blocks
|
43 |
+
self.model += [ResBlocks(n_res, dim, norm=norm, activation=activ, pad_type=pad_type)]
|
44 |
+
# self.model = nn.(*self.model)
|
45 |
+
self.model = nn.ModuleList(self.model)
|
46 |
+
self.output_dim = dim
|
47 |
+
|
48 |
+
def forward(self, x):
|
49 |
+
out = []
|
50 |
+
for block in self.model:
|
51 |
+
x = block(x)
|
52 |
+
out.append(x)
|
53 |
+
# print(x.shape)
|
54 |
+
return out
|
55 |
+
|
56 |
+
|
57 |
+
class Decoder(nn.Module):
|
58 |
+
def __init__(self, n_upsample, n_res, dim, output_dim, res_norm='adain', activ='relu', pad_type='zero'):
|
59 |
+
super(Decoder, self).__init__()
|
60 |
+
|
61 |
+
self.model = []
|
62 |
+
# AdaIN residual blocks
|
63 |
+
self.model += [DecoderBlock('resblock', n_res, dim, res_norm, activ, pad_type=pad_type)]
|
64 |
+
# upsampling blocks
|
65 |
+
for i in range(n_upsample):
|
66 |
+
self.model += [DecoderBlock('upsample', dim, dim//2,'bn', activ, pad_type)
|
67 |
+
]
|
68 |
+
dim //= 2
|
69 |
+
# use reflection padding in the last conv layer
|
70 |
+
self.output_layer = Conv2dBlock(dim, output_dim, 7, 1, 3, norm='none', activation='tanh', pad_type=pad_type)
|
71 |
+
# self.model = nn.Sequential(*self.model)
|
72 |
+
self.model = nn.ModuleList(self.model)
|
73 |
+
|
74 |
+
def forward(self, x):
|
75 |
+
x1 = x.pop()
|
76 |
+
for block in self.model:
|
77 |
+
x2 = x.pop()
|
78 |
+
# print(x1.shape, x2.shape)
|
79 |
+
x1 = block(x1, x2)
|
80 |
+
x1 = self.output_layer(x1)
|
81 |
+
return x1
|
82 |
+
|
83 |
+
|
84 |
+
class Merge(nn.Module):
|
85 |
+
def __init__(self, dim, activation='relu'):
|
86 |
+
super().__init__()
|
87 |
+
self.conv = nn.Conv2d(2*dim, dim, 3, 1, 1)
|
88 |
+
# initialize activation
|
89 |
+
if activation == 'relu':
|
90 |
+
self.activation = nn.ReLU(inplace=True)
|
91 |
+
elif activation == 'lrelu':
|
92 |
+
self.activation = nn.LeakyReLU(0.2, inplace=True)
|
93 |
+
elif activation == 'prelu':
|
94 |
+
self.activation = nn.PReLU()
|
95 |
+
elif activation == 'selu':
|
96 |
+
self.activation = nn.SELU(inplace=True)
|
97 |
+
elif activation == 'tanh':
|
98 |
+
self.activation = nn.Tanh()
|
99 |
+
elif activation == 'none':
|
100 |
+
self.activation = None
|
101 |
+
else:
|
102 |
+
assert 0, "Unsupported activation: {}".format(activation)
|
103 |
+
def forward(self, x1, x2):
|
104 |
+
x = torch.cat([x1, x2], dim=1) # 2xdim
|
105 |
+
x = self.conv(x) # B,dim,H,W
|
106 |
+
x = self.activation(x)
|
107 |
+
return x
|
108 |
+
|
109 |
+
class DecoderBlock(nn.Module):
|
110 |
+
def __init__(self, block_type, in_dim, out_dim, norm, activ='relu', pad_type='reflect'):
|
111 |
+
super().__init__()
|
112 |
+
assert block_type in ['resblock', 'upsample']
|
113 |
+
if block_type == 'resblock':
|
114 |
+
self.core_layer = ResBlocks(in_dim, out_dim, norm, activ, pad_type=pad_type)
|
115 |
+
else:
|
116 |
+
assert out_dim == in_dim//2
|
117 |
+
self.core_layer = nn.Sequential(nn.Upsample(scale_factor=2),
|
118 |
+
Conv2dBlock(in_dim, out_dim, 5, 1, 2, norm=norm, activation=activ, pad_type=pad_type))
|
119 |
+
self.merge = Merge(out_dim, activ)
|
120 |
+
|
121 |
+
def forward(self, x1, x2):
|
122 |
+
x1 = self.core_layer(x1)
|
123 |
+
return self.merge(x1, x2)
|
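A forward-pass sketch for the Unet defined above, assuming the flae package is on the import path and that ResBlocks/Conv2dBlock from munit accept the arguments used here; the 256x256 resolution, 100-bit secret and batch size are illustrative:

import torch
from flae.unet import Unet  # assumed import path

model = Unet(resolution=256, secret_len=100)
image = torch.randn(1, 3, 256, 256)             # cover image batch
secret = torch.randint(0, 2, (1, 100)).float()  # 100 secret bits

# the secret is projected to a 3x16x16 map, upsampled to 256x256 and concatenated with the image
out = model(image, secret)
print(out.shape)  # torch.Size([1, 3, 256, 256])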
ldm/modules/ema.py
ADDED
@@ -0,0 +1,80 @@
1 |
+
import torch
|
2 |
+
from torch import nn
|
3 |
+
|
4 |
+
|
5 |
+
class LitEma(nn.Module):
|
6 |
+
def __init__(self, model, decay=0.9999, use_num_upates=True):
|
7 |
+
super().__init__()
|
8 |
+
if decay < 0.0 or decay > 1.0:
|
9 |
+
raise ValueError('Decay must be between 0 and 1')
|
10 |
+
|
11 |
+
self.m_name2s_name = {}
|
12 |
+
self.register_buffer('decay', torch.tensor(decay, dtype=torch.float32))
|
13 |
+
self.register_buffer('num_updates', torch.tensor(0, dtype=torch.int) if use_num_upates
|
14 |
+
else torch.tensor(-1, dtype=torch.int))
|
15 |
+
|
16 |
+
for name, p in model.named_parameters():
|
17 |
+
if p.requires_grad:
|
18 |
+
# remove as '.'-character is not allowed in buffers
|
19 |
+
s_name = name.replace('.', '')
|
20 |
+
self.m_name2s_name.update({name: s_name})
|
21 |
+
self.register_buffer(s_name, p.clone().detach().data)
|
22 |
+
|
23 |
+
self.collected_params = []
|
24 |
+
|
25 |
+
def reset_num_updates(self):
|
26 |
+
del self.num_updates
|
27 |
+
self.register_buffer('num_updates', torch.tensor(0, dtype=torch.int))
|
28 |
+
|
29 |
+
def forward(self, model):
|
30 |
+
decay = self.decay
|
31 |
+
|
32 |
+
if self.num_updates >= 0:
|
33 |
+
self.num_updates += 1
|
34 |
+
decay = min(self.decay, (1 + self.num_updates) / (10 + self.num_updates))
|
35 |
+
|
36 |
+
one_minus_decay = 1.0 - decay
|
37 |
+
|
38 |
+
with torch.no_grad():
|
39 |
+
m_param = dict(model.named_parameters())
|
40 |
+
shadow_params = dict(self.named_buffers())
|
41 |
+
|
42 |
+
for key in m_param:
|
43 |
+
if m_param[key].requires_grad:
|
44 |
+
sname = self.m_name2s_name[key]
|
45 |
+
shadow_params[sname] = shadow_params[sname].type_as(m_param[key])
|
46 |
+
shadow_params[sname].sub_(one_minus_decay * (shadow_params[sname] - m_param[key]))
|
47 |
+
else:
|
48 |
+
assert not key in self.m_name2s_name
|
49 |
+
|
50 |
+
def copy_to(self, model):
|
51 |
+
m_param = dict(model.named_parameters())
|
52 |
+
shadow_params = dict(self.named_buffers())
|
53 |
+
for key in m_param:
|
54 |
+
if m_param[key].requires_grad:
|
55 |
+
m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]].data)
|
56 |
+
else:
|
57 |
+
assert not key in self.m_name2s_name
|
58 |
+
|
59 |
+
def store(self, parameters):
|
60 |
+
"""
|
61 |
+
Save the current parameters for restoring later.
|
62 |
+
Args:
|
63 |
+
parameters: Iterable of `torch.nn.Parameter`; the parameters to be
|
64 |
+
temporarily stored.
|
65 |
+
"""
|
66 |
+
self.collected_params = [param.clone() for param in parameters]
|
67 |
+
|
68 |
+
def restore(self, parameters):
|
69 |
+
"""
|
70 |
+
Restore the parameters stored with the `store` method.
|
71 |
+
Useful to validate the model with EMA parameters without affecting the
|
72 |
+
original optimization process. Store the parameters before the
|
73 |
+
`copy_to` method. After validation (or model saving), use this to
|
74 |
+
restore the former parameters.
|
75 |
+
Args:
|
76 |
+
parameters: Iterable of `torch.nn.Parameter`; the parameters to be
|
77 |
+
updated with the stored parameters.
|
78 |
+
"""
|
79 |
+
for c_param, param in zip(self.collected_params, parameters):
|
80 |
+
param.data.copy_(c_param.data)
|
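A usage sketch for LitEma above; the tiny model and the training-loop placement are illustrative, not the repository's actual trainer. The shadow weights are updated after each optimiser step, and store/copy_to/restore swap them in temporarily for validation:

import torch
from torch import nn
from ldm.modules.ema import LitEma

model = nn.Linear(10, 2)
ema = LitEma(model, decay=0.9999)

# after every optimiser step:
ema(model)                        # update the shadow (EMA) copies of the parameters

# validate with EMA weights without losing the training weights:
ema.store(model.parameters())     # stash the current weights
ema.copy_to(model)                # load the EMA weights into the model
# ... run validation ...
ema.restore(model.parameters())   # put the training weights back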
ldm/util.py
ADDED
@@ -0,0 +1,197 @@
1 |
+
import importlib
|
2 |
+
|
3 |
+
import torch
|
4 |
+
from torch import optim
|
5 |
+
import numpy as np
|
6 |
+
|
7 |
+
from inspect import isfunction
|
8 |
+
from PIL import Image, ImageDraw, ImageFont
|
9 |
+
|
10 |
+
|
11 |
+
def log_txt_as_img(wh, xc, size=10):
|
12 |
+
# wh a tuple of (width, height)
|
13 |
+
# xc a list of captions to plot
|
14 |
+
b = len(xc)
|
15 |
+
txts = list()
|
16 |
+
for bi in range(b):
|
17 |
+
txt = Image.new("RGB", wh, color="white")
|
18 |
+
draw = ImageDraw.Draw(txt)
|
19 |
+
font = ImageFont.truetype('data/DejaVuSans.ttf', size=size)
|
20 |
+
nc = int(40 * (wh[0] / 256))
|
21 |
+
lines = "\n".join(xc[bi][start:start + nc] for start in range(0, len(xc[bi]), nc))
|
22 |
+
|
23 |
+
try:
|
24 |
+
draw.text((0, 0), lines, fill="black", font=font)
|
25 |
+
except UnicodeEncodeError:
|
26 |
+
print("Cant encode string for logging. Skipping.")
|
27 |
+
|
28 |
+
txt = np.array(txt).transpose(2, 0, 1) / 127.5 - 1.0
|
29 |
+
txts.append(txt)
|
30 |
+
txts = np.stack(txts)
|
31 |
+
txts = torch.tensor(txts)
|
32 |
+
return txts
|
33 |
+
|
34 |
+
|
35 |
+
def ismap(x):
|
36 |
+
if not isinstance(x, torch.Tensor):
|
37 |
+
return False
|
38 |
+
return (len(x.shape) == 4) and (x.shape[1] > 3)
|
39 |
+
|
40 |
+
|
41 |
+
def isimage(x):
|
42 |
+
if not isinstance(x,torch.Tensor):
|
43 |
+
return False
|
44 |
+
return (len(x.shape) == 4) and (x.shape[1] == 3 or x.shape[1] == 1)
|
45 |
+
|
46 |
+
|
47 |
+
def exists(x):
|
48 |
+
return x is not None
|
49 |
+
|
50 |
+
|
51 |
+
def default(val, d):
|
52 |
+
if exists(val):
|
53 |
+
return val
|
54 |
+
return d() if isfunction(d) else d
|
55 |
+
|
56 |
+
|
57 |
+
def mean_flat(tensor):
|
58 |
+
"""
|
59 |
+
https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/nn.py#L86
|
60 |
+
Take the mean over all non-batch dimensions.
|
61 |
+
"""
|
62 |
+
return tensor.mean(dim=list(range(1, len(tensor.shape))))
|
63 |
+
|
64 |
+
|
65 |
+
def count_params(model, verbose=False):
|
66 |
+
total_params = sum(p.numel() for p in model.parameters())
|
67 |
+
if verbose:
|
68 |
+
print(f"{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.")
|
69 |
+
return total_params
|
70 |
+
|
71 |
+
|
72 |
+
def instantiate_from_config(config):
|
73 |
+
if not "target" in config:
|
74 |
+
if config == '__is_first_stage__':
|
75 |
+
return None
|
76 |
+
elif config == "__is_unconditional__":
|
77 |
+
return None
|
78 |
+
raise KeyError("Expected key `target` to instantiate.")
|
79 |
+
return get_obj_from_str(config["target"])(**config.get("params", dict()))
|
80 |
+
|
81 |
+
|
82 |
+
def get_obj_from_str(string, reload=False):
|
83 |
+
module, cls = string.rsplit(".", 1)
|
84 |
+
if reload:
|
85 |
+
module_imp = importlib.import_module(module)
|
86 |
+
importlib.reload(module_imp)
|
87 |
+
return getattr(importlib.import_module(module, package=None), cls)
|
88 |
+
|
89 |
+
|
90 |
+
class AdamWwithEMAandWings(optim.Optimizer):
|
91 |
+
# credit to https://gist.github.com/crowsonkb/65f7265353f403714fce3b2595e0b298
|
92 |
+
def __init__(self, params, lr=1.e-3, betas=(0.9, 0.999), eps=1.e-8, # TODO: check hyperparameters before using
|
93 |
+
weight_decay=1.e-2, amsgrad=False, ema_decay=0.9999, # ema decay to match previous code
|
94 |
+
ema_power=1., param_names=()):
|
95 |
+
"""AdamW that saves EMA versions of the parameters."""
|
96 |
+
if not 0.0 <= lr:
|
97 |
+
raise ValueError("Invalid learning rate: {}".format(lr))
|
98 |
+
if not 0.0 <= eps:
|
99 |
+
raise ValueError("Invalid epsilon value: {}".format(eps))
|
100 |
+
if not 0.0 <= betas[0] < 1.0:
|
101 |
+
raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
|
102 |
+
if not 0.0 <= betas[1] < 1.0:
|
103 |
+
raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
|
104 |
+
if not 0.0 <= weight_decay:
|
105 |
+
raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
|
106 |
+
if not 0.0 <= ema_decay <= 1.0:
|
107 |
+
raise ValueError("Invalid ema_decay value: {}".format(ema_decay))
|
108 |
+
defaults = dict(lr=lr, betas=betas, eps=eps,
|
109 |
+
weight_decay=weight_decay, amsgrad=amsgrad, ema_decay=ema_decay,
|
110 |
+
ema_power=ema_power, param_names=param_names)
|
111 |
+
super().__init__(params, defaults)
|
112 |
+
|
113 |
+
def __setstate__(self, state):
|
114 |
+
super().__setstate__(state)
|
115 |
+
for group in self.param_groups:
|
116 |
+
group.setdefault('amsgrad', False)
|
117 |
+
|
118 |
+
@torch.no_grad()
|
119 |
+
def step(self, closure=None):
|
120 |
+
"""Performs a single optimization step.
|
121 |
+
Args:
|
122 |
+
closure (callable, optional): A closure that reevaluates the model
|
123 |
+
and returns the loss.
|
124 |
+
"""
|
125 |
+
loss = None
|
126 |
+
if closure is not None:
|
127 |
+
with torch.enable_grad():
|
128 |
+
loss = closure()
|
129 |
+
|
130 |
+
for group in self.param_groups:
|
131 |
+
params_with_grad = []
|
132 |
+
grads = []
|
133 |
+
exp_avgs = []
|
134 |
+
exp_avg_sqs = []
|
135 |
+
ema_params_with_grad = []
|
136 |
+
state_sums = []
|
137 |
+
max_exp_avg_sqs = []
|
138 |
+
state_steps = []
|
139 |
+
amsgrad = group['amsgrad']
|
140 |
+
beta1, beta2 = group['betas']
|
141 |
+
ema_decay = group['ema_decay']
|
142 |
+
ema_power = group['ema_power']
|
143 |
+
|
144 |
+
for p in group['params']:
|
145 |
+
if p.grad is None:
|
146 |
+
continue
|
147 |
+
params_with_grad.append(p)
|
148 |
+
if p.grad.is_sparse:
|
149 |
+
raise RuntimeError('AdamW does not support sparse gradients')
|
150 |
+
grads.append(p.grad)
|
151 |
+
|
152 |
+
state = self.state[p]
|
153 |
+
|
154 |
+
# State initialization
|
155 |
+
if len(state) == 0:
|
156 |
+
state['step'] = 0
|
157 |
+
# Exponential moving average of gradient values
|
158 |
+
state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
|
159 |
+
# Exponential moving average of squared gradient values
|
160 |
+
state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
|
161 |
+
if amsgrad:
|
162 |
+
# Maintains max of all exp. moving avg. of sq. grad. values
|
163 |
+
state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
|
164 |
+
# Exponential moving average of parameter values
|
165 |
+
state['param_exp_avg'] = p.detach().float().clone()
|
166 |
+
|
167 |
+
exp_avgs.append(state['exp_avg'])
|
168 |
+
exp_avg_sqs.append(state['exp_avg_sq'])
|
169 |
+
ema_params_with_grad.append(state['param_exp_avg'])
|
170 |
+
|
171 |
+
if amsgrad:
|
172 |
+
max_exp_avg_sqs.append(state['max_exp_avg_sq'])
|
173 |
+
|
174 |
+
# update the steps for each param group update
|
175 |
+
state['step'] += 1
|
176 |
+
# record the step after step update
|
177 |
+
state_steps.append(state['step'])
|
178 |
+
|
179 |
+
optim._functional.adamw(params_with_grad,
|
180 |
+
grads,
|
181 |
+
exp_avgs,
|
182 |
+
exp_avg_sqs,
|
183 |
+
max_exp_avg_sqs,
|
184 |
+
state_steps,
|
185 |
+
amsgrad=amsgrad,
|
186 |
+
beta1=beta1,
|
187 |
+
beta2=beta2,
|
188 |
+
lr=group['lr'],
|
189 |
+
weight_decay=group['weight_decay'],
|
190 |
+
eps=group['eps'],
|
191 |
+
maximize=False)
|
192 |
+
|
193 |
+
cur_ema_decay = min(ema_decay, 1 - state['step'] ** -ema_power)
|
194 |
+
for param, ema_param in zip(params_with_grad, ema_params_with_grad):
|
195 |
+
ema_param.mul_(cur_ema_decay).add_(param.float(), alpha=1 - cur_ema_decay)
|
196 |
+
|
197 |
+
return loss
|
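A sketch of how instantiate_from_config above resolves a target string into an object; the config dict here is illustrative, not one of the repository's YAML configs:

from ldm.util import instantiate_from_config, get_obj_from_str

config = {
    "target": "torch.nn.Conv2d",
    "params": {"in_channels": 3, "out_channels": 16, "kernel_size": 3},
}
layer = instantiate_from_config(config)  # same as torch.nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3)

# get_obj_from_str does only the import-and-lookup step
conv_cls = get_obj_from_str("torch.nn.Conv2d")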
pages/Extract_Secret.py
ADDED
@@ -0,0 +1,108 @@
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
"""
|
4 |
+
streamlit app demo
|
5 |
+
how to run:
|
6 |
+
streamlit run app.py --server.port 8501
|
7 |
+
|
8 |
+
@author: Tu Bui @surrey.ac.uk
|
9 |
+
"""
|
10 |
+
import os, sys, torch
|
11 |
+
import inspect
|
12 |
+
cdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
|
13 |
+
sys.path.insert(1, os.path.join(cdir, '../'))
|
14 |
+
import argparse
|
15 |
+
from pathlib import Path
|
16 |
+
import numpy as np
|
17 |
+
import pickle
|
18 |
+
import pytorch_lightning as pl
|
19 |
+
from torchvision import transforms
|
20 |
+
import argparse
|
21 |
+
from ldm.util import instantiate_from_config
|
22 |
+
from omegaconf import OmegaConf
|
23 |
+
from PIL import Image
|
24 |
+
from tools.augment_imagenetc import RandomImagenetC
|
25 |
+
from cldm.transformations2 import TransformNet
|
26 |
+
from io import BytesIO
|
27 |
+
from tools.helpers import welcome_message
|
28 |
+
from tools.ecc import BCH, RSC
|
29 |
+
import streamlit as st
|
30 |
+
from Embed_Secret import parse_st_args, load_ecc, load_model, decode_secret, to_bytes, model_names
|
31 |
+
|
32 |
+
|
33 |
+
def app(args):
|
34 |
+
st.title('Watermarking Demo')
|
35 |
+
# setup model
|
36 |
+
model_name = st.selectbox("Choose the model", model_names)
|
37 |
+
model, tform_emb, tform_det, secret_len = load_model(model_name, args)
|
38 |
+
display_width = 300
|
39 |
+
ecc = load_ecc('BCH', secret_len)
|
40 |
+
noise = TransformNet(p=1.0, crop_mode='resized_crop')
|
41 |
+
noise_names = noise.optional_names
|
42 |
+
|
43 |
+
# setup st
|
44 |
+
st.subheader("Input")
|
45 |
+
image_file = None
|
46 |
+
image_file = st.file_uploader("Upload stego image", type=["png","jpg","jpeg"])
|
47 |
+
if image_file is not None:
|
48 |
+
im = Image.open(image_file).convert('RGB')
|
49 |
+
ext = image_file.name.split('.')[-1]
|
50 |
+
st.image(im, width=display_width)
|
51 |
+
|
52 |
+
|
53 |
+
# add crop
|
54 |
+
st.subheader("Corruptions")
|
55 |
+
crop_button = st.button('Regenerate Crop/Flip/Resize', key='crop')
|
56 |
+
if image_file is not None:
|
57 |
+
im_crop = noise.apply_transform_on_pil_image(im, 'Fixed Augment')
|
58 |
+
if crop_button:
|
59 |
+
im_crop = noise.apply_transform_on_pil_image(im, 'Fixed Augment')
|
60 |
+
# st.image(im_crop, width=display_width)
|
61 |
+
|
62 |
+
# add noise source 1
|
63 |
+
corrupt_method1 = st.selectbox("Choose noise source #1", ['None'] + noise_names, key='noise1')
|
64 |
+
if image_file is not None:
|
65 |
+
if corrupt_method1=='None':
|
66 |
+
im_noise1 = im_crop
|
67 |
+
else:
|
68 |
+
im_noise1 = noise.apply_transform_on_pil_image(im_crop, corrupt_method1)
|
69 |
+
# st.image(im_noise1, width=display_width)
|
70 |
+
|
71 |
+
# add noise source 2
|
72 |
+
corrupt_method2 = st.selectbox("Choose noise source #2", ['None'] + noise_names, key='noise2')
|
73 |
+
if image_file is not None:
|
74 |
+
if corrupt_method2=='None':
|
75 |
+
im_noise2 = im_noise1
|
76 |
+
else:
|
77 |
+
im_noise2 = noise.apply_transform_on_pil_image(im_noise1, corrupt_method2)
|
78 |
+
|
79 |
+
st.subheader("Output")
|
80 |
+
if image_file is not None:
|
81 |
+
st.image(im_noise2, width=display_width)
|
82 |
+
mime='image/jpeg' if ext=='jpg' else f'image/{ext}'
|
83 |
+
im_noise2_bytes = to_bytes(np.uint8(im_noise2), mime)
|
84 |
+
st.download_button(label='Download image', data=im_noise2_bytes, file_name=f'corrupted.{ext}', mime=mime)
|
85 |
+
|
86 |
+
# prediction
|
87 |
+
st.subheader('Extract Secret From Output')
|
88 |
+
status = st.empty()
|
89 |
+
if image_file is not None:
|
90 |
+
secret_pred = decode_secret(model_name, model, im_noise2, tform_det)
|
91 |
+
secret_decoded = ecc.decode_text(secret_pred)[0]
|
92 |
+
status.markdown(f'Predicted secret: **{secret_decoded}**', unsafe_allow_html=True)
|
93 |
+
|
94 |
+
# bit acc
|
95 |
+
st.subheader('Accuracy')
|
96 |
+
secret_text = st.text_input('Input groundtruth secret')
|
97 |
+
bit_acc_status = st.empty()
|
98 |
+
if image_file is not None and secret_text:
|
99 |
+
secret = ecc.encode_text([secret_text]) # (1, 100)
|
100 |
+
bit_acc = (secret_pred == secret).mean()
|
101 |
+
# bit_acc_status.markdown('**Bit Accuracy**: {:.2f}%'.format(bit_acc*100), unsafe_allow_html=True)
|
102 |
+
word_acc = int(secret_decoded == secret_text)
|
103 |
+
bit_acc_status.markdown(f'Bit Accuracy: **{bit_acc*100:.2f}%**<br />Word Accuracy: **{word_acc}**', unsafe_allow_html=True)
|
104 |
+
|
105 |
+
if __name__ == '__main__':
|
106 |
+
args = parse_st_args()
|
107 |
+
app(args)
|
108 |
+
|
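The core of the extraction flow above, stripped of the Streamlit UI; a sketch that reuses the helpers imported from Embed_Secret with the call signatures shown in the page, where the stego image path and the model choice are placeholders:

from PIL import Image
from Embed_Secret import parse_st_args, load_ecc, load_model, decode_secret, model_names

args = parse_st_args()
model, tform_emb, tform_det, secret_len = load_model(model_names[0], args)
ecc = load_ecc('BCH', secret_len)

im = Image.open('stego.png').convert('RGB')                        # placeholder path
secret_pred = decode_secret(model_names[0], model, im, tform_det)  # predicted bit array
print(ecc.decode_text(secret_pred)[0])                             # BCH-decoded string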
tools/__init__.py
ADDED
@@ -0,0 +1,3 @@
1 |
+
from .helpers import *
|
2 |
+
from .hparams import HParams
|
3 |
+
from .slack_bot import Notifier
|
tools/augment_imagenetc.py
ADDED
@@ -0,0 +1,155 @@
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
"""
|
4 |
+
wrapper for imagenet-c transformations
|
5 |
+
@author: Tu Bui @surrey.ac.uk
|
6 |
+
"""
|
7 |
+
from __future__ import absolute_import
|
8 |
+
from __future__ import division
|
9 |
+
from __future__ import print_function
|
10 |
+
import os
|
11 |
+
import sys
|
12 |
+
import random
|
13 |
+
import numpy as np
|
14 |
+
from PIL import Image
|
15 |
+
from imagenet_c import corrupt, corruption_dict
|
16 |
+
|
17 |
+
|
18 |
+
class IdentityAugment(object):
|
19 |
+
def __call__(self, x):
|
20 |
+
return x
|
21 |
+
|
22 |
+
def __repr__(self):
|
23 |
+
s = f'()'
|
24 |
+
return self.__class__.__name__ + s
|
25 |
+
|
26 |
+
class RandomImagenetC(object):
|
27 |
+
# transform ids 5 (motion blur) and 7 (snow) require WandImage, which is not fork-safe; ids 4 (glass blur) and 6 (zoom blur) are very slow, so they are moved to validation (unseen); id 12 (elastic transform) is unrealistic
|
28 |
+
methods = {'train': np.array([0,1,2,3,8,9,10,11,13,14,15, 16, 17, 18]),#np.arange(15),
|
29 |
+
'val': np.array([4, 5, 6, 7, 12]),
|
30 |
+
'test': np.array([0,1,2,3,8,9,10,11,13,14,15, 16, 17, 18])
|
31 |
+
}
|
32 |
+
method_names = list(corruption_dict.keys())
|
33 |
+
def __init__(self, min_severity=1, max_severity=5, phase='all', p=1.0,n=19):
|
34 |
+
assert phase in ['train', 'val', 'test', 'all'], ValueError(f'{phase} not recognised. Must be one of [train, val, test, all]')
|
35 |
+
if phase == 'all':
|
36 |
+
self.corrupt_ids = np.concatenate(list(self.methods.values()))
|
37 |
+
else:
|
38 |
+
self.corrupt_ids = self.methods[phase]
|
39 |
+
self.corrupt_ids = self.corrupt_ids[:n] # first n tforms
|
40 |
+
self.phase = phase
|
41 |
+
self.severity = np.arange(min_severity, max_severity+1)
|
42 |
+
self.p = p # probability to apply a transformation
|
43 |
+
|
44 |
+
def __call__(self, x, corrupt_id=None, corrupt_strength=None):
|
45 |
+
# input: x PIL image
|
46 |
+
if corrupt_id is None:
|
47 |
+
if len(self.corrupt_ids)==0: # do nothing
|
48 |
+
return x
|
49 |
+
corrupt_id = np.random.choice(self.corrupt_ids)
|
50 |
+
else:
|
51 |
+
assert corrupt_id in range(19)
|
52 |
+
|
53 |
+
severity = np.random.choice(self.severity) if corrupt_strength is None else corrupt_strength
|
54 |
+
assert severity in self.severity, f"Error! Corrupt strength {severity} isn't supported."
|
55 |
+
|
56 |
+
if np.random.rand() < self.p:
|
57 |
+
org_size = x.size
|
58 |
+
x = np.asarray(x.convert('RGB').resize((224, 224), Image.BILINEAR))[:,:,::-1]
|
59 |
+
x = corrupt(x, severity, corruption_number=corrupt_id)
|
60 |
+
x = Image.fromarray(x[:,:,::-1])
|
61 |
+
if x.size != org_size:
|
62 |
+
x = x.resize(org_size, Image.BILINEAR)
|
63 |
+
return x
|
64 |
+
|
65 |
+
def transform_with_fixed_severity(self, x, severity, corrupt_id=None):
|
66 |
+
if corrupt_id is None:
|
67 |
+
corrupt_id = np.random.choice(self.corrupt_ids)
|
68 |
+
else:
|
69 |
+
assert corrupt_id in self.corrupt_ids
|
70 |
+
assert severity > 0 and severity < 6
|
71 |
+
org_size = x.size
|
72 |
+
x = np.asarray(x.convert('RGB').resize((224, 224), Image.BILINEAR))[:,:,::-1]
|
73 |
+
x = corrupt(x, severity, corruption_number=corrupt_id)
|
74 |
+
x = Image.fromarray(x[:,:,::-1])
|
75 |
+
if x.size != org_size:
|
76 |
+
x = x.resize(org_size, Image.BILINEAR)
|
77 |
+
return x
|
78 |
+
|
79 |
+
def __repr__(self):
|
80 |
+
s = f'(severity={self.severity}, phase={self.phase}, p={self.p},ids={self.corrupt_ids})'
|
81 |
+
return self.__class__.__name__ + s
|
82 |
+
|
83 |
+
|
84 |
+
class NoiseResidual(object):
|
85 |
+
def __init__(self, k=16):
|
86 |
+
self.k = k
|
87 |
+
def __call__(self, x):
|
88 |
+
h, w = x.height, x.width
|
89 |
+
x1 = x.resize((w//self.k,h//self.k), Image.BILINEAR).resize((w, h), Image.BILINEAR)
|
90 |
+
x1 = np.abs(np.array(x).astype(np.float32) - np.array(x1).astype(np.float32))
|
91 |
+
x1 = (x1 - x1.min())/(x1.max() - x1.min() + np.finfo(np.float32).eps)
|
92 |
+
x1 = Image.fromarray((x1*255).astype(np.uint8))
|
93 |
+
return x1
|
94 |
+
def __repr__(self):
|
95 |
+
s = f'(k={self.k}'
|
96 |
+
return self.__class__.__name__ + s
|
97 |
+
|
98 |
+
|
99 |
+
def get_transforms(img_mean=[0.5, 0.5, 0.5], img_std=[0.5, 0.5, 0.5], rsize=256, csize=224, pertubation=True, dct=False, residual=False, max_c=19):
|
100 |
+
from torchvision import transforms
|
101 |
+
prep = transforms.Compose([
|
102 |
+
transforms.Resize(rsize),
|
103 |
+
transforms.RandomHorizontalFlip(),
|
104 |
+
transforms.RandomCrop(csize)])
|
105 |
+
if pertubation:
|
106 |
+
pertubation_train = RandomImagenetC(max_severity=5, phase='train', p=0.95,n=max_c)
|
107 |
+
pertubation_val = RandomImagenetC(max_severity=5, phase='train', p=1.0,n=max_c)
|
108 |
+
pertubation_test = RandomImagenetC(max_severity=5, phase='val', p=1.0,n=max_c)
|
109 |
+
else:
|
110 |
+
pertubation_train = pertubation_val = pertubation_test = IdentityAugment()
|
111 |
+
if dct:
|
112 |
+
from .image_tools import DCT
|
113 |
+
norm = [
|
114 |
+
DCT(),
|
115 |
+
transforms.ToTensor(),
|
116 |
+
transforms.Normalize(mean=img_mean, std=img_std)]
|
117 |
+
else:
|
118 |
+
norm = [
|
119 |
+
transforms.ToTensor(),
|
120 |
+
transforms.Normalize(mean=img_mean, std=img_std)]
|
121 |
+
if residual:
|
122 |
+
norm.insert(0, NoiseResidual())
|
123 |
+
|
124 |
+
preprocess = {
|
125 |
+
'train': [prep, pertubation_train, transforms.Compose(norm)],
|
126 |
+
|
127 |
+
'val': [prep, pertubation_val, transforms.Compose(norm)],
|
128 |
+
|
129 |
+
'test_unseen': [prep, pertubation_test, transforms.Compose(norm)],
|
130 |
+
|
131 |
+
'clean': transforms.Compose([transforms.Resize(csize)] + norm)
|
132 |
+
}
|
133 |
+
return preprocess
|
134 |
+
|
135 |
+
|
136 |
+
# ## example
|
137 |
+
# from PIL import Image
|
138 |
+
# import numpy as np
|
139 |
+
# import time
|
140 |
+
# from imagenet_c import corrupt, corruption_dict
|
141 |
+
# im = Image.open('/vol/research/tubui1/projects/gan_prov/gan_models/stargan2/test.jpg').convert('RGB').resize((224,224), Image.BILINEAR)
|
142 |
+
# im.save('original.jpg')
|
143 |
+
# im = np.array(im)[:,:,::-1] # BRG
|
144 |
+
# t = np.zeros(19)
|
145 |
+
# for i, key in enumerate(corruption_dict.keys()):
|
146 |
+
# begin = time.time()
|
147 |
+
# for j in range(10):
|
148 |
+
# out = corrupt(im, 5, corruption_number=i)
|
149 |
+
# end = time.time()
|
150 |
+
# t[i] = end-begin
|
151 |
+
# # Image.fromarray(out[:,:,::-1]).save(f'imc_{key}.jpg')
|
152 |
+
# print(f'{i} - {key}: {end-begin}')
|
153 |
+
|
154 |
+
# for i,k in enumerate(corruption_dict.keys()):
|
155 |
+
# print(i, k, t[i])
|
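A usage sketch for RandomImagenetC above; it requires the imagenet_c package, and the image path, corruption id and severity are illustrative:

from PIL import Image
from tools.augment_imagenetc import RandomImagenetC

tform = RandomImagenetC(min_severity=1, max_severity=5, phase='train', p=1.0)

im = Image.open('test.jpg').convert('RGB')               # placeholder path
corrupted = tform(im, corrupt_id=2, corrupt_strength=3)  # one specific corruption at severity 3
randomised = tform(im)                                   # corruption and severity drawn at random
fixed = tform.transform_with_fixed_severity(im, severity=5, corrupt_id=8)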
tools/base_lmdb.py
ADDED
@@ -0,0 +1,588 @@
1 |
+
from typing import Any, Optional, Union
|
2 |
+
from pathlib import Path
|
3 |
+
import os
|
4 |
+
import io
|
5 |
+
import lmdb
|
6 |
+
import pickle
|
7 |
+
import gzip
|
8 |
+
import bz2
|
9 |
+
import lzma
|
10 |
+
import shutil
|
11 |
+
from tqdm import tqdm
|
12 |
+
import pandas as pd
|
13 |
+
import numpy as np
|
14 |
+
from numpy import ndarray
|
15 |
+
import time
|
16 |
+
import torch
|
17 |
+
from torch import Tensor
|
18 |
+
from distutils.dir_util import copy_tree
|
19 |
+
from PIL import Image
|
20 |
+
from PIL import ImageFile
|
21 |
+
|
22 |
+
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
23 |
+
|
24 |
+
|
25 |
+
def _default_encode(data: Any, protocol: int) -> bytes:
|
26 |
+
return pickle.dumps(data, protocol=protocol)
|
27 |
+
|
28 |
+
|
29 |
+
def _ascii_encode(data: str) -> bytes:
|
30 |
+
return data.encode("ascii")
|
31 |
+
|
32 |
+
|
33 |
+
def _default_decode(data: bytes) -> Any:
|
34 |
+
return pickle.loads(data)
|
35 |
+
|
36 |
+
|
37 |
+
def _default_decompress(data: bytes) -> bytes:
|
38 |
+
return data
|
39 |
+
|
40 |
+
|
41 |
+
def _decompress(compression: Optional[str]):
|
42 |
+
if compression is None:
|
43 |
+
_decompress = _default_decompress
|
44 |
+
elif compression == "gzip":
|
45 |
+
_decompress = gzip.decompress
|
46 |
+
elif compression == "bz2":
|
47 |
+
_decompress = bz2.decompress
|
48 |
+
elif compression == "lzma":
|
49 |
+
_decompress = lzma.decompress
|
50 |
+
else:
|
51 |
+
raise ValueError(f"Unknown compression algorithm: {compression}")
|
52 |
+
|
53 |
+
return _decompress
|
54 |
+
|
55 |
+
|
56 |
+
class BaseLMDB(object):
|
57 |
+
_database = None
|
58 |
+
_protocol = None
|
59 |
+
_length = None
|
60 |
+
|
61 |
+
def __init__(
|
62 |
+
self,
|
63 |
+
path: Union[str, Path],
|
64 |
+
readahead: bool = False,
|
65 |
+
pre_open: bool = False,
|
66 |
+
compression: Optional[str] = None
|
67 |
+
):
|
68 |
+
"""
|
69 |
+
Base class for LMDB-backed databases.
|
70 |
+
|
71 |
+
:param path: Path to the database.
|
72 |
+
:param readahead: Enables the filesystem readahead mechanism.
|
73 |
+
:param pre_open: If set to True, the first iterations will be faster, but it will raise an error when doing multi-GPU training. If set to False, the database will open when you retrieve the first item.
|
74 |
+
"""
|
75 |
+
if not isinstance(path, str):
|
76 |
+
path = str(path)
|
77 |
+
|
78 |
+
self.path = path
|
79 |
+
self.readahead = readahead
|
80 |
+
self.pre_open = pre_open
|
81 |
+
self._decompress = _decompress(compression)
|
82 |
+
self._has_fetched_an_item = False
|
83 |
+
|
84 |
+
@property
|
85 |
+
def database(self):
|
86 |
+
if self._database is None:
|
87 |
+
self._database = lmdb.open(
|
88 |
+
path=self.path,
|
89 |
+
readonly=True,
|
90 |
+
readahead=self.readahead,
|
91 |
+
max_spare_txns=256,
|
92 |
+
lock=False,
|
93 |
+
)
|
94 |
+
return self._database
|
95 |
+
|
96 |
+
@database.deleter
|
97 |
+
def database(self):
|
98 |
+
if self._database is not None:
|
99 |
+
self._database.close()
|
100 |
+
self._database = None
|
101 |
+
|
102 |
+
@property
|
103 |
+
def protocol(self):
|
104 |
+
"""
|
105 |
+
Read the pickle protocol contained in the database.
|
106 |
+
|
107 |
+
:return: The pickle protocol used to serialise the stored values.
|
108 |
+
"""
|
109 |
+
if self._protocol is None:
|
110 |
+
self._protocol = self._get(
|
111 |
+
item="protocol",
|
112 |
+
encode_key=_ascii_encode,
|
113 |
+
decompress_value=_default_decompress,
|
114 |
+
decode_value=_default_decode,
|
115 |
+
)
|
116 |
+
return self._protocol
|
117 |
+
|
118 |
+
@property
|
119 |
+
def keys(self):
|
120 |
+
"""
|
121 |
+
Read the keys contained in the database.
|
122 |
+
|
123 |
+
:return: The set of available keys.
|
124 |
+
"""
|
125 |
+
protocol = self.protocol
|
126 |
+
keys = self._get(
|
127 |
+
item="keys",
|
128 |
+
encode_key=lambda key: _default_encode(key, protocol=protocol),
|
129 |
+
decompress_value=_default_decompress,
|
130 |
+
decode_value=_default_decode,
|
131 |
+
)
|
132 |
+
return keys
|
133 |
+
|
134 |
+
def __len__(self):
|
135 |
+
"""
|
136 |
+
Returns the number of keys available in the database.
|
137 |
+
|
138 |
+
:return: The number of keys.
|
139 |
+
"""
|
140 |
+
if self._length is None:
|
141 |
+
self._length = len(self.keys)
|
142 |
+
return self._length
|
143 |
+
|
144 |
+
def __getitem__(self, item):
|
145 |
+
"""
|
146 |
+
Retrieves an item or a list of items from the database.
|
147 |
+
|
148 |
+
:param item: A key or a list of keys.
|
149 |
+
:return: A value or a list of values.
|
150 |
+
"""
|
151 |
+
self._has_fetched_an_item = True
|
152 |
+
if not isinstance(item, list):
|
153 |
+
item = self._get(
|
154 |
+
item=item,
|
155 |
+
encode_key=self._encode_key,
|
156 |
+
decompress_value=self._decompress_value,
|
157 |
+
decode_value=self._decode_value,
|
158 |
+
)
|
159 |
+
else:
|
160 |
+
item = self._gets(
|
161 |
+
items=item,
|
162 |
+
encode_keys=self._encode_keys,
|
163 |
+
decompress_values=self._decompress_values,
|
164 |
+
decode_values=self._decode_values,
|
165 |
+
)
|
166 |
+
return item
|
167 |
+
|
168 |
+
def _get(self, item, encode_key, decompress_value, decode_value):
|
169 |
+
"""
|
170 |
+
Instantiates a transaction and its associated cursor to fetch an item.
|
171 |
+
|
172 |
+
:param item: A key.
|
173 |
+
:param encode_key:
|
174 |
+
:param decode_value:
|
175 |
+
:return:
|
176 |
+
"""
|
177 |
+
with self.database.begin() as txn:
|
178 |
+
with txn.cursor() as cursor:
|
179 |
+
item = self._fetch(
|
180 |
+
cursor=cursor,
|
181 |
+
key=item,
|
182 |
+
encode_key=encode_key,
|
183 |
+
decompress_value=decompress_value,
|
184 |
+
decode_value=decode_value,
|
185 |
+
)
|
186 |
+
self._keep_database()
|
187 |
+
return item
|
188 |
+
|
189 |
+
def _gets(self, items, encode_keys, decompress_values, decode_values):
|
190 |
+
"""
|
191 |
+
Instantiates a transaction and its associated cursor to fetch a list of items.
|
192 |
+
|
193 |
+
:param items: A list of keys.
|
194 |
+
:param encode_keys:
|
195 |
+
:param decode_values:
|
196 |
+
:return:
|
197 |
+
"""
|
198 |
+
with self.database.begin() as txn:
|
199 |
+
with txn.cursor() as cursor:
|
200 |
+
items = self._fetchs(
|
201 |
+
cursor=cursor,
|
202 |
+
keys=items,
|
203 |
+
encode_keys=encode_keys,
|
204 |
+
decompress_values=decompress_values,
|
205 |
+
decode_values=decode_values,
|
206 |
+
)
|
207 |
+
self._keep_database()
|
208 |
+
return items
|
209 |
+
|
210 |
+
def _fetch(self, cursor, key, encode_key, decompress_value, decode_value):
|
211 |
+
"""
|
212 |
+
Retrieve a value given a key.
|
213 |
+
|
214 |
+
:param cursor:
|
215 |
+
:param key: A key.
|
216 |
+
:param encode_key:
|
217 |
+
:param decode_value:
|
218 |
+
:return: A value.
|
219 |
+
"""
|
220 |
+
key = encode_key(key)
|
221 |
+
value = cursor.get(key)
|
222 |
+
value = decompress_value(value)
|
223 |
+
value = decode_value(value)
|
224 |
+
return value
|
225 |
+
|
226 |
+
def _fetchs(self, cursor, keys, encode_keys, decompress_values, decode_values):
|
227 |
+
"""
|
228 |
+
Retrieve a list of values given a list of keys.
|
229 |
+
|
230 |
+
:param cursor:
|
231 |
+
:param keys: A list of keys.
|
232 |
+
:param encode_keys:
|
233 |
+
:param decode_values:
|
234 |
+
:return: A list of values.
|
235 |
+
"""
|
236 |
+
keys = encode_keys(keys)
|
237 |
+
_, values = list(zip(*cursor.getmulti(keys)))
|
238 |
+
values = decompress_values(values)
|
239 |
+
values = decode_values(values)
|
240 |
+
return values
|
241 |
+
|
242 |
+
def _encode_key(self, key: Any) -> bytes:
|
243 |
+
"""
|
244 |
+
Converts a key into a byte key.
|
245 |
+
|
246 |
+
:param key: A key.
|
247 |
+
:return: A byte key.
|
248 |
+
"""
|
249 |
+
return pickle.dumps(key, protocol=self.protocol)
|
250 |
+
|
251 |
+
def _encode_keys(self, keys: list) -> list:
|
252 |
+
"""
|
253 |
+
Converts keys into byte keys.
|
254 |
+
|
255 |
+
:param keys: A list of keys.
|
256 |
+
:return: A list of byte keys.
|
257 |
+
"""
|
258 |
+
return [self._encode_key(key=key) for key in keys]
|
259 |
+
|
260 |
+
def _decompress_value(self, value: bytes) -> bytes:
|
261 |
+
return self._decompress(value)
|
262 |
+
|
263 |
+
def _decompress_values(self, values: list) -> list:
|
264 |
+
return [self._decompress_value(value=value) for value in values]
|
265 |
+
|
266 |
+
def _decode_value(self, value: bytes) -> Any:
|
267 |
+
"""
|
268 |
+
Converts a byte value back into a value.
|
269 |
+
|
270 |
+
:param value: A byte value.
|
271 |
+
:return: A value
|
272 |
+
"""
|
273 |
+
return pickle.loads(value)
|
274 |
+
|
275 |
+
def _decode_values(self, values: list) -> list:
|
276 |
+
"""
|
277 |
+
Converts bytes values back into values.
|
278 |
+
|
279 |
+
:param values: A list of byte values.
|
280 |
+
:return: A list of values.
|
281 |
+
"""
|
282 |
+
return [self._decode_value(value=value) for value in values]
|
283 |
+
|
284 |
+
def _keep_database(self):
|
285 |
+
"""
|
286 |
+
Checks if the database must be deleted.
|
287 |
+
|
288 |
+
:return:
|
289 |
+
"""
|
290 |
+
if not self.pre_open and not self._has_fetched_an_item:
|
291 |
+
del self.database
|
292 |
+
|
293 |
+
def __iter__(self):
|
294 |
+
"""
|
295 |
+
Provides an iterator over the keys when iterating over the database.
|
296 |
+
|
297 |
+
:return: An iterator on the keys.
|
298 |
+
"""
|
299 |
+
return iter(self.keys)
|
300 |
+
|
301 |
+
def __del__(self):
|
302 |
+
"""
|
303 |
+
Closes the database properly.
|
304 |
+
"""
|
305 |
+
del self.database
|
306 |
+
|
307 |
+
@staticmethod
|
308 |
+
def write(data_lst, indir, outdir):
|
309 |
+
raise NotImplementedError
|
310 |
+
|
311 |
+
|
312 |
+
class PILlmdb(BaseLMDB):
|
313 |
+
def __init__(
|
314 |
+
self,
|
315 |
+
lmdb_dir: Union[str, Path],
|
316 |
+
image_list: Union[str, Path, pd.DataFrame]=None,
|
317 |
+
index_key='id',
|
318 |
+
**kwargs
|
319 |
+
):
|
320 |
+
super().__init__(path=lmdb_dir, **kwargs)
|
321 |
+
if image_list is None:
|
322 |
+
self.ids = list(range(len(self.keys)))
|
323 |
+
self.labels = list(range(len(self.ids)))
|
324 |
+
else:
|
325 |
+
df = pd.read_csv(str(image_list))
|
326 |
+
assert index_key in df, f'[PILlmdb] Error! {image_list} must have id keys.'
|
327 |
+
self.ids = df[index_key].tolist()
|
328 |
+
assert max(self.ids) < len(self.keys)
|
329 |
+
if 'label' in df:
|
330 |
+
self.labels = df['label'].tolist()
|
331 |
+
else: # all numeric keys other than 'id' are labels
|
332 |
+
keys = [key for key in df if (key!=index_key and type(df[key][0]) in [int, np.int64])]
|
333 |
+
# df = df.drop('id', axis=1)
|
334 |
+
self.labels = df[keys].to_numpy()
|
335 |
+
self._length = len(self.ids)
|
336 |
+
|
337 |
+
def __len__(self):
|
338 |
+
return self._length
|
339 |
+
|
340 |
+
def __iter__(self):
|
341 |
+
return iter([self.keys[i] for i in self.ids])
|
342 |
+
|
343 |
+
def __getitem__(self, index):
|
344 |
+
key = self.keys[self.ids[index]]
|
345 |
+
return super().__getitem__(key)
|
346 |
+
|
347 |
+
def set_ids(self, ids):
|
348 |
+
self.ids = [self.ids[i] for i in ids]
|
349 |
+
self.labels = [self.labels[i] for i in ids]
|
350 |
+
self._length = len(self.ids)
|
351 |
+
|
352 |
+
def _decode_value(self, value: bytes):
|
353 |
+
"""
|
354 |
+
Converts a byte image back into a PIL Image.
|
355 |
+
|
356 |
+
:param value: A byte image.
|
357 |
+
:return: A PIL Image image.
|
358 |
+
"""
|
359 |
+
return Image.open(io.BytesIO(value))
|
360 |
+
|
361 |
+
@staticmethod
|
362 |
+
def write(indir, outdir, data_lst=None, transform=None):
|
363 |
+
"""
|
364 |
+
create lmdb given data directory and list of image paths; or an iterator
|
365 |
+
:param data_lst: None, or a csv file containing a 'path' key that stores relative paths to the images
|
366 |
+
:param indir: root directory of the images
|
367 |
+
:param outdir: output lmdb directory; data.mdb and lock.mdb will be written here
|
368 |
+
"""
|
369 |
+
|
370 |
+
outdir = Path(outdir)
|
371 |
+
outdir.mkdir(parents=True, exist_ok=True)
|
372 |
+
tmp_dir = Path("/tmp") / f"TEMP_{time.time()}"
|
373 |
+
tmp_dir.mkdir(parents=True, exist_ok=True)
|
374 |
+
dtype = {'str': False, 'pil': False}
|
375 |
+
if isinstance(indir, str) or isinstance(indir, Path):
|
376 |
+
indir = Path(indir)
|
377 |
+
if data_lst is None: # grab all images in this dir
|
378 |
+
lst = list(indir.glob('**/*.jpg')) + list(indir.glob('**/*.png'))
|
379 |
+
else:
|
380 |
+
lst = pd.read_csv(data_lst)['path'].tolist()
|
381 |
+
lst = [indir/p for p in lst]
|
382 |
+
assert len(lst) > 0, f'Couldnt find any image in {indir} (Support only .jpg and .png) or list (must have path field).'
|
383 |
+
n = len(lst)
|
384 |
+
dtype['str'] = True
|
385 |
+
else: # iterator
|
386 |
+
n = len(indir)
|
387 |
+
lst = iter(indir)
|
388 |
+
dtype['pil'] = True
|
389 |
+
|
390 |
+
with lmdb.open(path=str(tmp_dir), map_size=2 ** 40) as env:
|
391 |
+
# Add the protocol to the database.
|
392 |
+
with env.begin(write=True) as txn:
|
393 |
+
key = "protocol".encode("ascii")
|
394 |
+
value = pickle.dumps(pickle.DEFAULT_PROTOCOL)
|
395 |
+
txn.put(key=key, value=value, dupdata=False)
|
396 |
+
# Add the keys to the database.
|
397 |
+
with env.begin(write=True) as txn:
|
398 |
+
key = pickle.dumps("keys")
|
399 |
+
value = pickle.dumps(list(range(n)))
|
400 |
+
txn.put(key=key, value=value, dupdata=False)
|
401 |
+
# Add the images to the database.
|
402 |
+
for key, value in tqdm(enumerate(lst), total=n, miniters=n//100, mininterval=300):
|
403 |
+
with env.begin(write=True) as txn:
|
404 |
+
key = pickle.dumps(key)
|
405 |
+
if dtype['str']:
|
406 |
+
with value.open("rb") as file:
|
407 |
+
byteimg = file.read()
|
408 |
+
else: # PIL
|
409 |
+
data = io.BytesIO()
|
410 |
+
value.save(data, 'png')
|
411 |
+
byteimg = data.getvalue()
|
412 |
+
|
413 |
+
if transform is not None:
|
414 |
+
im = Image.open(io.BytesIO(byteimg))
|
415 |
+
im = transform(im)
|
416 |
+
data = io.BytesIO()
|
417 |
+
im.save(data, 'png')
|
418 |
+
byteimg = data.getvalue()
|
419 |
+
txn.put(key=key, value=byteimg, dupdata=False)
|
420 |
+
|
421 |
+
# Move the database to its destination.
|
422 |
+
copy_tree(str(tmp_dir), str(outdir))
|
423 |
+
shutil.rmtree(str(tmp_dir))
|
424 |
+
|
425 |
+
|
426 |
+
|
427 |
+
class MaskDatabase(PILlmdb):
|
428 |
+
def _decode_value(self, value: bytes):
|
429 |
+
"""
|
430 |
+
Converts a byte image back into a PIL Image.
|
431 |
+
|
432 |
+
:param value: A byte image.
|
433 |
+
:return: A PIL Image image.
|
434 |
+
"""
|
435 |
+
return Image.open(io.BytesIO(value)).convert("1")
|
436 |
+
|
437 |
+
|
438 |
+
class LabelDatabase(BaseLMDB):
|
439 |
+
pass
|
440 |
+
|
441 |
+
|
442 |
+
class ArrayDatabase(BaseLMDB):
|
443 |
+
_dtype = None
|
444 |
+
_shape = None
|
445 |
+
|
446 |
+
def __init__(
|
447 |
+
self,
|
448 |
+
lmdb_dir: Union[str, Path],
|
449 |
+
image_list: Union[str, Path, pd.DataFrame]=None,
|
450 |
+
**kwargs
|
451 |
+
):
|
452 |
+
super().__init__(path=lmdb_dir, **kwargs)
|
453 |
+
if image_list is None:
|
454 |
+
self.ids = list(range(len(self.keys)))
|
455 |
+
self.labels = list(range(len(self.ids)))
|
456 |
+
else:
|
457 |
+
df = pd.read_csv(str(image_list))
|
458 |
+
assert 'id' in df, f'[ArrayDatabase] Error! {image_list} must have id keys.'
|
459 |
+
self.ids = df['id'].tolist()
|
460 |
+
assert max(self.ids) < len(self.keys)
|
461 |
+
if 'label' in df:
|
462 |
+
self.labels = df['label'].tolist()
|
463 |
+
else: # all numeric keys other than 'id' are labels
|
464 |
+
keys = [key for key in df if (key!='id' and type(df[key][0]) in [int, np.int64])]
|
465 |
+
# df = df.drop('id', axis=1)
|
466 |
+
self.labels = df[keys].to_numpy()
|
467 |
+
self._length = len(self.ids)
|
468 |
+
|
469 |
+
def set_ids(self, ids):
|
470 |
+
self.ids = [self.ids[i] for i in ids]
|
471 |
+
self.labels = [self.labels[i] for i in ids]
|
472 |
+
self._length = len(self.ids)
|
473 |
+
|
474 |
+
def __len__(self):
|
475 |
+
return self._length
|
476 |
+
|
477 |
+
def __iter__(self):
|
478 |
+
return iter([self.keys[i] for i in self.ids])
|
479 |
+
|
480 |
+
def __getitem__(self, index):
|
481 |
+
key = self.keys[self.ids[index]]
|
482 |
+
return super().__getitem__(key)
|
483 |
+
|
484 |
+
@property
|
485 |
+
def dtype(self):
|
486 |
+
if self._dtype is None:
|
487 |
+
protocol = self.protocol
|
488 |
+
self._dtype = self._get(
|
489 |
+
item="dtype",
|
490 |
+
encode_key=lambda key: _default_encode(key, protocol=protocol),
|
491 |
+
decompress_value=_default_decompress,
|
492 |
+
decode_value=_default_decode,
|
493 |
+
)
|
494 |
+
return self._dtype
|
495 |
+
|
496 |
+
@property
|
497 |
+
def shape(self):
|
498 |
+
if self._shape is None:
|
499 |
+
protocol = self.protocol
|
500 |
+
self._shape = self._get(
|
501 |
+
item="shape",
|
502 |
+
encode_key=lambda key: _default_encode(key, protocol=protocol),
|
503 |
+
decompress_value=_default_decompress,
|
504 |
+
decode_value=_default_decode,
|
505 |
+
)
|
506 |
+
return self._shape
|
507 |
+
|
508 |
+
def _decode_value(self, value: bytes) -> ndarray:
|
509 |
+
value = super()._decode_value(value)
|
510 |
+
return np.frombuffer(value, dtype=self.dtype).reshape(self.shape)
|
511 |
+
|
512 |
+
def _decode_values(self, values: list) -> ndarray:
|
513 |
+
shape = (len(values),) + self.shape
|
514 |
+
return np.frombuffer(b"".join(values), dtype=self.dtype).reshape(shape)
|
515 |
+
|
516 |
+
@staticmethod
|
517 |
+
def write(diter, outdir):
|
518 |
+
"""
|
519 |
+
diter is an iterator that has __len__ method
|
520 |
+
class Myiter():
|
521 |
+
def __init__(self, data):
|
522 |
+
self.data = data
|
523 |
+
def __iter__(self):
|
524 |
+
self.counter = 0
|
525 |
+
return self
|
526 |
+
def __len__(self):
|
527 |
+
return len(self.data)
|
528 |
+
def __next__(self):
|
529 |
+
if self.counter < len(self):
|
530 |
+
out = self.data[self.counter]
|
531 |
+
self.counter+=1
|
532 |
+
return out
|
533 |
+
else:
|
534 |
+
raise StopIteration
|
535 |
+
a = iter(Myiter([1,2,3]))
|
536 |
+
for i in a:
|
537 |
+
print(i)
|
538 |
+
"""
|
539 |
+
outdir = Path(outdir)
|
540 |
+
outdir.mkdir(parents=True, exist_ok=True)
|
541 |
+
tmp_dir = Path("/tmp") / f"TEMP_{time.time()}"
|
542 |
+
tmp_dir.mkdir(parents=True, exist_ok=True)
|
543 |
+
# Create the database.
|
544 |
+
n = len(diter)
|
545 |
+
with lmdb.open(path=str(tmp_dir), map_size=2 ** 40) as env:
|
546 |
+
# Add the protocol to the database.
|
547 |
+
with env.begin(write=True) as txn:
|
548 |
+
key = "protocol".encode("ascii")
|
549 |
+
value = pickle.dumps(pickle.DEFAULT_PROTOCOL)
|
550 |
+
txn.put(key=key, value=value, dupdata=False)
|
551 |
+
# Add the keys to the database.
|
552 |
+
with env.begin(write=True) as txn:
|
553 |
+
key = pickle.dumps("keys")
|
554 |
+
value = pickle.dumps(list(range(n)))
|
555 |
+
txn.put(key=key, value=value, dupdata=False)
|
556 |
+
# Extract the shape and dtype of the values.
|
557 |
+
value = next(iter(diter))
|
558 |
+
shape = value.shape
|
559 |
+
dtype = value.dtype
|
560 |
+
# Add the shape to the database.
|
561 |
+
with env.begin(write=True) as txn:
|
562 |
+
key = pickle.dumps("shape")
|
563 |
+
value = pickle.dumps(shape)
|
564 |
+
txn.put(key=key, value=value, dupdata=False)
|
565 |
+
# Add the dtype to the database.
|
566 |
+
with env.begin(write=True) as txn:
|
567 |
+
key = pickle.dumps("dtype")
|
568 |
+
value = pickle.dumps(dtype)
|
569 |
+
txn.put(key=key, value=value, dupdata=False)
|
570 |
+
# Add the values to the database.
|
571 |
+
with env.begin(write=True) as txn:
|
572 |
+
for key, value in tqdm(enumerate(iter(diter)), total=n, miniters=n//100, mininterval=300):
|
573 |
+
key = pickle.dumps(key)
|
574 |
+
value = pickle.dumps(value)
|
575 |
+
txn.put(key=key, value=value, dupdata=False)
|
576 |
+
|
577 |
+
# Move the database to its destination.
|
578 |
+
copy_tree(str(tmp_dir), str(outdir))
|
579 |
+
shutil.rmtree(str(tmp_dir))
|
580 |
+
|
581 |
+
|
582 |
+
|
583 |
+
class TensorDatabase(ArrayDatabase):
|
584 |
+
def _decode_value(self, value: bytes) -> Tensor:
|
585 |
+
return torch.from_numpy(super(TensorDatabase, self)._decode_value(value))
|
586 |
+
|
587 |
+
def _decode_values(self, values: list) -> Tensor:
|
588 |
+
return torch.from_numpy(super(TensorDatabase, self)._decode_values(values))
|
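
A minimal usage sketch for the ArrayDatabase writer above, for orientation before the next file. It is illustrative only: the import path and output directory are assumptions, and the ArrayDatabase constructor arguments are not shown in this excerpt.

# Sketch (assumptions noted above): write a stack of numpy arrays to LMDB.
import numpy as np
from tools.base_lmdb import ArrayDatabase  # assumed import path

arrays = [np.random.rand(3, 64, 64).astype(np.float32) for _ in range(10)]
ArrayDatabase.write(arrays, outdir='/tmp/array_db')  # hypothetical output dir
# Reading back goes through the dtype/shape properties defined above,
# e.g. db.dtype and db.shape once a database object is constructed.
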
tools/ecc.py
ADDED
@@ -0,0 +1,281 @@
|
1 |
+
import bchlib
|
2 |
+
import numpy as np
|
3 |
+
from typing import List, Tuple
|
4 |
+
import random
|
5 |
+
from copy import deepcopy
|
6 |
+
|
7 |
+
class RSC(object):
|
8 |
+
def __init__(self, data_bytes=16, ecc_bytes=4, verbose=False, **kwargs):
|
9 |
+
from reedsolo import RSCodec
|
10 |
+
self.rs = RSCodec(ecc_bytes)
|
11 |
+
if verbose:
|
12 |
+
print(f'Reed-Solomon ECC len: {ecc_bytes*8} bits')
|
13 |
+
self.data_len = data_bytes
|
14 |
+
self.dlen = data_bytes * 8 # data length in bits
|
15 |
+
self.ecc_len = ecc_bytes * 8 # ecc length in bits
|
16 |
+
|
17 |
+
def get_total_len(self):
|
18 |
+
return self.dlen + self.ecc_len
|
19 |
+
|
20 |
+
def encode_text(self, text: List[str]):
|
21 |
+
return np.array([self._encode_text(t) for t in text])
|
22 |
+
|
23 |
+
def _encode_text(self, text: str):
|
24 |
+
text = text + ' ' * (self.dlen // 8 - len(text))
|
25 |
+
out = self.rs.encode(text.encode('utf-8')) # bytearray
|
26 |
+
out = ''.join(format(x, '08b') for x in out) # bit string
|
27 |
+
out = np.array([int(x) for x in out], dtype=np.float32)
|
28 |
+
return out
|
29 |
+
|
30 |
+
def decode_text(self, data: np.array):
|
31 |
+
assert len(data.shape)==2
|
32 |
+
return [self._decode_text(d) for d in data]
|
33 |
+
|
34 |
+
def _decode_text(self, data: np.array):
|
35 |
+
assert len(data.shape)==1
|
36 |
+
data = ''.join([str(int(bit)) for bit in data])
|
37 |
+
data = bytes(int(data[i: i + 8], 2) for i in range(0, len(data), 8))
|
38 |
+
data = bytearray(data)
|
39 |
+
try:
|
40 |
+
data = self.rs.decode(data)[0]
|
41 |
+
data = data.decode('utf-8').strip()
|
42 |
+
except:
|
43 |
+
print('Error: Decode failed')
|
44 |
+
data = get_random_unicode(self.get_total_len()//8)
|
45 |
+
|
46 |
+
return data
|
47 |
+
|
48 |
+
def get_random_unicode(length):
|
49 |
+
# Update this to include code point ranges to be sampled
|
50 |
+
include_ranges = [
|
51 |
+
( 0x0021, 0x0021 ),
|
52 |
+
( 0x0023, 0x0026 ),
|
53 |
+
( 0x0028, 0x007E ),
|
54 |
+
( 0x00A1, 0x00AC ),
|
55 |
+
( 0x00AE, 0x00FF ),
|
56 |
+
( 0x0100, 0x017F ),
|
57 |
+
( 0x0180, 0x024F ),
|
58 |
+
( 0x2C60, 0x2C7F ),
|
59 |
+
( 0x16A0, 0x16F0 ),
|
60 |
+
( 0x0370, 0x0377 ),
|
61 |
+
( 0x037A, 0x037E ),
|
62 |
+
( 0x0384, 0x038A ),
|
63 |
+
( 0x038C, 0x038C ),
|
64 |
+
]
|
65 |
+
alphabet = [
|
66 |
+
chr(code_point) for current_range in include_ranges
|
67 |
+
for code_point in range(current_range[0], current_range[1] + 1)
|
68 |
+
]
|
69 |
+
return ''.join(random.choice(alphabet) for i in range(length))
|
70 |
+
|
71 |
+
|
72 |
+
class BCH(object):
|
73 |
+
def __init__(self, BCH_POLYNOMIAL = 137, BCH_BITS = 5, payload_len=100, verbose=True,**kwargs):
|
74 |
+
self.bch = bchlib.BCH(BCH_POLYNOMIAL, BCH_BITS)
|
75 |
+
self.payload_len = payload_len # in bits
|
76 |
+
self.data_len = (self.payload_len - self.bch.ecc_bytes*8)//7 # in ascii characters
|
77 |
+
assert self.data_len*7+self.bch.ecc_bytes*8 <= self.bch.n, f'Error! BCH with poly {BCH_POLYNOMIAL} and bits {BCH_BITS} can only encode max {self.bch.n//8} bytes of total payload'
|
78 |
+
if verbose:
|
79 |
+
print(f'BCH: POLYNOMIAL={BCH_POLYNOMIAL}, protected bits={BCH_BITS}, payload_len={payload_len} bits, data_len={self.data_len*7} bits ({self.data_len} ascii chars), ecc len={self.bch.ecc_bytes*8} bits')
|
80 |
+
|
81 |
+
def get_total_len(self):
|
82 |
+
return self.payload_len
|
83 |
+
|
84 |
+
def encode_text(self, text: List[str]):
|
85 |
+
return np.array([self._encode_text(t) for t in text])
|
86 |
+
|
87 |
+
def _encode_text(self, text: str):
|
88 |
+
text = text + ' ' * (self.data_len - len(text))
|
89 |
+
# data = text.encode('utf-8') # bytearray
|
90 |
+
data = encode_text_ascii(text) # bytearray
|
91 |
+
ecc = self.bch.encode(data) # bytearray
|
92 |
+
packet = data + ecc # payload in bytearray
|
93 |
+
packet = ''.join(format(x, '08b') for x in packet)
|
94 |
+
packet = [int(x) for x in packet]
|
95 |
+
packet.extend([0]*(self.payload_len - len(packet)))
|
96 |
+
packet = np.array(packet, dtype=np.float32)
|
97 |
+
return packet
|
98 |
+
|
99 |
+
def decode_text(self, data: np.array):
|
100 |
+
assert len(data.shape)==2
|
101 |
+
return [self._decode_text(d) for d in data]
|
102 |
+
|
103 |
+
def _decode_text(self, packet: np.array):
|
104 |
+
assert len(packet.shape)==1
|
105 |
+
packet = ''.join([str(int(bit)) for bit in packet]) # bit string
|
106 |
+
packet = packet[:(len(packet)//8*8)] # trim to multiple of 8 bits
|
107 |
+
packet = bytes(int(packet[i: i + 8], 2) for i in range(0, len(packet), 8))
|
108 |
+
packet = bytearray(packet)
|
109 |
+
# assert len(packet) == self.data_len + self.bch.ecc_bytes
|
110 |
+
data, ecc = packet[:-self.bch.ecc_bytes], packet[-self.bch.ecc_bytes:]
|
111 |
+
data0 = decode_text_ascii(deepcopy(data)).strip()
|
112 |
+
bitflips = self.bch.decode_inplace(data, ecc)
|
113 |
+
if bitflips == -1: # decode failed, fall back to the uncorrected text
|
114 |
+
data = data0
|
115 |
+
else:
|
116 |
+
# data = data.decode('utf-8').strip()
|
117 |
+
data = decode_text_ascii(data).strip()
|
118 |
+
return data
|
119 |
+
|
120 |
+
|
121 |
+
def encode_text_ascii(text: str):
|
122 |
+
# encode text to 7-bit ascii
|
123 |
+
# input: text, str
|
124 |
+
# output: encoded text, bytearray
|
125 |
+
text_int7 = [ord(t) & 127 for t in text]
|
126 |
+
text_bitstr = ''.join(format(t,'07b') for t in text_int7)
|
127 |
+
if len(text_bitstr) % 8 != 0:
|
128 |
+
text_bitstr = '0'*(8-len(text_bitstr)%8) + text_bitstr # pad to multiple of 8
|
129 |
+
text_int8 = [int(text_bitstr[i:i+8], 2) for i in range(0, len(text_bitstr), 8)]
|
130 |
+
return bytearray(text_int8)
|
131 |
+
|
132 |
+
|
133 |
+
def decode_text_ascii(text: bytearray):
|
134 |
+
# decode text from 7-bit ascii
|
135 |
+
# input: text, bytearray
|
136 |
+
# output: decoded text, str
|
137 |
+
text_bitstr = ''.join(format(t,'08b') for t in text) # bit string
|
138 |
+
pad = len(text_bitstr) % 7
|
139 |
+
if pad != 0: # has padding, remove
|
140 |
+
text_bitstr = text_bitstr[pad:]
|
141 |
+
text_int7 = [int(text_bitstr[i:i+7], 2) for i in range(0, len(text_bitstr), 7)]
|
142 |
+
text_bytes = bytes(text_int7)
|
143 |
+
return text_bytes.decode('utf-8')
|
144 |
+
|
145 |
+
|
146 |
+
class ECC(object):
|
147 |
+
def __init__(self, BCH_POLYNOMIAL = 137, BCH_BITS = 5, **kwargs):
|
148 |
+
self.bch = bchlib.BCH(BCH_POLYNOMIAL, BCH_BITS)
|
149 |
+
|
150 |
+
def get_total_len(self):
|
151 |
+
return 100
|
152 |
+
|
153 |
+
def _encode(self, x):
|
154 |
+
# x: 56 bits, {0, 1}, np.array
|
155 |
+
# return: 100 bits, {0, 1}, np.array
|
156 |
+
dlen = len(x)
|
157 |
+
data_str = ''.join(str(x) for x in x.astype(int))
|
158 |
+
packet = bytes(int(data_str[i: i + 8], 2) for i in range(0, dlen, 8))
|
159 |
+
packet = bytearray(packet)
|
160 |
+
ecc = self.bch.encode(packet)
|
161 |
+
packet = packet + ecc # 96 bits
|
162 |
+
packet = ''.join(format(x, '08b') for x in packet)
|
163 |
+
packet = [int(x) for x in packet]
|
164 |
+
packet.extend([0, 0, 0, 0])
|
165 |
+
packet = np.array(packet, dtype=np.float32) # 100
|
166 |
+
return packet
|
167 |
+
|
168 |
+
def _decode(self, x):
|
169 |
+
# x: 100 bits, {0, 1}, np.array
|
170 |
+
# return: 56 bits, {0, 1}, np.array
|
171 |
+
packet_binary = "".join([str(int(bit)) for bit in x])
|
172 |
+
packet = bytes(int(packet_binary[i: i + 8], 2) for i in range(0, len(packet_binary), 8))
|
173 |
+
packet = bytearray(packet)
|
174 |
+
|
175 |
+
data, ecc = packet[:-self.bch.ecc_bytes], packet[-self.bch.ecc_bytes:]
|
176 |
+
bitflips = self.bch.decode_inplace(data, ecc)
|
177 |
+
if bitflips == -1: # error, return random data
|
178 |
+
data = np.random.binomial(1, .5, 56)
|
179 |
+
else:
|
180 |
+
data = ''.join(format(x, '08b') for x in data)
|
181 |
+
data = np.array([int(x) for x in data], dtype=np.float32)
|
182 |
+
return data # 56 bits
|
183 |
+
|
184 |
+
def _generate(self):
|
185 |
+
dlen = 56
|
186 |
+
data= np.random.binomial(1, .5, dlen)
|
187 |
+
packet = self._encode(data)
|
188 |
+
return packet, data
|
189 |
+
|
190 |
+
def generate(self, nsamples=1):
|
191 |
+
# generate random 56 bit secret
|
192 |
+
data = [self._generate() for _ in range(nsamples)]
|
193 |
+
data = (np.array([d[0] for d in data]), np.array([d[1] for d in data]))
|
194 |
+
return data # data with ecc, data org
|
195 |
+
|
196 |
+
def _to_text(self, data):
|
197 |
+
# data: {0, 1}, np.array
|
198 |
+
# return: str
|
199 |
+
data = ''.join([str(int(bit)) for bit in data])
|
200 |
+
all_bytes = [ data[i: i+8] for i in range(0, len(data), 8) ]
|
201 |
+
text = ''.join([chr(int(byte, 2)) for byte in all_bytes])
|
202 |
+
return text.strip()
|
203 |
+
|
204 |
+
def _to_binary(self, s):
|
205 |
+
if isinstance(s, str):
|
206 |
+
out = ''.join([ format(ord(i), "08b") for i in s ])
|
207 |
+
elif isinstance(s, bytes):
|
208 |
+
out = ''.join([ format(i, "08b") for i in s ])
|
209 |
+
elif isinstance(s, np.ndarray) and s.dtype is np.dtype(bool):
|
210 |
+
out = ''.join([chr(int(i)) for i in s])
|
211 |
+
elif isinstance(s, int) or isinstance(s, np.uint8):
|
212 |
+
out = format(s, "08b")
|
213 |
+
elif isinstance(s, np.ndarray):
|
214 |
+
out = [ format(i, "08b") for i in s ]
|
215 |
+
else:
|
216 |
+
raise TypeError("Type not supported.")
|
217 |
+
|
218 |
+
return np.array([float(i) for i in out], dtype=np.float32)
|
219 |
+
|
220 |
+
def _encode_text(self, s):
|
221 |
+
s = s + ' '*(7-len(s)) # 7 chars
|
222 |
+
s = self._to_binary(s) # 56 bits
|
223 |
+
packet = self._encode(s) # 100 bits
|
224 |
+
return packet, s
|
225 |
+
|
226 |
+
def encode_text(self, secret_list, return_pre_ecc=False):
|
227 |
+
"""encode secret with BCH ECC.
|
228 |
+
Input: secret (list of strings)
|
229 |
+
Output: secret (np array) with shape (B, 100) type float32, val {0,1}"""
|
230 |
+
assert np.all(np.array([len(s) for s in secret_list]) <= 7), 'Error! all strings must be at most 7 characters'
|
231 |
+
secret_list = [self._encode_text(s) for s in secret_list]
|
232 |
+
ecc = np.array([s[0] for s in secret_list], dtype=np.float32)
|
233 |
+
if return_pre_ecc:
|
234 |
+
return ecc, np.array([s[1] for s in secret_list], dtype=np.float32)
|
235 |
+
return ecc
|
236 |
+
|
237 |
+
def decode_text(self, data):
|
238 |
+
"""Decode secret with BCH ECC and convert to string.
|
239 |
+
Input: secret (torch.tensor) with shape (B, 100) type bool
|
240 |
+
Output: secret (B, 56)"""
|
241 |
+
data = self.decode(data)
|
242 |
+
data = [self._to_text(d) for d in data]
|
243 |
+
return data
|
244 |
+
|
245 |
+
def decode(self, data):
|
246 |
+
"""Decode secret with BCH ECC and convert to string.
|
247 |
+
Input: secret (torch.tensor) with shape (B, 100) type bool
|
248 |
+
Output: secret (B, 56)"""
|
249 |
+
data = data[:, :96]
|
250 |
+
data = [self._decode(d) for d in data]
|
251 |
+
return np.array(data)
|
252 |
+
|
253 |
+
def test_ecc():
|
254 |
+
ecc = ECC()
|
255 |
+
batch_size = 10
|
256 |
+
secret_ecc, secret_org = ecc.generate(batch_size) # 10x100 ecc secret, 10x56 org secret
|
257 |
+
# modify secret_ecc
|
258 |
+
secret_pred = secret_ecc.copy()
|
259 |
+
secret_pred[:,3:6] = 1 - secret_pred[:,3:6]
|
260 |
+
# pass secret_ecc to model and get predicted as secret_pred
|
261 |
+
secret_pred_org = ecc.decode(secret_pred) # 10x56
|
262 |
+
assert np.all(secret_pred_org == secret_org) # 10
|
263 |
+
|
264 |
+
|
265 |
+
def test_bch():
|
266 |
+
# test 100 bit
|
267 |
+
def check(text, poly, k, l):
|
268 |
+
bch = BCH(poly, k, l)
|
269 |
+
# text = 'secrets'
|
270 |
+
encode = bch.encode_text([text])
|
271 |
+
for ind in np.random.choice(l, k):
|
272 |
+
encode[0, ind] = 1 - encode[0, ind]
|
273 |
+
text_recon = bch.decode_text(encode)[0]
|
274 |
+
assert text==text_recon
|
275 |
+
|
276 |
+
check('secrets', 137, 5, 100)
|
277 |
+
check('some secret', 285, 10, 160)
|
278 |
+
|
279 |
+
if __name__ == '__main__':
|
280 |
+
test_ecc()
|
281 |
+
test_bch()
|
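
A minimal round-trip sketch for the BCH coder defined above (mirrors test_bch; assumes bchlib and numpy are installed; the secret string and flipped bit positions are illustrative):

import numpy as np
from tools.ecc import BCH  # assumed import path

bch = BCH(BCH_POLYNOMIAL=137, BCH_BITS=5, payload_len=100)
payload = bch.encode_text(['secrets'])        # shape (1, 100), values in {0, 1}
payload[0, 3:8] = 1 - payload[0, 3:8]         # simulate 5 corrupted payload bits
assert bch.decode_text(payload)[0] == 'secrets'  # recovered via BCH correction
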
tools/eval_metrics.py
ADDED
@@ -0,0 +1,130 @@
|
1 |
+
import torch
|
2 |
+
import numpy as np
|
3 |
+
import skimage.metrics
|
4 |
+
import lpips
|
5 |
+
from PIL import Image
|
6 |
+
from .sifid import SIFID
|
7 |
+
|
8 |
+
|
9 |
+
def resize_array(x, size=256):
|
10 |
+
"""
|
11 |
+
Resize image array to given size.
|
12 |
+
Args:
|
13 |
+
x (np.ndarray): Image array of shape (N, H, W, C) in range [0, 255].
|
14 |
+
size (int): Size of output image.
|
15 |
+
Returns:
|
16 |
+
(np.ndarray): Image array of shape (N, H, W, C) in range [0, 255].
|
17 |
+
"""
|
18 |
+
if x.shape[1] != size:
|
19 |
+
x = [Image.fromarray(x[i]).resize((size, size), resample=Image.BILINEAR) for i in range(x.shape[0])]
|
20 |
+
x = np.array([np.array(i) for i in x])
|
21 |
+
return x
|
22 |
+
|
23 |
+
|
24 |
+
def resize_tensor(x, size=256):
|
25 |
+
"""
|
26 |
+
Resize image tensor to given size.
|
27 |
+
Args:
|
28 |
+
x (torch.Tensor): Image tensor of shape (N, C, H, W) in range [-1, 1].
|
29 |
+
size (int): Size of output image.
|
30 |
+
Returns:
|
31 |
+
(torch.Tensor): Image tensor of shape (N, C, H, W) in range [-1, 1].
|
32 |
+
"""
|
33 |
+
if x.shape[2] != size:
|
34 |
+
x = torch.nn.functional.interpolate(x, size=(size, size), mode='bilinear', align_corners=False)
|
35 |
+
return x
|
36 |
+
|
37 |
+
|
38 |
+
def normalise(x):
|
39 |
+
"""
|
40 |
+
Normalise image array to range [-1, 1] and tensor.
|
41 |
+
Args:
|
42 |
+
x (np.ndarray): Image array of shape (N, H, W, C) in range [0, 255].
|
43 |
+
Returns:
|
44 |
+
(torch.Tensor): Image tensor of shape (N, C, H, W) in range [-1, 1].
|
45 |
+
"""
|
46 |
+
x = x.astype(np.float32)
|
47 |
+
x = x / 255
|
48 |
+
x = (x - 0.5) / 0.5
|
49 |
+
x = torch.from_numpy(x)
|
50 |
+
x = x.permute(0, 3, 1, 2)
|
51 |
+
return x
|
52 |
+
|
53 |
+
|
54 |
+
def unormalise(x, vrange=[-1, 1]):
|
55 |
+
"""
|
56 |
+
Unormalise image tensor to range [0, 255] and RGB array.
|
57 |
+
Args:
|
58 |
+
x (torch.Tensor): Image tensor of shape (N, C, H, W) in range [-1, 1].
|
59 |
+
Returns:
|
60 |
+
(np.ndarray): Image array of shape (N, H, W, C) in range [0, 255].
|
61 |
+
"""
|
62 |
+
x = (x - vrange[0])/(vrange[1] - vrange[0])
|
63 |
+
x = x * 255
|
64 |
+
x = x.permute(0, 2, 3, 1)
|
65 |
+
x = x.cpu().numpy().astype(np.uint8)
|
66 |
+
return x
|
67 |
+
|
68 |
+
|
69 |
+
def compute_mse(x, y):
|
70 |
+
"""
|
71 |
+
Compute mean squared error between two image arrays.
|
72 |
+
Args:
|
73 |
+
x (np.ndarray): Image of shape (N, H, W, C) in range [0, 255].
|
74 |
+
y (np.ndarray): Image of shape (N, H, W, C) in range [0, 255].
|
75 |
+
Returns:
|
76 |
+
(1darray): Mean squared error.
|
77 |
+
"""
|
78 |
+
return np.square(x - y).reshape(x.shape[0], -1).mean(axis=1)
|
79 |
+
|
80 |
+
|
81 |
+
def compute_psnr(x, y):
|
82 |
+
"""
|
83 |
+
Compute peak signal-to-noise ratio between two images.
|
84 |
+
Args:
|
85 |
+
x (np.ndarray): Image of shape (N, H, W, C) in range [0, 255].
|
86 |
+
y (np.ndarray): Image of shape (N, H, W, C) in range [0, 255].
|
87 |
+
Returns:
|
88 |
+
(float): Peak signal-to-noise ratio.
|
89 |
+
"""
|
90 |
+
return 10 * np.log10(255 ** 2 / compute_mse(x, y))
|
91 |
+
|
92 |
+
|
93 |
+
def compute_ssim(x, y):
|
94 |
+
"""
|
95 |
+
Compute structural similarity index between two images.
|
96 |
+
Args:
|
97 |
+
x (np.ndarray): Image of shape (N, H, W, C) in range [0, 255].
|
98 |
+
y (np.ndarray): Image of shape (N, H, W, C) in range [0, 255].
|
99 |
+
Returns:
|
100 |
+
(float): Structural similarity index.
|
101 |
+
"""
|
102 |
+
return np.array([skimage.metrics.structural_similarity(xi, yi, channel_axis=2, gaussian_weights=True, sigma=1.5, use_sample_covariance=False, data_range=255) for xi, yi in zip(x, y)])
|
103 |
+
|
104 |
+
|
105 |
+
def compute_lpips(x, y, net='alex'):
|
106 |
+
"""
|
107 |
+
Compute LPIPS between two images.
|
108 |
+
Args:
|
109 |
+
x (torch.Tensor): Image tensor of shape (N, C, H, W) in range [-1, 1].
|
110 |
+
y (torch.Tensor): Image tensor of shape (N, C, H, W) in range [-1, 1].
|
111 |
+
Returns:
|
112 |
+
(float): LPIPS.
|
113 |
+
"""
|
114 |
+
lpips_fn = lpips.LPIPS(net=net, verbose=False).cuda() if isinstance(net, str) else net
|
115 |
+
x, y = x.cuda(), y.cuda()
|
116 |
+
return lpips_fn(x, y).detach().cpu().numpy().squeeze()
|
117 |
+
|
118 |
+
|
119 |
+
def compute_sifid(x, y, net=None):
|
120 |
+
"""
|
121 |
+
Compute SIFID between two images.
|
122 |
+
Args:
|
123 |
+
x (torch.Tensor): Image tensor of shape (N, C, H, W) in range [-1, 1].
|
124 |
+
y (torch.Tensor): Image tensor of shape (N, C, H, W) in range [-1, 1].
|
125 |
+
Returns:
|
126 |
+
(float): SIFID.
|
127 |
+
"""
|
128 |
+
fn = SIFID() if net is None else net
|
129 |
+
out = [fn(xi, yi) for xi, yi in zip(x, y)]
|
130 |
+
return np.array(out)
|
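
A short sketch of how the metric helpers above compose (random arrays as placeholders; LPIPS and SIFID additionally require a CUDA device and their model weights):

import numpy as np
from tools.eval_metrics import compute_psnr, compute_ssim  # assumed import path

x = np.random.randint(0, 256, (4, 256, 256, 3)).astype(np.float32)
y = np.random.randint(0, 256, (4, 256, 256, 3)).astype(np.float32)
print(compute_psnr(x, y))                                     # one PSNR per image
print(compute_ssim(x.astype(np.uint8), y.astype(np.uint8)))   # one SSIM per image
# compute_lpips(normalise(x), normalise(y)) follows the same pattern on GPU.
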
tools/fid.py
ADDED
@@ -0,0 +1,672 @@
|
1 |
+
"""Calculates the Frechet Inception Distance (FID) to evalulate GANs
|
2 |
+
|
3 |
+
The FID metric calculates the distance between two distributions of images.
|
4 |
+
Typically, we have summary statistics (mean & covariance matrix) of one
|
5 |
+
of these distributions, while the 2nd distribution is given by a GAN.
|
6 |
+
|
7 |
+
When run as a stand-alone program, it compares the distribution of
|
8 |
+
images that are stored as PNG/JPEG at a specified location with a
|
9 |
+
distribution given by summary statistics (in pickle format).
|
10 |
+
|
11 |
+
The FID is calculated by assuming that X_1 and X_2 are the activations of
|
12 |
+
the pool_3 layer of the inception net for generated samples and real world
|
13 |
+
samples respectively.
|
14 |
+
|
15 |
+
See --help to see further details.
|
16 |
+
|
17 |
+
Code adapted from https://github.com/bioinf-jku/TTUR to use PyTorch instead
|
18 |
+
of Tensorflow
|
19 |
+
|
20 |
+
Copyright 2018 Institute of Bioinformatics, JKU Linz
|
21 |
+
|
22 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
23 |
+
you may not use this file except in compliance with the License.
|
24 |
+
You may obtain a copy of the License at
|
25 |
+
|
26 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
27 |
+
|
28 |
+
Unless required by applicable law or agreed to in writing, software
|
29 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
30 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
31 |
+
See the License for the specific language governing permissions and
|
32 |
+
limitations under the License.
|
33 |
+
"""
|
34 |
+
import os
|
35 |
+
import pathlib
|
36 |
+
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
|
37 |
+
|
38 |
+
import numpy as np
|
39 |
+
import torch
|
40 |
+
import torchvision.transforms as TF
|
41 |
+
from PIL import Image
|
42 |
+
from scipy import linalg
|
43 |
+
from torch.nn.functional import adaptive_avg_pool2d
|
44 |
+
import torch.nn as nn
|
45 |
+
import torch.nn.functional as F
|
46 |
+
import torchvision
|
47 |
+
|
48 |
+
try:
|
49 |
+
from tqdm import tqdm
|
50 |
+
except ImportError:
|
51 |
+
# If tqdm is not available, provide a mock version of it
|
52 |
+
def tqdm(x):
|
53 |
+
return x
|
54 |
+
|
55 |
+
|
56 |
+
IMAGE_EXTENSIONS = {'bmp', 'jpg', 'jpeg', 'pgm', 'png', 'ppm',
|
57 |
+
'tif', 'tiff', 'webp'}
|
58 |
+
|
59 |
+
|
60 |
+
try:
|
61 |
+
from torchvision.models.utils import load_state_dict_from_url
|
62 |
+
except ImportError:
|
63 |
+
from torch.utils.model_zoo import load_url as load_state_dict_from_url
|
64 |
+
|
65 |
+
# Inception weights ported to Pytorch from
|
66 |
+
# http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz
|
67 |
+
FID_WEIGHTS_URL = 'https://github.com/mseitzer/pytorch-fid/releases/download/fid_weights/pt_inception-2015-12-05-6726825d.pth' # noqa: E501
|
68 |
+
|
69 |
+
|
70 |
+
class InceptionV3(nn.Module):
|
71 |
+
"""Pretrained InceptionV3 network returning feature maps"""
|
72 |
+
|
73 |
+
# Index of default block of inception to return,
|
74 |
+
# corresponds to output of final average pooling
|
75 |
+
DEFAULT_BLOCK_INDEX = 3
|
76 |
+
|
77 |
+
# Maps feature dimensionality to their output blocks indices
|
78 |
+
BLOCK_INDEX_BY_DIM = {
|
79 |
+
64: 0, # First max pooling features
|
80 |
+
192: 1,  # Second max pooling features
|
81 |
+
768: 2, # Pre-aux classifier features
|
82 |
+
2048: 3 # Final average pooling features
|
83 |
+
}
|
84 |
+
|
85 |
+
def __init__(self,
|
86 |
+
output_blocks=(DEFAULT_BLOCK_INDEX,),
|
87 |
+
resize_input=True,
|
88 |
+
normalize_input=True,
|
89 |
+
requires_grad=False,
|
90 |
+
use_fid_inception=True):
|
91 |
+
"""Build pretrained InceptionV3
|
92 |
+
|
93 |
+
Parameters
|
94 |
+
----------
|
95 |
+
output_blocks : list of int
|
96 |
+
Indices of blocks to return features of. Possible values are:
|
97 |
+
- 0: corresponds to output of first max pooling
|
98 |
+
- 1: corresponds to output of second max pooling
|
99 |
+
- 2: corresponds to output which is fed to aux classifier
|
100 |
+
- 3: corresponds to output of final average pooling
|
101 |
+
resize_input : bool
|
102 |
+
If true, bilinearly resizes input to width and height 299 before
|
103 |
+
feeding input to model. As the network without fully connected
|
104 |
+
layers is fully convolutional, it should be able to handle inputs
|
105 |
+
of arbitrary size, so resizing might not be strictly needed
|
106 |
+
normalize_input : bool
|
107 |
+
If true, scales the input from range (0, 1) to the range the
|
108 |
+
pretrained Inception network expects, namely (-1, 1)
|
109 |
+
requires_grad : bool
|
110 |
+
If true, parameters of the model require gradients. Possibly useful
|
111 |
+
for finetuning the network
|
112 |
+
use_fid_inception : bool
|
113 |
+
If true, uses the pretrained Inception model used in Tensorflow's
|
114 |
+
FID implementation. If false, uses the pretrained Inception model
|
115 |
+
available in torchvision. The FID Inception model has different
|
116 |
+
weights and a slightly different structure from torchvision's
|
117 |
+
Inception model. If you want to compute FID scores, you are
|
118 |
+
strongly advised to set this parameter to true to get comparable
|
119 |
+
results.
|
120 |
+
"""
|
121 |
+
super(InceptionV3, self).__init__()
|
122 |
+
|
123 |
+
self.resize_input = resize_input
|
124 |
+
self.normalize_input = normalize_input
|
125 |
+
self.output_blocks = sorted(output_blocks)
|
126 |
+
self.last_needed_block = max(output_blocks)
|
127 |
+
|
128 |
+
assert self.last_needed_block <= 3, \
|
129 |
+
'Last possible output block index is 3'
|
130 |
+
|
131 |
+
self.blocks = nn.ModuleList()
|
132 |
+
|
133 |
+
if use_fid_inception:
|
134 |
+
inception = fid_inception_v3()
|
135 |
+
else:
|
136 |
+
inception = _inception_v3(weights='DEFAULT')
|
137 |
+
|
138 |
+
# Block 0: input to maxpool1
|
139 |
+
block0 = [
|
140 |
+
inception.Conv2d_1a_3x3,
|
141 |
+
inception.Conv2d_2a_3x3,
|
142 |
+
inception.Conv2d_2b_3x3,
|
143 |
+
nn.MaxPool2d(kernel_size=3, stride=2)
|
144 |
+
]
|
145 |
+
self.blocks.append(nn.Sequential(*block0))
|
146 |
+
|
147 |
+
# Block 1: maxpool1 to maxpool2
|
148 |
+
if self.last_needed_block >= 1:
|
149 |
+
block1 = [
|
150 |
+
inception.Conv2d_3b_1x1,
|
151 |
+
inception.Conv2d_4a_3x3,
|
152 |
+
nn.MaxPool2d(kernel_size=3, stride=2)
|
153 |
+
]
|
154 |
+
self.blocks.append(nn.Sequential(*block1))
|
155 |
+
|
156 |
+
# Block 2: maxpool2 to aux classifier
|
157 |
+
if self.last_needed_block >= 2:
|
158 |
+
block2 = [
|
159 |
+
inception.Mixed_5b,
|
160 |
+
inception.Mixed_5c,
|
161 |
+
inception.Mixed_5d,
|
162 |
+
inception.Mixed_6a,
|
163 |
+
inception.Mixed_6b,
|
164 |
+
inception.Mixed_6c,
|
165 |
+
inception.Mixed_6d,
|
166 |
+
inception.Mixed_6e,
|
167 |
+
]
|
168 |
+
self.blocks.append(nn.Sequential(*block2))
|
169 |
+
|
170 |
+
# Block 3: aux classifier to final avgpool
|
171 |
+
if self.last_needed_block >= 3:
|
172 |
+
block3 = [
|
173 |
+
inception.Mixed_7a,
|
174 |
+
inception.Mixed_7b,
|
175 |
+
inception.Mixed_7c,
|
176 |
+
nn.AdaptiveAvgPool2d(output_size=(1, 1))
|
177 |
+
]
|
178 |
+
self.blocks.append(nn.Sequential(*block3))
|
179 |
+
|
180 |
+
for param in self.parameters():
|
181 |
+
param.requires_grad = requires_grad
|
182 |
+
|
183 |
+
def forward(self, inp):
|
184 |
+
"""Get Inception feature maps
|
185 |
+
|
186 |
+
Parameters
|
187 |
+
----------
|
188 |
+
inp : torch.autograd.Variable
|
189 |
+
Input tensor of shape Bx3xHxW. Values are expected to be in
|
190 |
+
range (0, 1)
|
191 |
+
|
192 |
+
Returns
|
193 |
+
-------
|
194 |
+
List of torch.autograd.Variable, corresponding to the selected output
|
195 |
+
block, sorted ascending by index
|
196 |
+
"""
|
197 |
+
outp = []
|
198 |
+
x = inp
|
199 |
+
|
200 |
+
if self.resize_input:
|
201 |
+
x = F.interpolate(x,
|
202 |
+
size=(299, 299),
|
203 |
+
mode='bilinear',
|
204 |
+
align_corners=False)
|
205 |
+
|
206 |
+
if self.normalize_input:
|
207 |
+
x = 2 * x - 1 # Scale from range (0, 1) to range (-1, 1)
|
208 |
+
|
209 |
+
for idx, block in enumerate(self.blocks):
|
210 |
+
x = block(x)
|
211 |
+
if idx in self.output_blocks:
|
212 |
+
outp.append(x)
|
213 |
+
|
214 |
+
if idx == self.last_needed_block:
|
215 |
+
break
|
216 |
+
|
217 |
+
return outp
|
218 |
+
|
219 |
+
|
220 |
+
def _inception_v3(*args, **kwargs):
|
221 |
+
"""Wraps `torchvision.models.inception_v3`"""
|
222 |
+
try:
|
223 |
+
version = tuple(map(int, torchvision.__version__.split('.')[:2]))
|
224 |
+
except ValueError:
|
225 |
+
# Just a caution against weird version strings
|
226 |
+
version = (0,)
|
227 |
+
|
228 |
+
# Skips default weight initialization if supported by torchvision
|
229 |
+
# version. See https://github.com/mseitzer/pytorch-fid/issues/28.
|
230 |
+
if version >= (0, 6):
|
231 |
+
kwargs['init_weights'] = False
|
232 |
+
|
233 |
+
# Backwards compatibility: `weights` argument was handled by `pretrained`
|
234 |
+
# argument prior to version 0.13.
|
235 |
+
if version < (0, 13) and 'weights' in kwargs:
|
236 |
+
if kwargs['weights'] == 'DEFAULT':
|
237 |
+
kwargs['pretrained'] = True
|
238 |
+
elif kwargs['weights'] is None:
|
239 |
+
kwargs['pretrained'] = False
|
240 |
+
else:
|
241 |
+
raise ValueError(
|
242 |
+
'weights=={} not supported in torchvision {}'.format(
|
243 |
+
kwargs['weights'], torchvision.__version__
|
244 |
+
)
|
245 |
+
)
|
246 |
+
del kwargs['weights']
|
247 |
+
|
248 |
+
return torchvision.models.inception_v3(*args, **kwargs)
|
249 |
+
|
250 |
+
|
251 |
+
def fid_inception_v3():
|
252 |
+
"""Build pretrained Inception model for FID computation
|
253 |
+
|
254 |
+
The Inception model for FID computation uses a different set of weights
|
255 |
+
and has a slightly different structure than torchvision's Inception.
|
256 |
+
|
257 |
+
This method first constructs torchvision's Inception and then patches the
|
258 |
+
necessary parts that are different in the FID Inception model.
|
259 |
+
"""
|
260 |
+
inception = _inception_v3(num_classes=1008,
|
261 |
+
aux_logits=False,
|
262 |
+
weights=None)
|
263 |
+
inception.Mixed_5b = FIDInceptionA(192, pool_features=32)
|
264 |
+
inception.Mixed_5c = FIDInceptionA(256, pool_features=64)
|
265 |
+
inception.Mixed_5d = FIDInceptionA(288, pool_features=64)
|
266 |
+
inception.Mixed_6b = FIDInceptionC(768, channels_7x7=128)
|
267 |
+
inception.Mixed_6c = FIDInceptionC(768, channels_7x7=160)
|
268 |
+
inception.Mixed_6d = FIDInceptionC(768, channels_7x7=160)
|
269 |
+
inception.Mixed_6e = FIDInceptionC(768, channels_7x7=192)
|
270 |
+
inception.Mixed_7b = FIDInceptionE_1(1280)
|
271 |
+
inception.Mixed_7c = FIDInceptionE_2(2048)
|
272 |
+
|
273 |
+
state_dict = load_state_dict_from_url(FID_WEIGHTS_URL, progress=True)
|
274 |
+
inception.load_state_dict(state_dict)
|
275 |
+
return inception
|
276 |
+
|
277 |
+
|
278 |
+
class FIDInceptionA(torchvision.models.inception.InceptionA):
|
279 |
+
"""InceptionA block patched for FID computation"""
|
280 |
+
def __init__(self, in_channels, pool_features):
|
281 |
+
super(FIDInceptionA, self).__init__(in_channels, pool_features)
|
282 |
+
|
283 |
+
def forward(self, x):
|
284 |
+
branch1x1 = self.branch1x1(x)
|
285 |
+
|
286 |
+
branch5x5 = self.branch5x5_1(x)
|
287 |
+
branch5x5 = self.branch5x5_2(branch5x5)
|
288 |
+
|
289 |
+
branch3x3dbl = self.branch3x3dbl_1(x)
|
290 |
+
branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
|
291 |
+
branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl)
|
292 |
+
|
293 |
+
# Patch: Tensorflow's average pool does not use the padded zeros in
|
294 |
+
# its average calculation
|
295 |
+
branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1,
|
296 |
+
count_include_pad=False)
|
297 |
+
branch_pool = self.branch_pool(branch_pool)
|
298 |
+
|
299 |
+
outputs = [branch1x1, branch5x5, branch3x3dbl, branch_pool]
|
300 |
+
return torch.cat(outputs, 1)
|
301 |
+
|
302 |
+
|
303 |
+
class FIDInceptionC(torchvision.models.inception.InceptionC):
|
304 |
+
"""InceptionC block patched for FID computation"""
|
305 |
+
def __init__(self, in_channels, channels_7x7):
|
306 |
+
super(FIDInceptionC, self).__init__(in_channels, channels_7x7)
|
307 |
+
|
308 |
+
def forward(self, x):
|
309 |
+
branch1x1 = self.branch1x1(x)
|
310 |
+
|
311 |
+
branch7x7 = self.branch7x7_1(x)
|
312 |
+
branch7x7 = self.branch7x7_2(branch7x7)
|
313 |
+
branch7x7 = self.branch7x7_3(branch7x7)
|
314 |
+
|
315 |
+
branch7x7dbl = self.branch7x7dbl_1(x)
|
316 |
+
branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl)
|
317 |
+
branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl)
|
318 |
+
branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl)
|
319 |
+
branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl)
|
320 |
+
|
321 |
+
# Patch: Tensorflow's average pool does not use the padded zeros in
|
322 |
+
# its average calculation
|
323 |
+
branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1,
|
324 |
+
count_include_pad=False)
|
325 |
+
branch_pool = self.branch_pool(branch_pool)
|
326 |
+
|
327 |
+
outputs = [branch1x1, branch7x7, branch7x7dbl, branch_pool]
|
328 |
+
return torch.cat(outputs, 1)
|
329 |
+
|
330 |
+
|
331 |
+
class FIDInceptionE_1(torchvision.models.inception.InceptionE):
|
332 |
+
"""First InceptionE block patched for FID computation"""
|
333 |
+
def __init__(self, in_channels):
|
334 |
+
super(FIDInceptionE_1, self).__init__(in_channels)
|
335 |
+
|
336 |
+
def forward(self, x):
|
337 |
+
branch1x1 = self.branch1x1(x)
|
338 |
+
|
339 |
+
branch3x3 = self.branch3x3_1(x)
|
340 |
+
branch3x3 = [
|
341 |
+
self.branch3x3_2a(branch3x3),
|
342 |
+
self.branch3x3_2b(branch3x3),
|
343 |
+
]
|
344 |
+
branch3x3 = torch.cat(branch3x3, 1)
|
345 |
+
|
346 |
+
branch3x3dbl = self.branch3x3dbl_1(x)
|
347 |
+
branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
|
348 |
+
branch3x3dbl = [
|
349 |
+
self.branch3x3dbl_3a(branch3x3dbl),
|
350 |
+
self.branch3x3dbl_3b(branch3x3dbl),
|
351 |
+
]
|
352 |
+
branch3x3dbl = torch.cat(branch3x3dbl, 1)
|
353 |
+
|
354 |
+
# Patch: Tensorflow's average pool does not use the padded zeros in
|
355 |
+
# its average calculation
|
356 |
+
branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1,
|
357 |
+
count_include_pad=False)
|
358 |
+
branch_pool = self.branch_pool(branch_pool)
|
359 |
+
|
360 |
+
outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool]
|
361 |
+
return torch.cat(outputs, 1)
|
362 |
+
|
363 |
+
|
364 |
+
class FIDInceptionE_2(torchvision.models.inception.InceptionE):
|
365 |
+
"""Second InceptionE block patched for FID computation"""
|
366 |
+
def __init__(self, in_channels):
|
367 |
+
super(FIDInceptionE_2, self).__init__(in_channels)
|
368 |
+
|
369 |
+
def forward(self, x):
|
370 |
+
branch1x1 = self.branch1x1(x)
|
371 |
+
|
372 |
+
branch3x3 = self.branch3x3_1(x)
|
373 |
+
branch3x3 = [
|
374 |
+
self.branch3x3_2a(branch3x3),
|
375 |
+
self.branch3x3_2b(branch3x3),
|
376 |
+
]
|
377 |
+
branch3x3 = torch.cat(branch3x3, 1)
|
378 |
+
|
379 |
+
branch3x3dbl = self.branch3x3dbl_1(x)
|
380 |
+
branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
|
381 |
+
branch3x3dbl = [
|
382 |
+
self.branch3x3dbl_3a(branch3x3dbl),
|
383 |
+
self.branch3x3dbl_3b(branch3x3dbl),
|
384 |
+
]
|
385 |
+
branch3x3dbl = torch.cat(branch3x3dbl, 1)
|
386 |
+
|
387 |
+
# Patch: The FID Inception model uses max pooling instead of average
|
388 |
+
# pooling. This is likely an error in this specific Inception
|
389 |
+
# implementation, as other Inception models use average pooling here
|
390 |
+
# (which matches the description in the paper).
|
391 |
+
branch_pool = F.max_pool2d(x, kernel_size=3, stride=1, padding=1)
|
392 |
+
branch_pool = self.branch_pool(branch_pool)
|
393 |
+
|
394 |
+
outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool]
|
395 |
+
return torch.cat(outputs, 1)
|
396 |
+
|
397 |
+
class ImagePathDataset(torch.utils.data.Dataset):
|
398 |
+
def __init__(self, files, transforms=None):
|
399 |
+
self.files = files
|
400 |
+
self.transforms = transforms
|
401 |
+
|
402 |
+
def __len__(self):
|
403 |
+
return len(self.files)
|
404 |
+
|
405 |
+
def __getitem__(self, i):
|
406 |
+
path = self.files[i]
|
407 |
+
img = Image.open(path).convert('RGB')
|
408 |
+
if self.transforms is not None:
|
409 |
+
img = self.transforms(img)
|
410 |
+
return img
|
411 |
+
|
412 |
+
|
413 |
+
def get_activations(files, model, batch_size=50, dims=2048, device='cpu',
|
414 |
+
num_workers=1, resize=0):
|
415 |
+
"""Calculates the activations of the pool_3 layer for all images.
|
416 |
+
|
417 |
+
Params:
|
418 |
+
-- files : List of image files paths
|
419 |
+
-- model : Instance of inception model
|
420 |
+
-- batch_size : Batch size of images for the model to process at once.
|
421 |
+
Make sure that the number of samples is a multiple of
|
422 |
+
the batch size, otherwise some samples are ignored. This
|
423 |
+
behavior is retained to match the original FID score
|
424 |
+
implementation.
|
425 |
+
-- dims : Dimensionality of features returned by Inception
|
426 |
+
-- device : Device to run calculations
|
427 |
+
-- num_workers : Number of parallel dataloader workers
|
428 |
+
|
429 |
+
Returns:
|
430 |
+
-- A numpy array of dimension (num images, dims) that contains the
|
431 |
+
activations of the given tensor when feeding inception with the
|
432 |
+
query tensor.
|
433 |
+
"""
|
434 |
+
model.eval()
|
435 |
+
|
436 |
+
if batch_size > len(files):
|
437 |
+
print(('Warning: batch size is bigger than the data size. '
|
438 |
+
'Setting batch size to data size'))
|
439 |
+
batch_size = len(files)
|
440 |
+
if resize > 0:
|
441 |
+
tform = TF.Compose([TF.Resize((resize, resize)), TF.ToTensor()])
|
442 |
+
else:
|
443 |
+
tform = TF.ToTensor()
|
444 |
+
dataset = ImagePathDataset(files, transforms=tform)
|
445 |
+
dataloader = torch.utils.data.DataLoader(dataset,
|
446 |
+
batch_size=batch_size,
|
447 |
+
shuffle=False,
|
448 |
+
drop_last=False,
|
449 |
+
num_workers=num_workers)
|
450 |
+
|
451 |
+
pred_arr = np.empty((len(files), dims))
|
452 |
+
|
453 |
+
start_idx = 0
|
454 |
+
|
455 |
+
for batch in tqdm(dataloader):
|
456 |
+
batch = batch.to(device)
|
457 |
+
|
458 |
+
with torch.no_grad():
|
459 |
+
pred = model(batch)[0]
|
460 |
+
|
461 |
+
# If model output is not scalar, apply global spatial average pooling.
|
462 |
+
# This happens if you choose a dimensionality not equal 2048.
|
463 |
+
if pred.size(2) != 1 or pred.size(3) != 1:
|
464 |
+
pred = adaptive_avg_pool2d(pred, output_size=(1, 1))
|
465 |
+
|
466 |
+
pred = pred.squeeze(3).squeeze(2).cpu().numpy()
|
467 |
+
|
468 |
+
pred_arr[start_idx:start_idx + pred.shape[0]] = pred
|
469 |
+
|
470 |
+
start_idx = start_idx + pred.shape[0]
|
471 |
+
|
472 |
+
return pred_arr
|
473 |
+
|
474 |
+
|
475 |
+
def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
|
476 |
+
"""Numpy implementation of the Frechet Distance.
|
477 |
+
The Frechet distance between two multivariate Gaussians X_1 ~ N(mu_1, C_1)
|
478 |
+
and X_2 ~ N(mu_2, C_2) is
|
479 |
+
d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)).
|
480 |
+
|
481 |
+
Stable version by Dougal J. Sutherland.
|
482 |
+
|
483 |
+
Params:
|
484 |
+
-- mu1 : Numpy array containing the activations of a layer of the
|
485 |
+
inception net (like returned by the function 'get_predictions')
|
486 |
+
for generated samples.
|
487 |
+
-- mu2 : The sample mean over activations, precalculated on an
|
488 |
+
representative data set.
|
489 |
+
-- sigma1: The covariance matrix over activations for generated samples.
|
490 |
+
-- sigma2: The covariance matrix over activations, precalculated on an
|
491 |
+
representative data set.
|
492 |
+
|
493 |
+
Returns:
|
494 |
+
-- : The Frechet Distance.
|
495 |
+
"""
|
496 |
+
|
497 |
+
mu1 = np.atleast_1d(mu1)
|
498 |
+
mu2 = np.atleast_1d(mu2)
|
499 |
+
|
500 |
+
sigma1 = np.atleast_2d(sigma1)
|
501 |
+
sigma2 = np.atleast_2d(sigma2)
|
502 |
+
|
503 |
+
assert mu1.shape == mu2.shape, \
|
504 |
+
'Training and test mean vectors have different lengths'
|
505 |
+
assert sigma1.shape == sigma2.shape, \
|
506 |
+
'Training and test covariances have different dimensions'
|
507 |
+
|
508 |
+
diff = mu1 - mu2
|
509 |
+
|
510 |
+
# Product might be almost singular
|
511 |
+
covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
|
512 |
+
if not np.isfinite(covmean).all():
|
513 |
+
msg = ('fid calculation produces singular product; '
|
514 |
+
'adding %s to diagonal of cov estimates') % eps
|
515 |
+
print(msg)
|
516 |
+
offset = np.eye(sigma1.shape[0]) * eps
|
517 |
+
covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))
|
518 |
+
|
519 |
+
# Numerical error might give slight imaginary component
|
520 |
+
if np.iscomplexobj(covmean):
|
521 |
+
if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
|
522 |
+
m = np.max(np.abs(covmean.imag))
|
523 |
+
raise ValueError('Imaginary component {}'.format(m))
|
524 |
+
covmean = covmean.real
|
525 |
+
|
526 |
+
tr_covmean = np.trace(covmean)
|
527 |
+
|
528 |
+
return (diff.dot(diff) + np.trace(sigma1)
|
529 |
+
+ np.trace(sigma2) - 2 * tr_covmean)
|
530 |
+
|
531 |
+
|
532 |
+
def calculate_activation_statistics(files, model, batch_size=50, dims=2048,
|
533 |
+
device='cpu', num_workers=1, resize=0):
|
534 |
+
"""Calculation of the statistics used by the FID.
|
535 |
+
Params:
|
536 |
+
-- files : List of image files paths
|
537 |
+
-- model : Instance of inception model
|
538 |
+
-- batch_size : The images numpy array is split into batches with
|
539 |
+
batch size batch_size. A reasonable batch size
|
540 |
+
depends on the hardware.
|
541 |
+
-- dims : Dimensionality of features returned by Inception
|
542 |
+
-- device : Device to run calculations
|
543 |
+
-- num_workers : Number of parallel dataloader workers
|
544 |
+
|
545 |
+
Returns:
|
546 |
+
-- mu : The mean over samples of the activations of the pool_3 layer of
|
547 |
+
the inception model.
|
548 |
+
-- sigma : The covariance matrix of the activations of the pool_3 layer of
|
549 |
+
the inception model.
|
550 |
+
"""
|
551 |
+
act = get_activations(files, model, batch_size, dims, device, num_workers, resize)
|
552 |
+
mu = np.mean(act, axis=0)
|
553 |
+
sigma = np.cov(act, rowvar=False)
|
554 |
+
return mu, sigma
|
555 |
+
|
556 |
+
|
557 |
+
def compute_statistics_of_path(path, model, batch_size, dims, device,
|
558 |
+
num_workers=1, nimages=None, resize=0):
|
559 |
+
if path.endswith('.npz'):
|
560 |
+
with np.load(path) as f:
|
561 |
+
m, s = f['mu'][:], f['sigma'][:]
|
562 |
+
else:
|
563 |
+
path = pathlib.Path(path)
|
564 |
+
|
565 |
+
files = sorted([file for ext in IMAGE_EXTENSIONS
|
566 |
+
for file in path.glob('**/*.{}'.format(ext))])
|
567 |
+
nfiles = len(files)
|
568 |
+
n = nfiles if nimages is None else min(nimages, nfiles)
|
569 |
+
print(f'Found {nfiles} images. Computing FID with {n} images.')
|
570 |
+
files = files[:n]
|
571 |
+
m, s = calculate_activation_statistics(files, model, batch_size,
|
572 |
+
dims, device, num_workers, resize)
|
573 |
+
|
574 |
+
return m, s
|
575 |
+
|
576 |
+
|
577 |
+
def calculate_fid_given_paths(paths, batch_size, device, dims, num_workers=1, nimages=None, resize=0):
|
578 |
+
"""Calculates the FID of two paths"""
|
579 |
+
for p in paths:
|
580 |
+
if not os.path.exists(p):
|
581 |
+
raise RuntimeError('Invalid path: %s' % p)
|
582 |
+
|
583 |
+
block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims]
|
584 |
+
|
585 |
+
model = InceptionV3([block_idx]).to(device)
|
586 |
+
|
587 |
+
m1, s1 = compute_statistics_of_path(paths[0], model, batch_size,
|
588 |
+
dims, device, num_workers, nimages, resize)
|
589 |
+
m2, s2 = compute_statistics_of_path(paths[1], model, batch_size,
|
590 |
+
dims, device, num_workers, nimages, resize)
|
591 |
+
fid_value = calculate_frechet_distance(m1, s1, m2, s2)
|
592 |
+
|
593 |
+
return fid_value
|
594 |
+
|
595 |
+
|
596 |
+
def save_fid_stats(paths, batch_size, device, dims, num_workers=1, nimages=None, resize=0):
|
597 |
+
"""Calculates the FID of two paths"""
|
598 |
+
if not os.path.exists(paths[0]):
|
599 |
+
raise RuntimeError('Invalid path: %s' % paths[0])
|
600 |
+
|
601 |
+
if os.path.exists(paths[1]):
|
602 |
+
raise RuntimeError('Existing output file: %s' % paths[1])
|
603 |
+
|
604 |
+
block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims]
|
605 |
+
|
606 |
+
model = InceptionV3([block_idx]).to(device)
|
607 |
+
|
608 |
+
print(f"Saving statistics for {paths[0]}")
|
609 |
+
|
610 |
+
m1, s1 = compute_statistics_of_path(paths[0], model, batch_size,
|
611 |
+
dims, device, num_workers, nimages, resize=0)
|
612 |
+
|
613 |
+
np.savez_compressed(paths[1], mu=m1, sigma=s1)
|
614 |
+
|
615 |
+
|
616 |
+
def main():
|
617 |
+
parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
|
618 |
+
parser.add_argument('--batch-size', type=int, default=20,
|
619 |
+
help='Batch size to use')
|
620 |
+
parser.add_argument('--num-workers', type=int,
|
621 |
+
help=('Number of processes to use for data loading. '
|
622 |
+
'Defaults to `min(8, num_cpus)`'))
|
623 |
+
parser.add_argument('--device', type=str, default='cuda:0',
|
624 |
+
help='Device to use. Like cuda, cuda:0 or cpu')
|
625 |
+
parser.add_argument('--dims', type=int, default=2048,
|
626 |
+
choices=list(InceptionV3.BLOCK_INDEX_BY_DIM),
|
627 |
+
help=('Dimensionality of Inception features to use. '
|
628 |
+
'By default, uses pool3 features'))
|
629 |
+
parser.add_argument('--nimages', type=int, default=50000, help='max number of images to use')
|
630 |
+
parser.add_argument('--resize', type=int, default=0, help='resize images to this size, 0 means keep original size')
|
631 |
+
parser.add_argument('--save-stats', action='store_true',
|
632 |
+
help=('Generate an npz archive from a directory of samples. '
|
633 |
+
'The first path is used as input and the second as output.'))
|
634 |
+
parser.add_argument('path', type=str, nargs=2,
|
635 |
+
help=('Paths to the generated images or '
|
636 |
+
'to .npz statistic files'))
|
637 |
+
args = parser.parse_args()
|
638 |
+
|
639 |
+
if args.device is None:
|
640 |
+
device = torch.device('cuda' if (torch.cuda.is_available()) else 'cpu')
|
641 |
+
else:
|
642 |
+
device = torch.device(args.device)
|
643 |
+
|
644 |
+
if args.num_workers is None:
|
645 |
+
try:
|
646 |
+
num_cpus = len(os.sched_getaffinity(0))
|
647 |
+
except AttributeError:
|
648 |
+
# os.sched_getaffinity is not available under Windows, use
|
649 |
+
# os.cpu_count instead (which may not return the *available* number
|
650 |
+
# of CPUs).
|
651 |
+
num_cpus = os.cpu_count()
|
652 |
+
|
653 |
+
num_workers = min(num_cpus, 8) if num_cpus is not None else 0
|
654 |
+
else:
|
655 |
+
num_workers = args.num_workers
|
656 |
+
|
657 |
+
if args.save_stats:
|
658 |
+
save_fid_stats(args.path, args.batch_size, device, args.dims, num_workers, args.nimages, args.resize)
|
659 |
+
return
|
660 |
+
|
661 |
+
fid_value = calculate_fid_given_paths(args.path,
|
662 |
+
args.batch_size,
|
663 |
+
device,
|
664 |
+
args.dims,
|
665 |
+
num_workers,
|
666 |
+
args.nimages,
|
667 |
+
args.resize)
|
668 |
+
print('FID: ', fid_value)
|
669 |
+
|
670 |
+
|
671 |
+
if __name__ == '__main__':
|
672 |
+
main()
|
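
The module above is primarily a command-line script, but it can also be called programmatically; a minimal sketch (the two folder paths are placeholders):

import torch
from tools.fid import calculate_fid_given_paths  # assumed import path

fid = calculate_fid_given_paths(
    ['/path/to/real_images', '/path/to/generated_images'],  # placeholder paths
    batch_size=20, device=torch.device('cpu'), dims=2048)
print('FID:', fid)
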
tools/fid_lmdb.py
ADDED
@@ -0,0 +1,683 @@
|
1 |
+
"""Calculates the Frechet Inception Distance (FID) to evalulate GANs
|
2 |
+
|
3 |
+
The FID metric calculates the distance between two distributions of images.
|
4 |
+
Typically, we have summary statistics (mean & covariance matrix) of one
|
5 |
+
of these distributions, while the 2nd distribution is given by a GAN.
|
6 |
+
|
7 |
+
When run as a stand-alone program, it compares the distribution of
|
8 |
+
images that are stored as PNG/JPEG at a specified location with a
|
9 |
+
distribution given by summary statistics (in pickle format).
|
10 |
+
|
11 |
+
The FID is calculated by assuming that X_1 and X_2 are the activations of
|
12 |
+
the pool_3 layer of the inception net for generated samples and real world
|
13 |
+
samples respectively.
|
14 |
+
|
15 |
+
See --help to see further details.
|
16 |
+
|
17 |
+
Code adapted from https://github.com/bioinf-jku/TTUR to use PyTorch instead
|
18 |
+
of Tensorflow
|
19 |
+
|
20 |
+
Copyright 2018 Institute of Bioinformatics, JKU Linz
|
21 |
+
|
22 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
23 |
+
you may not use this file except in compliance with the License.
|
24 |
+
You may obtain a copy of the License at
|
25 |
+
|
26 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
27 |
+
|
28 |
+
Unless required by applicable law or agreed to in writing, software
|
29 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
30 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
31 |
+
See the License for the specific language governing permissions and
|
32 |
+
limitations under the License.
|
33 |
+
"""
|
34 |
+
import os
|
35 |
+
import pathlib
|
36 |
+
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
|
37 |
+
|
38 |
+
import numpy as np
|
39 |
+
import torch
|
40 |
+
import torchvision.transforms as TF
|
41 |
+
from PIL import Image
|
42 |
+
from scipy import linalg
|
43 |
+
from torch.nn.functional import adaptive_avg_pool2d
|
44 |
+
import torch.nn as nn
|
45 |
+
import torch.nn.functional as F
|
46 |
+
import torchvision
|
47 |
+
import sys
|
48 |
+
sys.path.insert(1, '/mnt/fast/nobackup/users/tb0035/projects/diffsteg/ControlNet')
|
49 |
+
from tools.image_dataset import ImageDataset
|
50 |
+
try:
|
51 |
+
from tqdm import tqdm
|
52 |
+
except ImportError:
|
53 |
+
# If tqdm is not available, provide a mock version of it
|
54 |
+
def tqdm(x):
|
55 |
+
return x
|
56 |
+
|
57 |
+
|
58 |
+
IMAGE_EXTENSIONS = {'bmp', 'jpg', 'jpeg', 'pgm', 'png', 'ppm',
|
59 |
+
'tif', 'tiff', 'webp'}
|
60 |
+
|
61 |
+
|
62 |
+
try:
|
63 |
+
from torchvision.models.utils import load_state_dict_from_url
|
64 |
+
except ImportError:
|
65 |
+
from torch.utils.model_zoo import load_url as load_state_dict_from_url
|
66 |
+
|
67 |
+
# Inception weights ported to Pytorch from
|
68 |
+
# http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz
|
69 |
+
FID_WEIGHTS_URL = 'https://github.com/mseitzer/pytorch-fid/releases/download/fid_weights/pt_inception-2015-12-05-6726825d.pth' # noqa: E501
|
70 |
+
|
71 |
+
|
72 |
+
class InceptionV3(nn.Module):
|
73 |
+
"""Pretrained InceptionV3 network returning feature maps"""
|
74 |
+
|
75 |
+
# Index of default block of inception to return,
|
76 |
+
# corresponds to output of final average pooling
|
77 |
+
DEFAULT_BLOCK_INDEX = 3
|
78 |
+
|
79 |
+
# Maps feature dimensionality to their output blocks indices
|
80 |
+
BLOCK_INDEX_BY_DIM = {
|
81 |
+
64: 0, # First max pooling features
|
82 |
+
192: 1,  # Second max pooling features
|
83 |
+
768: 2, # Pre-aux classifier features
|
84 |
+
2048: 3 # Final average pooling features
|
85 |
+
}
|
86 |
+
|
87 |
+
def __init__(self,
|
88 |
+
output_blocks=(DEFAULT_BLOCK_INDEX,),
|
89 |
+
resize_input=True,
|
90 |
+
normalize_input=True,
|
91 |
+
requires_grad=False,
|
92 |
+
use_fid_inception=True):
|
93 |
+
"""Build pretrained InceptionV3
|
94 |
+
|
95 |
+
Parameters
|
96 |
+
----------
|
97 |
+
output_blocks : list of int
|
98 |
+
Indices of blocks to return features of. Possible values are:
|
99 |
+
- 0: corresponds to output of first max pooling
|
100 |
+
- 1: corresponds to output of second max pooling
|
101 |
+
- 2: corresponds to output which is fed to aux classifier
|
102 |
+
- 3: corresponds to output of final average pooling
|
103 |
+
resize_input : bool
|
104 |
+
If true, bilinearly resizes input to width and height 299 before
|
105 |
+
feeding input to model. As the network without fully connected
|
106 |
+
layers is fully convolutional, it should be able to handle inputs
|
107 |
+
of arbitrary size, so resizing might not be strictly needed
|
108 |
+
normalize_input : bool
|
109 |
+
If true, scales the input from range (0, 1) to the range the
|
110 |
+
pretrained Inception network expects, namely (-1, 1)
|
111 |
+
requires_grad : bool
|
112 |
+
If true, parameters of the model require gradients. Possibly useful
|
113 |
+
for finetuning the network
|
114 |
+
use_fid_inception : bool
|
115 |
+
If true, uses the pretrained Inception model used in Tensorflow's
|
116 |
+
FID implementation. If false, uses the pretrained Inception model
|
117 |
+
available in torchvision. The FID Inception model has different
|
118 |
+
weights and a slightly different structure from torchvision's
|
119 |
+
Inception model. If you want to compute FID scores, you are
|
120 |
+
strongly advised to set this parameter to true to get comparable
|
121 |
+
results.
|
122 |
+
"""
|
123 |
+
super(InceptionV3, self).__init__()
|
124 |
+
|
125 |
+
self.resize_input = resize_input
|
126 |
+
self.normalize_input = normalize_input
|
127 |
+
self.output_blocks = sorted(output_blocks)
|
128 |
+
self.last_needed_block = max(output_blocks)
|
129 |
+
|
130 |
+
assert self.last_needed_block <= 3, \
|
131 |
+
'Last possible output block index is 3'
|
132 |
+
|
133 |
+
self.blocks = nn.ModuleList()
|
134 |
+
|
135 |
+
if use_fid_inception:
|
136 |
+
inception = fid_inception_v3()
|
137 |
+
else:
|
138 |
+
inception = _inception_v3(weights='DEFAULT')
|
139 |
+
|
140 |
+
# Block 0: input to maxpool1
|
141 |
+
block0 = [
|
142 |
+
inception.Conv2d_1a_3x3,
|
143 |
+
inception.Conv2d_2a_3x3,
|
144 |
+
inception.Conv2d_2b_3x3,
|
145 |
+
nn.MaxPool2d(kernel_size=3, stride=2)
|
146 |
+
]
|
147 |
+
self.blocks.append(nn.Sequential(*block0))
|
148 |
+
|
149 |
+
# Block 1: maxpool1 to maxpool2
|
150 |
+
if self.last_needed_block >= 1:
|
151 |
+
block1 = [
|
152 |
+
inception.Conv2d_3b_1x1,
|
153 |
+
inception.Conv2d_4a_3x3,
|
154 |
+
nn.MaxPool2d(kernel_size=3, stride=2)
|
155 |
+
]
|
156 |
+
self.blocks.append(nn.Sequential(*block1))
|
157 |
+
|
158 |
+
# Block 2: maxpool2 to aux classifier
|
159 |
+
if self.last_needed_block >= 2:
|
160 |
+
block2 = [
|
161 |
+
inception.Mixed_5b,
|
162 |
+
inception.Mixed_5c,
|
163 |
+
inception.Mixed_5d,
|
164 |
+
inception.Mixed_6a,
|
165 |
+
inception.Mixed_6b,
|
166 |
+
inception.Mixed_6c,
|
167 |
+
inception.Mixed_6d,
|
168 |
+
inception.Mixed_6e,
|
169 |
+
]
|
170 |
+
self.blocks.append(nn.Sequential(*block2))
|
171 |
+
|
172 |
+
# Block 3: aux classifier to final avgpool
|
173 |
+
if self.last_needed_block >= 3:
|
174 |
+
block3 = [
|
175 |
+
inception.Mixed_7a,
|
176 |
+
inception.Mixed_7b,
|
177 |
+
inception.Mixed_7c,
|
178 |
+
nn.AdaptiveAvgPool2d(output_size=(1, 1))
|
179 |
+
]
|
180 |
+
self.blocks.append(nn.Sequential(*block3))
|
181 |
+
|
182 |
+
for param in self.parameters():
|
183 |
+
param.requires_grad = requires_grad
|
184 |
+
|
185 |
+
def forward(self, inp):
|
186 |
+
"""Get Inception feature maps
|
187 |
+
|
188 |
+
Parameters
|
189 |
+
----------
|
190 |
+
inp : torch.autograd.Variable
|
191 |
+
Input tensor of shape Bx3xHxW. Values are expected to be in
|
192 |
+
range (0, 1)
|
193 |
+
|
194 |
+
Returns
|
195 |
+
-------
|
196 |
+
List of torch.autograd.Variable, corresponding to the selected output
|
197 |
+
block, sorted ascending by index
|
198 |
+
"""
|
199 |
+
outp = []
|
200 |
+
x = inp
|
201 |
+
|
202 |
+
if self.resize_input:
|
203 |
+
x = F.interpolate(x,
|
204 |
+
size=(299, 299),
|
205 |
+
mode='bilinear',
|
206 |
+
align_corners=False)
|
207 |
+
|
208 |
+
if self.normalize_input:
|
209 |
+
x = 2 * x - 1 # Scale from range (0, 1) to range (-1, 1)
|
210 |
+
|
211 |
+
for idx, block in enumerate(self.blocks):
|
212 |
+
x = block(x)
|
213 |
+
if idx in self.output_blocks:
|
214 |
+
outp.append(x)
|
215 |
+
|
216 |
+
if idx == self.last_needed_block:
|
217 |
+
break
|
218 |
+
|
219 |
+
return outp
|
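For illustration only (not part of this commit), a minimal sketch of pulling pool3 features with the wrapper above; the random batch is a placeholder, and constructing the model downloads the FID Inception weights on first use:

import torch

model = InceptionV3([InceptionV3.BLOCK_INDEX_BY_DIM[2048]])
model.eval()
images = torch.rand(4, 3, 299, 299)            # forward() expects values in (0, 1)
with torch.no_grad():
    features = model(images)[0]                # first (and only) selected block: (4, 2048, 1, 1)
print(features.squeeze(-1).squeeze(-1).shape)  # torch.Size([4, 2048])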
220 |
+
|
221 |
+
|
222 |
+
def _inception_v3(*args, **kwargs):
|
223 |
+
"""Wraps `torchvision.models.inception_v3`"""
|
224 |
+
try:
|
225 |
+
version = tuple(map(int, torchvision.__version__.split('.')[:2]))
|
226 |
+
except ValueError:
|
227 |
+
# Just a caution against weird version strings
|
228 |
+
version = (0,)
|
229 |
+
|
230 |
+
# Skips default weight initialization if supported by torchvision
|
231 |
+
# version. See https://github.com/mseitzer/pytorch-fid/issues/28.
|
232 |
+
if version >= (0, 6):
|
233 |
+
kwargs['init_weights'] = False
|
234 |
+
|
235 |
+
# Backwards compatibility: `weights` argument was handled by `pretrained`
|
236 |
+
# argument prior to version 0.13.
|
237 |
+
if version < (0, 13) and 'weights' in kwargs:
|
238 |
+
if kwargs['weights'] == 'DEFAULT':
|
239 |
+
kwargs['pretrained'] = True
|
240 |
+
elif kwargs['weights'] is None:
|
241 |
+
kwargs['pretrained'] = False
|
242 |
+
else:
|
243 |
+
raise ValueError(
|
244 |
+
'weights=={} not supported in torchvision {}'.format(
|
245 |
+
kwargs['weights'], torchvision.__version__
|
246 |
+
)
|
247 |
+
)
|
248 |
+
del kwargs['weights']
|
249 |
+
|
250 |
+
return torchvision.models.inception_v3(*args, **kwargs)
|
251 |
+
|
252 |
+
|
253 |
+
def fid_inception_v3():
|
254 |
+
"""Build pretrained Inception model for FID computation
|
255 |
+
|
256 |
+
The Inception model for FID computation uses a different set of weights
|
257 |
+
and has a slightly different structure than torchvision's Inception.
|
258 |
+
|
259 |
+
This method first constructs torchvision's Inception and then patches the
|
260 |
+
necessary parts that are different in the FID Inception model.
|
261 |
+
"""
|
262 |
+
inception = _inception_v3(num_classes=1008,
|
263 |
+
aux_logits=False,
|
264 |
+
weights=None)
|
265 |
+
inception.Mixed_5b = FIDInceptionA(192, pool_features=32)
|
266 |
+
inception.Mixed_5c = FIDInceptionA(256, pool_features=64)
|
267 |
+
inception.Mixed_5d = FIDInceptionA(288, pool_features=64)
|
268 |
+
inception.Mixed_6b = FIDInceptionC(768, channels_7x7=128)
|
269 |
+
inception.Mixed_6c = FIDInceptionC(768, channels_7x7=160)
|
270 |
+
inception.Mixed_6d = FIDInceptionC(768, channels_7x7=160)
|
271 |
+
inception.Mixed_6e = FIDInceptionC(768, channels_7x7=192)
|
272 |
+
inception.Mixed_7b = FIDInceptionE_1(1280)
|
273 |
+
inception.Mixed_7c = FIDInceptionE_2(2048)
|
274 |
+
|
275 |
+
state_dict = load_state_dict_from_url(FID_WEIGHTS_URL, progress=True)
|
276 |
+
inception.load_state_dict(state_dict)
|
277 |
+
return inception
|
278 |
+
|
279 |
+
|
280 |
+
class FIDInceptionA(torchvision.models.inception.InceptionA):
|
281 |
+
"""InceptionA block patched for FID computation"""
|
282 |
+
def __init__(self, in_channels, pool_features):
|
283 |
+
super(FIDInceptionA, self).__init__(in_channels, pool_features)
|
284 |
+
|
285 |
+
def forward(self, x):
|
286 |
+
branch1x1 = self.branch1x1(x)
|
287 |
+
|
288 |
+
branch5x5 = self.branch5x5_1(x)
|
289 |
+
branch5x5 = self.branch5x5_2(branch5x5)
|
290 |
+
|
291 |
+
branch3x3dbl = self.branch3x3dbl_1(x)
|
292 |
+
branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
|
293 |
+
branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl)
|
294 |
+
|
295 |
+
# Patch: Tensorflow's average pool does not use the padded zeros in
|
296 |
+
# its average calculation
|
297 |
+
branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1,
|
298 |
+
count_include_pad=False)
|
299 |
+
branch_pool = self.branch_pool(branch_pool)
|
300 |
+
|
301 |
+
outputs = [branch1x1, branch5x5, branch3x3dbl, branch_pool]
|
302 |
+
return torch.cat(outputs, 1)
|
303 |
+
|
304 |
+
|
305 |
+
class FIDInceptionC(torchvision.models.inception.InceptionC):
|
306 |
+
"""InceptionC block patched for FID computation"""
|
307 |
+
def __init__(self, in_channels, channels_7x7):
|
308 |
+
super(FIDInceptionC, self).__init__(in_channels, channels_7x7)
|
309 |
+
|
310 |
+
def forward(self, x):
|
311 |
+
branch1x1 = self.branch1x1(x)
|
312 |
+
|
313 |
+
branch7x7 = self.branch7x7_1(x)
|
314 |
+
branch7x7 = self.branch7x7_2(branch7x7)
|
315 |
+
branch7x7 = self.branch7x7_3(branch7x7)
|
316 |
+
|
317 |
+
branch7x7dbl = self.branch7x7dbl_1(x)
|
318 |
+
branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl)
|
319 |
+
branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl)
|
320 |
+
branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl)
|
321 |
+
branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl)
|
322 |
+
|
323 |
+
# Patch: Tensorflow's average pool does not use the padded zeros in
|
324 |
+
# its average calculation
|
325 |
+
branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1,
|
326 |
+
count_include_pad=False)
|
327 |
+
branch_pool = self.branch_pool(branch_pool)
|
328 |
+
|
329 |
+
outputs = [branch1x1, branch7x7, branch7x7dbl, branch_pool]
|
330 |
+
return torch.cat(outputs, 1)
|
331 |
+
|
332 |
+
|
333 |
+
class FIDInceptionE_1(torchvision.models.inception.InceptionE):
|
334 |
+
"""First InceptionE block patched for FID computation"""
|
335 |
+
def __init__(self, in_channels):
|
336 |
+
super(FIDInceptionE_1, self).__init__(in_channels)
|
337 |
+
|
338 |
+
def forward(self, x):
|
339 |
+
branch1x1 = self.branch1x1(x)
|
340 |
+
|
341 |
+
branch3x3 = self.branch3x3_1(x)
|
342 |
+
branch3x3 = [
|
343 |
+
self.branch3x3_2a(branch3x3),
|
344 |
+
self.branch3x3_2b(branch3x3),
|
345 |
+
]
|
346 |
+
branch3x3 = torch.cat(branch3x3, 1)
|
347 |
+
|
348 |
+
branch3x3dbl = self.branch3x3dbl_1(x)
|
349 |
+
branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
|
350 |
+
branch3x3dbl = [
|
351 |
+
self.branch3x3dbl_3a(branch3x3dbl),
|
352 |
+
self.branch3x3dbl_3b(branch3x3dbl),
|
353 |
+
]
|
354 |
+
branch3x3dbl = torch.cat(branch3x3dbl, 1)
|
355 |
+
|
356 |
+
# Patch: Tensorflow's average pool does not use the padded zeros in
|
357 |
+
# its average calculation
|
358 |
+
branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1,
|
359 |
+
count_include_pad=False)
|
360 |
+
branch_pool = self.branch_pool(branch_pool)
|
361 |
+
|
362 |
+
outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool]
|
363 |
+
return torch.cat(outputs, 1)
|
364 |
+
|
365 |
+
|
366 |
+
class FIDInceptionE_2(torchvision.models.inception.InceptionE):
|
367 |
+
"""Second InceptionE block patched for FID computation"""
|
368 |
+
def __init__(self, in_channels):
|
369 |
+
super(FIDInceptionE_2, self).__init__(in_channels)
|
370 |
+
|
371 |
+
def forward(self, x):
|
372 |
+
branch1x1 = self.branch1x1(x)
|
373 |
+
|
374 |
+
branch3x3 = self.branch3x3_1(x)
|
375 |
+
branch3x3 = [
|
376 |
+
self.branch3x3_2a(branch3x3),
|
377 |
+
self.branch3x3_2b(branch3x3),
|
378 |
+
]
|
379 |
+
branch3x3 = torch.cat(branch3x3, 1)
|
380 |
+
|
381 |
+
branch3x3dbl = self.branch3x3dbl_1(x)
|
382 |
+
branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
|
383 |
+
branch3x3dbl = [
|
384 |
+
self.branch3x3dbl_3a(branch3x3dbl),
|
385 |
+
self.branch3x3dbl_3b(branch3x3dbl),
|
386 |
+
]
|
387 |
+
branch3x3dbl = torch.cat(branch3x3dbl, 1)
|
388 |
+
|
389 |
+
# Patch: The FID Inception model uses max pooling instead of average
|
390 |
+
# pooling. This is likely an error in this specific Inception
|
391 |
+
# implementation, as other Inception models use average pooling here
|
392 |
+
# (which matches the description in the paper).
|
393 |
+
branch_pool = F.max_pool2d(x, kernel_size=3, stride=1, padding=1)
|
394 |
+
branch_pool = self.branch_pool(branch_pool)
|
395 |
+
|
396 |
+
outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool]
|
397 |
+
return torch.cat(outputs, 1)
|
398 |
+
|
399 |
+
class ImagePathDataset(torch.utils.data.Dataset):
|
400 |
+
def __init__(self, files, transforms=None):
|
401 |
+
self.files = files
|
402 |
+
self.transforms = transforms
|
403 |
+
|
404 |
+
def __len__(self):
|
405 |
+
return len(self.files)
|
406 |
+
|
407 |
+
def __getitem__(self, i):
|
408 |
+
path = self.files[i]
|
409 |
+
img = Image.open(path).convert('RGB')
|
410 |
+
if self.transforms is not None:
|
411 |
+
img = self.transforms(img)
|
412 |
+
return img
|
413 |
+
|
414 |
+
|
415 |
+
def get_activations(files, model, batch_size=50, dims=2048, device='cpu',
|
416 |
+
num_workers=1, resize=0):
|
417 |
+
"""Calculates the activations of the pool_3 layer for all images.
|
418 |
+
|
419 |
+
Params:
|
420 |
+
-- files : List of image files paths
|
421 |
+
-- model : Instance of inception model
|
422 |
+
-- batch_size : Batch size of images for the model to process at once.
|
423 |
+
Make sure that the number of samples is a multiple of
|
424 |
+
the batch size, otherwise some samples are ignored. This
|
425 |
+
behavior is retained to match the original FID score
|
426 |
+
implementation.
|
427 |
+
-- dims : Dimensionality of features returned by Inception
|
428 |
+
-- device : Device to run calculations
|
429 |
+
-- num_workers : Number of parallel dataloader workers
|
430 |
+
|
431 |
+
Returns:
|
432 |
+
-- A numpy array of dimension (num images, dims) that contains the
|
433 |
+
activations of the given tensor when feeding inception with the
|
434 |
+
query tensor.
|
435 |
+
"""
|
436 |
+
model.eval()
|
437 |
+
|
438 |
+
if batch_size > len(files):
|
439 |
+
print(('Warning: batch size is bigger than the data size. '
|
440 |
+
'Setting batch size to data size'))
|
441 |
+
batch_size = len(files)
|
442 |
+
if resize > 0:
|
443 |
+
tform = TF.Compose([TF.Resize((resize, resize)), TF.ToTensor()])
|
444 |
+
else:
|
445 |
+
tform = TF.ToTensor()
|
446 |
+
if isinstance(files, list):
|
447 |
+
dataset = ImagePathDataset(files, transforms=tform)
|
448 |
+
else:
|
449 |
+
files.set_transform(tform)
|
450 |
+
dataset = files
|
451 |
+
dataloader = torch.utils.data.DataLoader(dataset,
|
452 |
+
batch_size=batch_size,
|
453 |
+
shuffle=False,
|
454 |
+
drop_last=False,
|
455 |
+
num_workers=num_workers)
|
456 |
+
|
457 |
+
pred_arr = np.empty((len(files), dims))
|
458 |
+
|
459 |
+
start_idx = 0
|
460 |
+
|
461 |
+
for batch in tqdm(dataloader):
|
462 |
+
batch = batch['image'].to(device)
|
463 |
+
|
464 |
+
with torch.no_grad():
|
465 |
+
pred = model(batch)[0]
|
466 |
+
|
467 |
+
# If model output is not scalar, apply global spatial average pooling.
|
468 |
+
# This happens if you choose a dimensionality not equal 2048.
|
469 |
+
if pred.size(2) != 1 or pred.size(3) != 1:
|
470 |
+
pred = adaptive_avg_pool2d(pred, output_size=(1, 1))
|
471 |
+
|
472 |
+
pred = pred.squeeze(3).squeeze(2).cpu().numpy()
|
473 |
+
|
474 |
+
pred_arr[start_idx:start_idx + pred.shape[0]] = pred
|
475 |
+
|
476 |
+
start_idx = start_idx + pred.shape[0]
|
477 |
+
|
478 |
+
return pred_arr
|
479 |
+
|
480 |
+
|
481 |
+
def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
|
482 |
+
"""Numpy implementation of the Frechet Distance.
|
483 |
+
The Frechet distance between two multivariate Gaussians X_1 ~ N(mu_1, C_1)
|
484 |
+
and X_2 ~ N(mu_2, C_2) is
|
485 |
+
d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)).
|
486 |
+
|
487 |
+
Stable version by Dougal J. Sutherland.
|
488 |
+
|
489 |
+
Params:
|
490 |
+
-- mu1 : Numpy array containing the activations of a layer of the
|
491 |
+
inception net (like returned by the function 'get_predictions')
|
492 |
+
for generated samples.
|
493 |
+
-- mu2   : The sample mean over activations, precalculated on a
|
494 |
+
representative data set.
|
495 |
+
-- sigma1: The covariance matrix over activations for generated samples.
|
496 |
+
-- sigma2: The covariance matrix over activations, precalculated on a
|
497 |
+
representative data set.
|
498 |
+
|
499 |
+
Returns:
|
500 |
+
-- : The Frechet Distance.
|
501 |
+
"""
|
502 |
+
|
503 |
+
mu1 = np.atleast_1d(mu1)
|
504 |
+
mu2 = np.atleast_1d(mu2)
|
505 |
+
|
506 |
+
sigma1 = np.atleast_2d(sigma1)
|
507 |
+
sigma2 = np.atleast_2d(sigma2)
|
508 |
+
|
509 |
+
assert mu1.shape == mu2.shape, \
|
510 |
+
'Training and test mean vectors have different lengths'
|
511 |
+
assert sigma1.shape == sigma2.shape, \
|
512 |
+
'Training and test covariances have different dimensions'
|
513 |
+
|
514 |
+
diff = mu1 - mu2
|
515 |
+
|
516 |
+
# Product might be almost singular
|
517 |
+
covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
|
518 |
+
if not np.isfinite(covmean).all():
|
519 |
+
msg = ('fid calculation produces singular product; '
|
520 |
+
'adding %s to diagonal of cov estimates') % eps
|
521 |
+
print(msg)
|
522 |
+
offset = np.eye(sigma1.shape[0]) * eps
|
523 |
+
covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))
|
524 |
+
|
525 |
+
# Numerical error might give slight imaginary component
|
526 |
+
if np.iscomplexobj(covmean):
|
527 |
+
if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
|
528 |
+
m = np.max(np.abs(covmean.imag))
|
529 |
+
raise ValueError('Imaginary component {}'.format(m))
|
530 |
+
covmean = covmean.real
|
531 |
+
|
532 |
+
tr_covmean = np.trace(covmean)
|
533 |
+
|
534 |
+
return (diff.dot(diff) + np.trace(sigma1)
|
535 |
+
+ np.trace(sigma2) - 2 * tr_covmean)
|
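A small sanity check of calculate_frechet_distance, for illustration only: identical Gaussians give a distance of roughly 0, and shifting the mean by 1 in each of the 4 dimensions gives roughly 4 (the ||mu_1 - mu_2||^2 term), since the trace term vanishes for equal covariances.

import numpy as np

mu, sigma = np.zeros(4), np.eye(4)
print(calculate_frechet_distance(mu, sigma, mu, sigma))        # ~0.0
print(calculate_frechet_distance(mu, sigma, mu + 1.0, sigma))  # ~4.0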
536 |
+
|
537 |
+
|
538 |
+
def calculate_activation_statistics(files, model, batch_size=50, dims=2048,
|
539 |
+
device='cpu', num_workers=1, resize=0):
|
540 |
+
"""Calculation of the statistics used by the FID.
|
541 |
+
Params:
|
542 |
+
-- files : List of image files paths
|
543 |
+
-- model : Instance of inception model
|
544 |
+
-- batch_size : The images numpy array is split into batches with
|
545 |
+
batch size batch_size. A reasonable batch size
|
546 |
+
depends on the hardware.
|
547 |
+
-- dims : Dimensionality of features returned by Inception
|
548 |
+
-- device : Device to run calculations
|
549 |
+
-- num_workers : Number of parallel dataloader workers
|
550 |
+
|
551 |
+
Returns:
|
552 |
+
-- mu : The mean over samples of the activations of the pool_3 layer of
|
553 |
+
the inception model.
|
554 |
+
-- sigma : The covariance matrix of the activations of the pool_3 layer of
|
555 |
+
the inception model.
|
556 |
+
"""
|
557 |
+
act = get_activations(files, model, batch_size, dims, device, num_workers, resize)
|
558 |
+
mu = np.mean(act, axis=0)
|
559 |
+
sigma = np.cov(act, rowvar=False)
|
560 |
+
return mu, sigma
|
561 |
+
|
562 |
+
|
563 |
+
def compute_statistics_of_path(path, model, batch_size, dims, device,
|
564 |
+
num_workers=1, nimages=None, resize=0):
|
565 |
+
if path.endswith('.npz'):
|
566 |
+
with np.load(path) as f:
|
567 |
+
m, s = f['mu'][:], f['sigma'][:]
|
568 |
+
else:
|
569 |
+
path = pathlib.Path(path)
|
570 |
+
if (path/'data.mdb').exists():
|
571 |
+
files = ImageDataset(path, None)
|
572 |
+
nfiles = len(files)
|
573 |
+
n = nfiles if nimages is None else min(nimages, nfiles)
|
574 |
+
files.set_ids(range(n))
|
575 |
+
else:
|
576 |
+
files = sorted([file for ext in IMAGE_EXTENSIONS
|
577 |
+
for file in path.glob('**/*.{}'.format(ext))])
|
578 |
+
nfiles = len(files)
|
579 |
+
n = nfiles if nimages is None else min(nimages, nfiles)
|
580 |
+
files = files[:n]
|
581 |
+
print(f'Found {nfiles} images. Computing FID with {n} images.')
|
582 |
+
m, s = calculate_activation_statistics(files, model, batch_size,
|
583 |
+
dims, device, num_workers, resize)
|
584 |
+
|
585 |
+
return m, s
|
586 |
+
|
587 |
+
|
588 |
+
def calculate_fid_given_paths(paths, batch_size, device, dims, num_workers=1, nimages=None, resize=0):
|
589 |
+
"""Calculates the FID of two paths"""
|
590 |
+
for p in paths:
|
591 |
+
if not os.path.exists(p):
|
592 |
+
raise RuntimeError('Invalid path: %s' % p)
|
593 |
+
|
594 |
+
block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims]
|
595 |
+
|
596 |
+
model = InceptionV3([block_idx]).to(device)
|
597 |
+
|
598 |
+
m1, s1 = compute_statistics_of_path(paths[0], model, batch_size,
|
599 |
+
dims, device, num_workers, nimages, resize)
|
600 |
+
m2, s2 = compute_statistics_of_path(paths[1], model, batch_size,
|
601 |
+
dims, device, num_workers, nimages, resize)
|
602 |
+
fid_value = calculate_frechet_distance(m1, s1, m2, s2)
|
603 |
+
|
604 |
+
return fid_value
|
605 |
+
|
606 |
+
|
607 |
+
def save_fid_stats(paths, batch_size, device, dims, num_workers=1, nimages=None, resize=0):
|
608 |
+
"""Calculates the FID of two paths"""
|
609 |
+
if not os.path.exists(paths[0]):
|
610 |
+
raise RuntimeError('Invalid path: %s' % paths[0])
|
611 |
+
|
612 |
+
if os.path.exists(paths[1]):
|
613 |
+
raise RuntimeError('Existing output file: %s' % paths[1])
|
614 |
+
|
615 |
+
block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims]
|
616 |
+
|
617 |
+
model = InceptionV3([block_idx]).to(device)
|
618 |
+
|
619 |
+
print(f"Saving statistics for {paths[0]}")
|
620 |
+
|
621 |
+
m1, s1 = compute_statistics_of_path(paths[0], model, batch_size,
|
622 |
+
dims, device, num_workers, nimages, resize)
|
623 |
+
|
624 |
+
np.savez_compressed(paths[1], mu=m1, sigma=s1)
|
625 |
+
|
626 |
+
|
627 |
+
def main():
|
628 |
+
parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
|
629 |
+
parser.add_argument('--batch-size', type=int, default=20,
|
630 |
+
help='Batch size to use')
|
631 |
+
parser.add_argument('--num-workers', type=int,
|
632 |
+
help=('Number of processes to use for data loading. '
|
633 |
+
'Defaults to `min(8, num_cpus)`'))
|
634 |
+
parser.add_argument('--device', type=str, default='cuda:0',
|
635 |
+
help='Device to use. Like cuda, cuda:0 or cpu')
|
636 |
+
parser.add_argument('--dims', type=int, default=2048,
|
637 |
+
choices=list(InceptionV3.BLOCK_INDEX_BY_DIM),
|
638 |
+
help=('Dimensionality of Inception features to use. '
|
639 |
+
'By default, uses pool3 features'))
|
640 |
+
parser.add_argument('--nimages', type=int, default=50000, help='max number of images to use')
|
641 |
+
parser.add_argument('--resize', type=int, default=0, help='resize images to this size, 0 means keep original size')
|
642 |
+
parser.add_argument('--save-stats', action='store_true',
|
643 |
+
help=('Generate an npz archive from a directory of samples. '
|
644 |
+
'The first path is used as input and the second as output.'))
|
645 |
+
parser.add_argument('path', type=str, nargs=2,
|
646 |
+
help=('Paths to the generated images or '
|
647 |
+
'to .npz statistic files'))
|
648 |
+
args = parser.parse_args()
|
649 |
+
|
650 |
+
if args.device is None:
|
651 |
+
device = torch.device('cuda' if (torch.cuda.is_available()) else 'cpu')
|
652 |
+
else:
|
653 |
+
device = torch.device(args.device)
|
654 |
+
|
655 |
+
if args.num_workers is None:
|
656 |
+
try:
|
657 |
+
num_cpus = len(os.sched_getaffinity(0))
|
658 |
+
except AttributeError:
|
659 |
+
# os.sched_getaffinity is not available under Windows, use
|
660 |
+
# os.cpu_count instead (which may not return the *available* number
|
661 |
+
# of CPUs).
|
662 |
+
num_cpus = os.cpu_count()
|
663 |
+
|
664 |
+
num_workers = min(num_cpus, 8) if num_cpus is not None else 0
|
665 |
+
else:
|
666 |
+
num_workers = args.num_workers
|
667 |
+
|
668 |
+
if args.save_stats:
|
669 |
+
save_fid_stats(args.path, args.batch_size, device, args.dims, num_workers, args.nimages, args.resize)
|
670 |
+
return
|
671 |
+
|
672 |
+
fid_value = calculate_fid_given_paths(args.path,
|
673 |
+
args.batch_size,
|
674 |
+
device,
|
675 |
+
args.dims,
|
676 |
+
num_workers,
|
677 |
+
args.nimages,
|
678 |
+
args.resize)
|
679 |
+
print('FID: ', fid_value)
|
680 |
+
|
681 |
+
|
682 |
+
if __name__ == '__main__':
|
683 |
+
main()
|
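For reference, an illustrative invocation of this script; the folder names are placeholders, not paths from this repository:

# FID between two image folders on CPU, using at most 10000 images from each:
#   python tools/fid.py /path/to/real_images /path/to/generated_images --device cpu --batch-size 20 --nimages 10000
# Precompute statistics of one folder and save them to an .npz file:
#   python tools/fid.py /path/to/real_images real_stats.npz --save-stats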
tools/gradcam.py
ADDED
@@ -0,0 +1,152 @@
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
"""
|
4 |
+
gradcam visualisation for each GAN class
|
5 |
+
@author: Tu Bui @surrey.ac.uk
|
6 |
+
"""
|
7 |
+
from __future__ import absolute_import
|
8 |
+
from __future__ import division
|
9 |
+
from __future__ import print_function
|
10 |
+
import os
|
11 |
+
import sys
|
12 |
+
import inspect
|
13 |
+
import argparse
|
14 |
+
import torch
|
15 |
+
import numpy as np
|
16 |
+
import matplotlib
|
17 |
+
matplotlib.use('Agg')
|
18 |
+
import matplotlib.pyplot as plt
|
19 |
+
import cv2
|
20 |
+
from PIL import Image, ImageDraw, ImageFont
|
21 |
+
import torch
|
22 |
+
import torchvision
|
23 |
+
from torch.autograd import Function
|
24 |
+
import torch.nn.functional as F
|
25 |
+
|
26 |
+
|
27 |
+
def show_cam_on_image(img, cam, cmap='jet'):
|
28 |
+
"""
|
29 |
+
Args:
|
30 |
+
img PIL image (H,W,3)
|
31 |
+
cam heatmap (H, W), range [0,1]
|
32 |
+
Returns:
|
33 |
+
PIL image with heatmap applied.
|
34 |
+
"""
|
35 |
+
cm = plt.get_cmap(cmap)
|
36 |
+
cam = cm(cam)[...,:3] # RGB [0,1]
|
37 |
+
cam = np.array(img, dtype=np.float32)/255. + cam
|
38 |
+
cam /= cam.max()
|
39 |
+
cam = np.uint8(cam*255)
|
40 |
+
return Image.fromarray(cam)
|
41 |
+
|
42 |
+
|
43 |
+
class HookedModel(object):
|
44 |
+
def __init__(self, model, feature_layer_name):
|
45 |
+
self.model = model
|
46 |
+
self.feature_trees = feature_layer_name.split('.')
|
47 |
+
|
48 |
+
def __call__(self, x):
|
49 |
+
x = feedforward(x, self.model, self.feature_trees)
|
50 |
+
return x
|
51 |
+
|
52 |
+
|
53 |
+
def feedforward(x, module, layer_names):
|
54 |
+
for name, submodule in module._modules.items():
|
55 |
+
# print(f'Forwarding {name} ...')
|
56 |
+
if name == layer_names[0]:
|
57 |
+
if len(layer_names) == 1: # leaf node reached
|
58 |
+
# print(f' Hook {name}')
|
59 |
+
x = submodule(x)
|
60 |
+
x.register_hook(save_gradients)
|
61 |
+
save_features(x)
|
62 |
+
else:
|
63 |
+
# print(f' Stepping into {name}:')
|
64 |
+
x = feedforward(x, submodule, layer_names[1:])
|
65 |
+
else:
|
66 |
+
x = submodule(x)
|
67 |
+
if name == 'avgpool': # specific for resnet50
|
68 |
+
x = x.view(x.size(0), -1)
|
69 |
+
return x
|
70 |
+
|
71 |
+
|
72 |
+
basket = dict(grads=[], feature_maps=[]) # global variable to hold the gradients and output features of the layers of interest
|
73 |
+
|
74 |
+
def empty_basket():
|
75 |
+
basket['grads'].clear()  # reset the global basket in place; rebinding 'basket' here would only create a local variable
basket['feature_maps'].clear()
|
76 |
+
|
77 |
+
def save_gradients(grad):
|
78 |
+
basket['grads'].append(grad)
|
79 |
+
|
80 |
+
def save_features(feat):
|
81 |
+
basket['feature_maps'].append(feat)
|
82 |
+
|
83 |
+
|
84 |
+
class GradCam(object):
|
85 |
+
def __init__(self, model, feature_layer_name, use_cuda=True):
|
86 |
+
self.model = model
|
87 |
+
self.hooked_model = HookedModel(model, feature_layer_name)
|
88 |
+
self.cuda = use_cuda
|
89 |
+
if self.cuda:
|
90 |
+
self.model = model.cuda()
|
91 |
+
self.model.eval()
|
92 |
+
|
93 |
+
def __call__(self, x, target, act=None):
|
94 |
+
empty_basket()
|
95 |
+
target = torch.as_tensor(target, dtype=torch.float)
|
96 |
+
if self.cuda:
|
97 |
+
x = x.cuda()
|
98 |
+
target = target.cuda()
|
99 |
+
z = self.hooked_model(x)
|
100 |
+
if act is not None:
|
101 |
+
z = act(z)
|
102 |
+
criteria = F.cosine_similarity(z, target)
|
103 |
+
self.model.zero_grad()
|
104 |
+
criteria.backward(retain_graph=True)
|
105 |
+
gradients = [grad.cpu().data.numpy() for grad in basket['grads'][::-1]] # gradients appear in reversed order
|
106 |
+
feature_maps = [feat.cpu().data.numpy() for feat in basket['feature_maps']]
|
107 |
+
cams = []
|
108 |
+
for feat, grad in zip(feature_maps, gradients):
|
109 |
+
# feat and grad have shape (1, C, H, W)
|
110 |
+
weight = np.mean(grad, axis=(2,3), keepdims=True)[0] # (C,1,1)
|
111 |
+
cam = np.sum(weight * feat[0], axis=0)  # (H,W)
|
112 |
+
cam = cv2.resize(cam, x.shape[2:])
|
113 |
+
cam = cam - np.min(cam)
|
114 |
+
cam = cam / (np.max(cam) + np.finfo(np.float32).eps)
|
115 |
+
cams.append(cam)
|
116 |
+
cams = np.array(cams).mean(axis=0) # (H,W)
|
117 |
+
return cams
|
118 |
+
|
119 |
+
|
120 |
+
def gradcam_demo():
|
121 |
+
from torchvision import transforms
|
122 |
+
model = torchvision.models.resnet50(pretrained=True)
|
123 |
+
model.eval()
|
124 |
+
gradcam = GradCam(model, 'layer4.2', True)
|
125 |
+
tform = [
|
126 |
+
transforms.Resize((224, 224)),
|
127 |
+
# transforms.CenterCrop(224),
|
128 |
+
transforms.ToTensor(),
|
129 |
+
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
|
130 |
+
]
|
131 |
+
preprocess = transforms.Compose(tform)
|
132 |
+
im0 = Image.open('/mnt/fast/nobackup/users/tb0035/projects/diffsteg/ControlNet/examples/catdog.jpg').convert('RGB')
|
133 |
+
im = preprocess(im0).unsqueeze(0)
|
134 |
+
target = np.zeros((1,1000), dtype=np.float32)
|
135 |
+
target[0, 285] = 1 # cat
|
136 |
+
cam = gradcam(im, target)
|
137 |
+
|
138 |
+
im0 = tform[0](im0)
|
139 |
+
out = show_cam_on_image(im0, cam)
|
140 |
+
out.save('test.jpg')
|
141 |
+
print('done')
|
142 |
+
|
143 |
+
|
144 |
+
def make_target_vector(nclass, target_class_id):
|
145 |
+
out = np.zeros((1, nclass), dtype=np.float32)
|
146 |
+
out[0, target_class_id] = 1
|
147 |
+
return out
|
148 |
+
|
149 |
+
|
150 |
+
|
151 |
+
if __name__ == '__main__':
|
152 |
+
gradcam_demo()
|
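An illustrative sketch (not part of this commit) of reusing the GradCam class above for another ImageNet class; 'dog.jpg' is a placeholder path and index 243 ('bull mastiff') is assumed to be the class of interest. It mirrors gradcam_demo() but targets a different class:

from torchvision import transforms

model = torchvision.models.resnet50(pretrained=True)
model.eval()
gradcam = GradCam(model, 'layer4.2', use_cuda=False)
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
img = Image.open('dog.jpg').convert('RGB')
cam = gradcam(preprocess(img).unsqueeze(0), make_target_vector(1000, 243))
show_cam_on_image(img.resize((224, 224)), cam).save('cam_dog.jpg')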
tools/helpers.py
ADDED
@@ -0,0 +1,416 @@
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""
|
3 |
+
Created on Tue Jul 12 11:05:57 2016
|
4 |
+
some helper functions to perform basic tasks
|
5 |
+
@author: tb00083
|
6 |
+
"""
|
7 |
+
import os
|
8 |
+
import sys
|
9 |
+
import csv
|
10 |
+
import socket
|
11 |
+
import numpy as np
|
12 |
+
import json
|
13 |
+
import pickle # python3.x
|
14 |
+
import time
|
15 |
+
from datetime import timedelta, datetime
|
16 |
+
from typing import Any, List, Tuple, Union
|
17 |
+
import subprocess
|
18 |
+
import struct
|
19 |
+
import errno
|
20 |
+
from pprint import pprint
|
21 |
+
import glob
|
22 |
+
from threading import Thread
|
23 |
+
|
24 |
+
|
25 |
+
def welcome_message():
|
26 |
+
"""
|
27 |
+
get welcome message including hostname and command line arguments
|
28 |
+
"""
|
29 |
+
hostname = socket.gethostname()
|
30 |
+
all_args = ' '.join(sys.argv)
|
31 |
+
out_text = 'On server {}: {}\n'.format(hostname, all_args)
|
32 |
+
return out_text
|
33 |
+
|
34 |
+
|
35 |
+
class EasyDict(dict):
|
36 |
+
"""Convenience class that behaves like a dict but allows access with the attribute syntax."""
|
37 |
+
def __init__(self, dict_to_convert=None):
|
38 |
+
if dict_to_convert is not None:
|
39 |
+
for key, val in dict_to_convert.items():
|
40 |
+
self[key] = val
|
41 |
+
|
42 |
+
def __getattr__(self, name: str) -> Any:
|
43 |
+
try:
|
44 |
+
return self[name]
|
45 |
+
except KeyError:
|
46 |
+
raise AttributeError(name)
|
47 |
+
|
48 |
+
def __setattr__(self, name: str, value: Any) -> None:
|
49 |
+
self[name] = value
|
50 |
+
|
51 |
+
def __delattr__(self, name: str) -> None:
|
52 |
+
del self[name]
|
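A tiny illustration (not part of this commit) of mixing the two access styles EasyDict supports:

cfg = EasyDict({'lr': 1e-4})
cfg.batch_size = 32               # attribute-style write
print(cfg['lr'], cfg.batch_size)  # dict-style and attribute-style reads both work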
53 |
+
|
54 |
+
|
55 |
+
def get_time_id_str():
|
56 |
+
"""
|
57 |
+
returns a string in DDHHM format, where M is the minute truncated to its tens digit
|
58 |
+
"""
|
59 |
+
now = datetime.now()
|
60 |
+
time_str = "{:02d}{:02d}{:02d}".format(now.day, now.hour, now.minute)
|
61 |
+
time_str = time_str[:-1]
|
62 |
+
return time_str
|
63 |
+
|
64 |
+
|
65 |
+
def time_format(t):
|
66 |
+
m, s = divmod(t, 60)
|
67 |
+
h, m = divmod(m, 60)
|
68 |
+
m, h, s = int(m), int(h), int(s)
|
69 |
+
|
70 |
+
if m == 0 and h == 0:
|
71 |
+
return "{}s".format(s)
|
72 |
+
elif h == 0:
|
73 |
+
return "{}m{}s".format(m, s)
|
74 |
+
else:
|
75 |
+
return "{}h{}m{}s".format(h, m, s)
|
76 |
+
|
77 |
+
|
78 |
+
def get_all_files(dir_path, trim=0, extension=''):
|
79 |
+
"""
|
80 |
+
Recursively get list of all files in the given directory
|
81 |
+
trim = 1 : trim the dir_path from results, 0 otherwise
|
82 |
+
extension: get files with specific format
|
83 |
+
"""
|
84 |
+
file_paths = [] # List which will store all of the full filepaths.
|
85 |
+
|
86 |
+
# Walk the tree.
|
87 |
+
for root, directories, files in os.walk(dir_path):
|
88 |
+
for filename in files:
|
89 |
+
# Join the two strings in order to form the full filepath.
|
90 |
+
filepath = os.path.join(root, filename)
|
91 |
+
file_paths.append(filepath) # Add it to the list.
|
92 |
+
|
93 |
+
if trim == 1: # trim dir_path from results
|
94 |
+
if dir_path[-1] != os.sep:
|
95 |
+
dir_path += os.sep
|
96 |
+
trim_len = len(dir_path)
|
97 |
+
file_paths = [x[trim_len:] for x in file_paths]
|
98 |
+
|
99 |
+
if extension: # select only file with specific extension
|
100 |
+
extension = extension.lower()
|
101 |
+
tlen = len(extension)
|
102 |
+
file_paths = [x for x in file_paths if x[-tlen:] == extension]
|
103 |
+
|
104 |
+
return file_paths # Self-explanatory.
|
105 |
+
|
106 |
+
|
107 |
+
def get_all_dirs(dir_path, trim=0):
|
108 |
+
"""
|
109 |
+
Recursively get list of all directories in the given directory
|
110 |
+
excluding the '.' and '..' directories
|
111 |
+
trim = 1 : trim the dir_path from results, 0 otherwise
|
112 |
+
"""
|
113 |
+
out = []
|
114 |
+
# Walk the tree.
|
115 |
+
for root, directories, files in os.walk(dir_path):
|
116 |
+
for dirname in directories:
|
117 |
+
# Join the two strings in order to form the full filepath.
|
118 |
+
dir_full = os.path.join(root, dirname)
|
119 |
+
out.append(dir_full) # Add it to the list.
|
120 |
+
|
121 |
+
if trim == 1: # trim dir_path from results
|
122 |
+
if dir_path[-1] != os.sep:
|
123 |
+
dir_path += os.sep
|
124 |
+
trim_len = len(dir_path)
|
125 |
+
out = [x[trim_len:] for x in out]
|
126 |
+
|
127 |
+
return out
|
128 |
+
|
129 |
+
|
130 |
+
def read_list(file_path, delimeter=' ', keep_original=True):
|
131 |
+
"""
|
132 |
+
read list column wise
|
133 |
+
deprecated, should use pandas instead
|
134 |
+
"""
|
135 |
+
out = []
|
136 |
+
with open(file_path, 'r') as f:
|
137 |
+
reader = csv.reader(f, delimiter=delimeter)
|
138 |
+
for row in reader:
|
139 |
+
out.append(row)
|
140 |
+
out = list(zip(*out))  # materialise the zip object so it can be indexed below (Python 3)
|
141 |
+
|
142 |
+
if not keep_original:
|
143 |
+
for col in range(len(out)):
|
144 |
+
if out[col][0].isdigit(): # attempt to convert to numerical array
|
145 |
+
out[col] = np.array(out[col]).astype(np.int64)
|
146 |
+
|
147 |
+
return out
|
148 |
+
|
149 |
+
|
150 |
+
def save_pickle2(file_path, **kwargs):
|
151 |
+
"""
|
152 |
+
save variables to file (using pickle)
|
153 |
+
"""
|
154 |
+
# check if any variable is a dict
|
155 |
+
var_count = 0
|
156 |
+
for key in kwargs:
|
157 |
+
var_count += 1
|
158 |
+
if isinstance(kwargs[key], dict):
|
159 |
+
sys.stderr.write('Oops! Cannot write a dictionary into pickle')
|
160 |
+
sys.exit(1)
|
161 |
+
with open(file_path, 'wb') as f:
|
162 |
+
pickler = pickle.Pickler(f, -1)
|
163 |
+
pickler.dump(var_count)
|
164 |
+
for key in kwargs:
|
165 |
+
pickler.dump(key)
|
166 |
+
pickler.dump(kwargs[key])
|
167 |
+
|
168 |
+
|
169 |
+
def load_pickle2(file_path, varnum=0):
|
170 |
+
"""
|
171 |
+
load variables that previously saved using self.save()
|
172 |
+
varnum : number of variables u want to load (0 mean it will load all)
|
173 |
+
Note: if you are loading class instance(s), you must have it defined in advance
|
174 |
+
"""
|
175 |
+
with open(file_path, 'rb') as f:
|
176 |
+
pickler = pickle.Unpickler(f)
|
177 |
+
var_count = pickler.load()
|
178 |
+
if varnum:
|
179 |
+
var_count = min([var_count, varnum])
|
180 |
+
out = {}
|
181 |
+
for i in range(var_count):
|
182 |
+
key = pickler.load()
|
183 |
+
out[key] = pickler.load()
|
184 |
+
|
185 |
+
return out
|
186 |
+
|
187 |
+
|
188 |
+
def save_pickle(path, obj):
|
189 |
+
"""
|
190 |
+
simple method to save a picklable object
|
191 |
+
:param path: path to save
|
192 |
+
:param obj: a picklable object
|
193 |
+
:return: None
|
194 |
+
"""
|
195 |
+
with open(path, 'wb') as f:
|
196 |
+
pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
|
197 |
+
|
198 |
+
|
199 |
+
def load_pickle(path):
|
200 |
+
"""
|
201 |
+
load a pickled object
|
202 |
+
:param path: .pkl path
|
203 |
+
:return: the pickled object
|
204 |
+
"""
|
205 |
+
with open(path, 'rb') as f:
|
206 |
+
return pickle.load(f)
|
207 |
+
|
208 |
+
|
209 |
+
def make_new_dir(dir_path, remove_existing=False, mode=511):
|
210 |
+
"""note: default mode in ubuntu is 511"""
|
211 |
+
if not os.path.exists(dir_path):
|
212 |
+
try:
|
213 |
+
if mode == 777:
|
214 |
+
oldmask = os.umask(000)
|
215 |
+
os.makedirs(dir_path, 0o777)
|
216 |
+
os.umask(oldmask)
|
217 |
+
else:
|
218 |
+
os.makedirs(dir_path, mode)
|
219 |
+
except OSError as exc: # Python >2.5
|
220 |
+
if exc.errno == errno.EEXIST and os.path.isdir(dir_path):
|
221 |
+
pass
|
222 |
+
else:
|
223 |
+
raise
|
224 |
+
if remove_existing:
|
225 |
+
for file_obj in os.listdir(dir_path):
|
226 |
+
file_path = os.path.join(dir_path, file_obj)
|
227 |
+
if os.path.isfile(file_path):
|
228 |
+
os.unlink(file_path)
|
229 |
+
|
230 |
+
|
231 |
+
def get_latest_file(root, pattern):
|
232 |
+
"""
|
233 |
+
get the latest file in a directory that match the provided pattern
|
234 |
+
useful for getting the last checkpoint
|
235 |
+
:param root: search directory
|
236 |
+
:param pattern: search pattern containing 1 wild card representing a number e.g. 'ckpt_*.tar'
|
237 |
+
:return: full path of the file with largest number in wild card, None if not found
|
238 |
+
"""
|
239 |
+
out = None
|
240 |
+
parts = pattern.split('*')
|
241 |
+
max_id = - np.inf
|
242 |
+
for path in glob.glob(os.path.join(root, pattern)):
|
243 |
+
id_ = os.path.basename(path)
|
244 |
+
for part in parts:
|
245 |
+
id_ = id_.replace(part, '')
|
246 |
+
try:
|
247 |
+
id_ = int(id_)
|
248 |
+
if id_ > max_id:
|
249 |
+
max_id = id_
|
250 |
+
out = path
|
251 |
+
except:
|
252 |
+
continue
|
253 |
+
return out
|
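An illustrative call (the directory and pattern are placeholders): with files named ckpt_10.tar, ckpt_25.tar, ... it returns the path whose embedded number is largest, or None if nothing matches:

last_ckpt = get_latest_file('runs/exp1', 'ckpt_*.tar')
if last_ckpt is None:
    print('no checkpoint found, starting from scratch')
else:
    print('resuming from', last_ckpt)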
254 |
+
|
255 |
+
|
256 |
+
class Locker(object):
|
257 |
+
"""place a lock file in specified location
|
258 |
+
useful for distributed computing"""
|
259 |
+
|
260 |
+
def __init__(self, name='lock.txt', mode=511):
|
261 |
+
"""INPUT: name default file name to be created as a lock
|
262 |
+
mode if a directory has to be created, set its permission to mode"""
|
263 |
+
self.name = name
|
264 |
+
self.mode = mode
|
265 |
+
|
266 |
+
def lock(self, path):
|
267 |
+
make_new_dir(path, False, self.mode)
|
268 |
+
with open(os.path.join(path, self.name), 'w') as f:
|
269 |
+
f.write('progress')
|
270 |
+
|
271 |
+
def finish(self, path):
|
272 |
+
make_new_dir(path, False, self.mode)
|
273 |
+
with open(os.path.join(path, self.name), 'w') as f:
|
274 |
+
f.write('finish')
|
275 |
+
|
276 |
+
def customise(self, path, text):
|
277 |
+
make_new_dir(path, False, self.mode)
|
278 |
+
with open(os.path.join(path, self.name), 'w') as f:
|
279 |
+
f.write(text)
|
280 |
+
|
281 |
+
def is_locked(self, path):
|
282 |
+
out = False
|
283 |
+
check_path = os.path.join(path, self.name)
|
284 |
+
if os.path.exists(check_path):
|
285 |
+
text = open(check_path, 'r').readline().strip()
|
286 |
+
out = True if text == 'progress' else False
|
287 |
+
return out
|
288 |
+
|
289 |
+
def is_finished(self, path):
|
290 |
+
out = False
|
291 |
+
check_path = os.path.join(path, self.name)
|
292 |
+
if os.path.exists(check_path):
|
293 |
+
text = open(check_path, 'r').readline().strip()
|
294 |
+
out = True if text == 'finish' else False
|
295 |
+
return out
|
296 |
+
|
297 |
+
def is_locked_or_finished(self, path):
|
298 |
+
return self.is_locked(path) | self.is_finished(path)
|
299 |
+
|
300 |
+
def clean(self, path):
|
301 |
+
check_path = os.path.join(path, self.name)
|
302 |
+
if os.path.exists(check_path):
|
303 |
+
try:
|
304 |
+
os.remove(check_path)
|
305 |
+
except Exception as e:
|
306 |
+
print('Unable to remove %s: %s.' % (check_path, e))
|
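A minimal usage sketch of Locker for claiming a unit of work across workers (illustrative; 'results/job_003' is a placeholder directory):

locker = Locker()
if not locker.is_locked_or_finished('results/job_003'):
    locker.lock('results/job_003')      # writes 'progress' so other workers skip this job
    # ... do the work ...
    locker.finish('results/job_003')    # writes 'finish' to mark the job as done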
307 |
+
|
308 |
+
|
309 |
+
class ProgressBar(object):
|
310 |
+
"""show progress"""
|
311 |
+
|
312 |
+
def __init__(self, total, increment=5):
|
313 |
+
self.total = total
|
314 |
+
self.point = self.total / 100.0
|
315 |
+
self.increment = increment
|
316 |
+
self.interval = int(self.total * self.increment / 100)
|
317 |
+
self.milestones = list(range(0, total, self.interval)) + [self.total, ]
|
318 |
+
self.id = 0
|
319 |
+
|
320 |
+
def show_progress(self, i):
|
321 |
+
if i >= self.milestones[self.id]:
|
322 |
+
while i >= self.milestones[self.id]:
|
323 |
+
self.id += 1
|
324 |
+
sys.stdout.write("\r[" + "=" * int(i / self.interval) +
|
325 |
+
" " * int((self.total - i) / self.interval) + "]" + str(int((i + 1) / self.point)) + "%")
|
326 |
+
sys.stdout.flush()
|
327 |
+
|
328 |
+
|
329 |
+
class Timer(object):
|
330 |
+
|
331 |
+
def __init__(self):
|
332 |
+
self.start_t = time.time()
|
333 |
+
self.last_t = self.start_t
|
334 |
+
|
335 |
+
def time(self, lap=False):
|
336 |
+
end_t = time.time()
|
337 |
+
if lap:
|
338 |
+
out = timedelta(seconds=int(end_t - self.last_t)) # count from last stop point
|
339 |
+
else:
|
340 |
+
out = timedelta(seconds=int(end_t - self.start_t)) # count from beginning
|
341 |
+
self.last_t = end_t
|
342 |
+
return out
|
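A minimal usage sketch of Timer (illustrative):

timer = Timer()
time.sleep(1.2)
print('lap:', timer.time(lap=True))  # elapsed since the previous call, e.g. 0:00:01
print('total:', timer.time())        # elapsed since construction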
343 |
+
|
344 |
+
|
345 |
+
class ExThread(Thread):
|
346 |
+
def run(self):
|
347 |
+
self.exc = None
|
348 |
+
try:
|
349 |
+
if hasattr(self, '_Thread__target'):
|
350 |
+
# Thread uses name mangling prior to Python 3.
|
351 |
+
self.ret = self._Thread__target(*self._Thread__args, **self._Thread__kwargs)
|
352 |
+
else:
|
353 |
+
self.ret = self._target(*self._args, **self._kwargs)
|
354 |
+
except BaseException as e:
|
355 |
+
self.exc = e
|
356 |
+
|
357 |
+
def join(self):
|
358 |
+
super(ExThread, self).join()
|
359 |
+
if self.exc:
|
360 |
+
raise RuntimeError('Exception in thread.') from self.exc
|
361 |
+
return self.ret
|
362 |
+
|
363 |
+
|
364 |
+
def get_gpu_free_mem():
|
365 |
+
"""return a list of free GPU memory"""
|
366 |
+
sp = subprocess.Popen(['nvidia-smi', '-q'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
367 |
+
out_str = sp.communicate()
|
368 |
+
out_list = out_str[0].decode("utf-8") .split('\n')
|
369 |
+
|
370 |
+
out = []
|
371 |
+
for i in range(len(out_list)):
|
372 |
+
item = out_list[i]
|
373 |
+
if item.strip() == 'FB Memory Usage':
|
374 |
+
free_mem = int(out_list[i + 3].split(':')[1].strip().split(' ')[0])
|
375 |
+
out.append(free_mem)
|
376 |
+
return out
|
377 |
+
|
378 |
+
|
379 |
+
def float2hex(x):
|
380 |
+
"""
|
381 |
+
x: a vector
|
382 |
+
return: x in hex
|
383 |
+
"""
|
384 |
+
f = np.float32(x)
|
385 |
+
out = ''
|
386 |
+
if f.size == 1: # just a single number
|
387 |
+
f = [f, ]
|
388 |
+
for e in f:
|
389 |
+
h = hex(struct.unpack('<I', struct.pack('<f', e))[0])
|
390 |
+
out += h[2:].zfill(8)
|
391 |
+
return out
|
392 |
+
|
393 |
+
|
394 |
+
def hex2float(x):
|
395 |
+
"""
|
396 |
+
x: a string with len divided by 8
|
397 |
+
return x as array of float32
|
398 |
+
"""
|
399 |
+
assert len(x) % 8 == 0, 'Error! string len = {} not divisible by 8'.format(len(x))
|
400 |
+
l = len(x) // 8  # integer division: '/' returns a float in Python 3 and would break np.empty below
|
401 |
+
out = np.empty(l, dtype=np.float32)
|
402 |
+
x = [x[i:i + 8] for i in range(0, len(x), 8)]
|
403 |
+
for i, e in enumerate(x):
|
404 |
+
out[i] = struct.unpack('!f', bytes.fromhex(e))[0]  # bytes.fromhex replaces the Python-2-only str.decode('hex')
|
405 |
+
return out
|
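A round-trip sketch for float2hex/hex2float (illustrative, assuming the Python 3 fixes above; the values are chosen to have exact float32 representations):

vals = np.array([0.5, -1.25], dtype=np.float32)
s = float2hex(vals)   # '3f000000bfa00000'
print(hex2float(s))   # [ 0.5  -1.25]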
406 |
+
|
407 |
+
|
408 |
+
def nice_print(inputs, stream=sys.stdout):
|
409 |
+
"""print a list of string to file stream"""
|
410 |
+
if type(inputs) is not list:
|
411 |
+
tstrings = inputs.split('\n')
|
412 |
+
pprint(tstrings, stream=stream)
|
413 |
+
else:
|
414 |
+
for string in inputs:
|
415 |
+
nice_print(string, stream=stream)
|
416 |
+
stream.flush()
|
tools/hparams.py
ADDED
@@ -0,0 +1,743 @@
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2019 The Tensor2Tensor Authors.
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
+
# source: https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/hparam.py
|
16 |
+
# Forked with minor changes from https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/training/python/training/hparam.py pylint: disable=line-too-long
|
17 |
+
"""Hyperparameter values."""
|
18 |
+
from __future__ import absolute_import
|
19 |
+
from __future__ import division
|
20 |
+
from __future__ import print_function
|
21 |
+
|
22 |
+
import json
|
23 |
+
import numbers
|
24 |
+
import re
|
25 |
+
import six
|
26 |
+
import numpy as np
|
27 |
+
|
28 |
+
# Define the regular expression for parsing a single clause of the input
|
29 |
+
# (delimited by commas). A legal clause looks like:
|
30 |
+
# <variable name>[<index>]? = <rhs>
|
31 |
+
# where <rhs> is either a single token or [] enclosed list of tokens.
|
32 |
+
# For example: "var[1] = a" or "x = [1,2,3]"
|
33 |
+
PARAM_RE = re.compile(r"""
|
34 |
+
(?P<name>[a-zA-Z][\w\.]*) # variable name: "var" or "x"
|
35 |
+
(\[\s*(?P<index>\d+)\s*\])? # (optional) index: "1" or None
|
36 |
+
\s*=\s*
|
37 |
+
((?P<val>[^,\[]*) # single value: "a" or None
|
38 |
+
|
|
39 |
+
\[(?P<vals>[^\]]*)\]) # list of values: None or "1,2,3"
|
40 |
+
($|,\s*)""", re.VERBOSE)
|
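For illustration only, how a single clause of the comma-separated string is split into named groups by this regex (full parsing, including type checks, happens in parse_values below):

m = PARAM_RE.match('arr[1]=3,x=5', 0)
print(m.groupdict()['name'], m.groupdict()['index'], m.groupdict()['val'])  # arr 1 3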
41 |
+
|
42 |
+
|
43 |
+
def copy_hparams(hparams):
|
44 |
+
"""Return a copy of an HParams instance."""
|
45 |
+
return HParams(**hparams.values())
|
46 |
+
|
47 |
+
|
48 |
+
def print_config(hps):
|
49 |
+
for key, val in six.iteritems(hps.values()):
|
50 |
+
print('%s = %s' % (key, str(val)))
|
51 |
+
|
52 |
+
|
53 |
+
def save_config(output_file, hps, verbose=True):
|
54 |
+
def convert(o): # json cannot serialize integer in np.int64 format
|
55 |
+
if isinstance(o, np.int64):
|
56 |
+
return int(o)
|
57 |
+
raise TypeError
|
58 |
+
if verbose:
|
59 |
+
print_config(hps)
|
60 |
+
with open(output_file, 'w') as f:
|
61 |
+
json.dump(hps.values(), f, indent=True, default=convert)
|
62 |
+
|
63 |
+
|
64 |
+
def load_config(hps, config_file, verbose=True):
|
65 |
+
"""
|
66 |
+
parse hparams from config file
|
67 |
+
:param hps: hparams object whose values to be updated
|
68 |
+
:param config_file: json config file
|
69 |
+
:param verbose: print out values
|
70 |
+
"""
|
71 |
+
try:
|
72 |
+
with open(config_file, 'r') as fin:
|
73 |
+
hps.parse_json(fin.read())
|
74 |
+
if verbose:
|
75 |
+
print_config(hps)
|
76 |
+
except Exception as e:
|
77 |
+
print('Error reading config file %s: %s.\nConfig will not be updated.' % (config_file, e))
|
78 |
+
# return hps
|
79 |
+
|
80 |
+
|
81 |
+
def _parse_fail(name, var_type, value, values):
|
82 |
+
"""Helper function for raising a value error for bad assignment."""
|
83 |
+
raise ValueError(
|
84 |
+
'Could not parse hparam \'%s\' of type \'%s\' with value \'%s\' in %s' %
|
85 |
+
(name, var_type.__name__, value, values))
|
86 |
+
|
87 |
+
|
88 |
+
def _reuse_fail(name, values):
|
89 |
+
"""Helper function for raising a value error for reuse of name."""
|
90 |
+
raise ValueError('Multiple assignments to variable \'%s\' in %s' % (name,
|
91 |
+
values))
|
92 |
+
|
93 |
+
|
94 |
+
def _process_scalar_value(name, parse_fn, var_type, m_dict, values,
|
95 |
+
results_dictionary):
|
96 |
+
"""Update results_dictionary with a scalar value.
|
97 |
+
|
98 |
+
Used to update the results_dictionary to be returned by parse_values when
|
99 |
+
encountering a clause with a scalar RHS (e.g. "s=5" or "arr[0]=5".)
|
100 |
+
|
101 |
+
Mutates results_dictionary.
|
102 |
+
|
103 |
+
Args:
|
104 |
+
name: Name of variable in assignment ("s" or "arr").
|
105 |
+
parse_fn: Function for parsing the actual value.
|
106 |
+
var_type: Type of named variable.
|
107 |
+
m_dict: Dictionary constructed from regex parsing.
|
108 |
+
m_dict['val']: RHS value (scalar)
|
109 |
+
m_dict['index']: List index value (or None)
|
110 |
+
values: Full expression being parsed
|
111 |
+
results_dictionary: The dictionary being updated for return by the parsing
|
112 |
+
function.
|
113 |
+
|
114 |
+
Raises:
|
115 |
+
ValueError: If the name has already been used.
|
116 |
+
"""
|
117 |
+
try:
|
118 |
+
parsed_value = parse_fn(m_dict['val'])
|
119 |
+
except ValueError:
|
120 |
+
_parse_fail(name, var_type, m_dict['val'], values)
|
121 |
+
|
122 |
+
# If no index is provided
|
123 |
+
if not m_dict['index']:
|
124 |
+
if name in results_dictionary:
|
125 |
+
_reuse_fail(name, values)
|
126 |
+
results_dictionary[name] = parsed_value
|
127 |
+
else:
|
128 |
+
if name in results_dictionary:
|
129 |
+
# The name has already been used as a scalar, then it
|
130 |
+
# will be in this dictionary and map to a non-dictionary.
|
131 |
+
if not isinstance(results_dictionary.get(name), dict):
|
132 |
+
_reuse_fail(name, values)
|
133 |
+
else:
|
134 |
+
results_dictionary[name] = {}
|
135 |
+
|
136 |
+
index = int(m_dict['index'])
|
137 |
+
# Make sure the index position hasn't already been assigned a value.
|
138 |
+
if index in results_dictionary[name]:
|
139 |
+
_reuse_fail('{}[{}]'.format(name, index), values)
|
140 |
+
results_dictionary[name][index] = parsed_value
|
141 |
+
|
142 |
+
|
143 |
+
def _process_list_value(name, parse_fn, var_type, m_dict, values,
|
144 |
+
results_dictionary):
|
145 |
+
"""Update results_dictionary from a list of values.
|
146 |
+
|
147 |
+
Used to update results_dictionary to be returned by parse_values when
|
148 |
+
encountering a clause with a list RHS (e.g. "arr=[1,2,3]".)
|
149 |
+
|
150 |
+
Mutates results_dictionary.
|
151 |
+
|
152 |
+
Args:
|
153 |
+
name: Name of variable in assignment ("arr").
|
154 |
+
parse_fn: Function for parsing individual values.
|
155 |
+
var_type: Type of named variable.
|
156 |
+
m_dict: Dictionary constructed from regex parsing.
|
157 |
+
m_dict['val']: RHS value (scalar)
|
158 |
+
values: Full expression being parsed
|
159 |
+
results_dictionary: The dictionary being updated for return by the parsing
|
160 |
+
function.
|
161 |
+
|
162 |
+
Raises:
|
163 |
+
ValueError: If the name has an index or the values cannot be parsed.
|
164 |
+
"""
|
165 |
+
if m_dict['index'] is not None:
|
166 |
+
raise ValueError('Assignment of a list to a list index.')
|
167 |
+
elements = filter(None, re.split('[ ,]', m_dict['vals']))
|
168 |
+
# Make sure the name hasn't already been assigned a value
|
169 |
+
if name in results_dictionary:
|
170 |
+
raise _reuse_fail(name, values)
|
171 |
+
try:
|
172 |
+
results_dictionary[name] = [parse_fn(e) for e in elements]
|
173 |
+
except ValueError:
|
174 |
+
_parse_fail(name, var_type, m_dict['vals'], values)
|
175 |
+
|
176 |
+
|
177 |
+
def _cast_to_type_if_compatible(name, param_type, value):
|
178 |
+
"""Cast hparam to the provided type, if compatible.
|
179 |
+
|
180 |
+
Args:
|
181 |
+
name: Name of the hparam to be cast.
|
182 |
+
param_type: The type of the hparam.
|
183 |
+
value: The value to be cast, if compatible.
|
184 |
+
|
185 |
+
Returns:
|
186 |
+
The result of casting `value` to `param_type`.
|
187 |
+
|
188 |
+
Raises:
|
189 |
+
ValueError: If the type of `value` is not compatible with param_type.
|
190 |
+
* If `param_type` is a string type, but `value` is not.
|
191 |
+
* If `param_type` is a boolean, but `value` is not, or vice versa.
|
192 |
+
* If `param_type` is an integer type, but `value` is not.
|
193 |
+
* If `param_type` is a float type, but `value` is not a numeric type.
|
194 |
+
"""
|
195 |
+
fail_msg = (
|
196 |
+
"Could not cast hparam '%s' of type '%s' from value %r" %
|
197 |
+
(name, param_type, value))
|
198 |
+
|
199 |
+
# Some callers use None, for which we can't do any casting/checking. :(
|
200 |
+
if issubclass(param_type, type(None)):
|
201 |
+
return value
|
202 |
+
|
203 |
+
# Avoid converting a non-string type to a string.
|
204 |
+
if (issubclass(param_type, (six.string_types, six.binary_type)) and
|
205 |
+
not isinstance(value, (six.string_types, six.binary_type))):
|
206 |
+
raise ValueError(fail_msg)
|
207 |
+
|
208 |
+
# Avoid converting a number or string type to a boolean or vice versa.
|
209 |
+
if issubclass(param_type, bool) != isinstance(value, bool):
|
210 |
+
raise ValueError(fail_msg)
|
211 |
+
|
212 |
+
# Avoid converting float to an integer (the reverse is fine).
|
213 |
+
if (issubclass(param_type, numbers.Integral) and
|
214 |
+
not isinstance(value, numbers.Integral)):
|
215 |
+
raise ValueError(fail_msg)
|
216 |
+
|
217 |
+
# Avoid converting a non-numeric type to a numeric type.
|
218 |
+
if (issubclass(param_type, numbers.Number) and
|
219 |
+
not isinstance(value, numbers.Number)):
|
220 |
+
raise ValueError(fail_msg)
|
221 |
+
|
222 |
+
return param_type(value)
|
223 |
+
|
224 |
+
|
225 |
+
def parse_values(values, type_map, ignore_unknown=False):
|
226 |
+
"""Parses hyperparameter values from a string into a python map.
|
227 |
+
|
228 |
+
`values` is a string containing comma-separated `name=value` pairs.
|
229 |
+
For each pair, the value of the hyperparameter named `name` is set to
|
230 |
+
`value`.
|
231 |
+
|
232 |
+
If a hyperparameter name appears multiple times in `values`, a ValueError
|
233 |
+
is raised (e.g. 'a=1,a=2', 'a[1]=1,a[1]=2').
|
234 |
+
|
235 |
+
If a hyperparameter name appears in both an index assignment and a scalar assignment,
|
236 |
+
a ValueError is raised. (e.g. 'a=[1,2,3],a[0] = 1').
|
237 |
+
|
238 |
+
The hyperparameter name may contain '.' symbols, which will result in an
|
239 |
+
attribute name that is only accessible through the getattr and setattr
|
240 |
+
functions. (And must first be explicitly added through add_hparam.)
|
241 |
+
|
242 |
+
WARNING: Use of '.' in your variable names is allowed, but is not well
|
243 |
+
supported and not recommended.
|
244 |
+
|
245 |
+
The `value` in `name=value` must follow the syntax according to the
|
246 |
+
type of the parameter:
|
247 |
+
|
248 |
+
* Scalar integer: A Python-parsable integer value. E.g.: 1,
|
249 |
+
100, -12.
|
250 |
+
* Scalar float: A Python-parsable floating point value. E.g.: 1.0,
|
251 |
+
-.54e89.
|
252 |
+
* Boolean: Either true or false.
|
253 |
+
* Scalar string: A non-empty sequence of characters, excluding comma,
|
254 |
+
spaces, and square brackets. E.g.: foo, bar_1.
|
255 |
+
* List: A comma separated list of scalar values of the parameter type
|
256 |
+
enclosed in square brackets. E.g.: [1,2,3], [1.0,1e-12], [high,low].
|
257 |
+
|
258 |
+
When index assignment is used, the corresponding type_map key should be the
|
259 |
+
list name. E.g. for "arr[1]=0" the type_map must have the key "arr" (not
|
260 |
+
"arr[1]").
|
261 |
+
|
262 |
+
Args:
|
263 |
+
values: String. Comma separated list of `name=value` pairs where
|
264 |
+
'value' must follow the syntax described above.
|
265 |
+
type_map: A dictionary mapping hyperparameter names to types. Note every
|
266 |
+
parameter name in values must be a key in type_map. The values must
|
267 |
+
conform to the types indicated, where a value V is said to conform to a
|
268 |
+
type T if either V has type T, or V is a list of elements of type T.
|
269 |
+
Hence, for a multidimensional parameter 'x' taking float values,
|
270 |
+
'x=[0.1,0.2]' will parse successfully if type_map['x'] = float.
|
271 |
+
ignore_unknown: Bool. Whether values that are missing a type in type_map
|
272 |
+
should be ignored. If set to True, a ValueError will not be raised for
|
273 |
+
unknown hyperparameter type.
|
274 |
+
|
275 |
+
Returns:
|
276 |
+
A python map mapping each name to either:
|
277 |
+
* A scalar value.
|
278 |
+
* A list of scalar values.
|
279 |
+
* A dictionary mapping index numbers to scalar values.
|
280 |
+
(e.g. "x=5,L=[1,2],arr[1]=3" results in {'x':5,'L':[1,2],'arr':{1:3}}")
|
281 |
+
|
282 |
+
Raises:
|
283 |
+
ValueError: If there is a problem with input.
|
284 |
+
* If `values` cannot be parsed.
|
285 |
+
* If a list is assigned to a list index (e.g. 'a[1] = [1,2,3]').
|
286 |
+
* If the same name is assigned two different values (e.g. 'a=1,a=2',
|
287 |
+
'a[1]=1,a[1]=2', or 'a=1,a=[1]')
|
288 |
+
"""
|
289 |
+
results_dictionary = {}
|
290 |
+
pos = 0
|
291 |
+
while pos < len(values):
|
292 |
+
m = PARAM_RE.match(values, pos)
|
293 |
+
if not m:
|
294 |
+
raise ValueError('Malformed hyperparameter value: %s' % values[pos:])
|
295 |
+
# Check that there is a comma between parameters and move past it.
|
296 |
+
pos = m.end()
|
297 |
+
# Parse the values.
|
298 |
+
m_dict = m.groupdict()
|
299 |
+
name = m_dict['name']
|
300 |
+
if name not in type_map:
|
301 |
+
if ignore_unknown:
|
302 |
+
continue
|
303 |
+
raise ValueError('Unknown hyperparameter type for %s' % name)
|
304 |
+
type_ = type_map[name]
|
305 |
+
|
306 |
+
# Set up correct parsing function (depending on whether type_ is a bool)
|
307 |
+
if type_ == bool:
|
308 |
+
def parse_bool(value):
|
309 |
+
if value in ['true', 'True']:
|
310 |
+
return True
|
311 |
+
elif value in ['false', 'False']:
|
312 |
+
return False
|
313 |
+
else:
|
314 |
+
try:
|
315 |
+
return bool(int(value))
|
316 |
+
except ValueError:
|
317 |
+
_parse_fail(name, type_, value, values)
|
318 |
+
|
319 |
+
parse = parse_bool
|
320 |
+
else:
|
321 |
+
parse = type_
|
322 |
+
|
323 |
+
# If a single value is provided
|
324 |
+
if m_dict['val'] is not None:
|
325 |
+
_process_scalar_value(name, parse, type_, m_dict, values,
|
326 |
+
results_dictionary)
|
327 |
+
|
328 |
+
# If the assigned value is a list:
|
329 |
+
elif m_dict['vals'] is not None:
|
330 |
+
_process_list_value(name, parse, type_, m_dict, values,
|
331 |
+
results_dictionary)
|
332 |
+
|
333 |
+
else: # Not assigned a list or value
|
334 |
+
_parse_fail(name, type_, '', values)
|
335 |
+
|
336 |
+
return results_dictionary
|
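A short usage sketch of `parse_values` with the syntax described in the docstring, assuming the file is importable as `tools.hparams`:

```python
# parse_values sketch: scalar, list and index assignments against a type_map.
from tools.hparams import parse_values

type_map = {'x': int, 'L': float, 'arr': int, 'name': str, 'flag': bool}
out = parse_values('x=5,L=[1,2],arr[1]=3,name=unet,flag=true', type_map)
print(out)  # {'x': 5, 'L': [1.0, 2.0], 'arr': {1: 3}, 'name': 'unet', 'flag': True}

# Names without a type raise ValueError unless ignore_unknown=True.
print(parse_values('x=1,unknown=2', type_map, ignore_unknown=True))  # {'x': 1}
```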
337 |
+
|
338 |
+
|
339 |
+
class HParams(object):
|
340 |
+
"""Class to hold a set of hyperparameters as name-value pairs.
|
341 |
+
|
342 |
+
A `HParams` object holds hyperparameters used to build and train a model,
|
343 |
+
such as the number of hidden units in a neural net layer or the learning rate
|
344 |
+
to use when training.
|
345 |
+
|
346 |
+
You first create a `HParams` object by specifying the names and values of the
|
347 |
+
hyperparameters.
|
348 |
+
|
349 |
+
To make them easily accessible the parameter names are added as direct
|
350 |
+
attributes of the class. A typical usage is as follows:
|
351 |
+
|
352 |
+
```python
|
353 |
+
# Create a HParams object specifying names and values of the model
|
354 |
+
# hyperparameters:
|
355 |
+
hparams = HParams(learning_rate=0.1, num_hidden_units=100)
|
356 |
+
|
357 |
+
# The hyperparameters are available as attributes of the HParams object:
|
358 |
+
hparams.learning_rate ==> 0.1
|
359 |
+
hparams.num_hidden_units ==> 100
|
360 |
+
```
|
361 |
+
|
362 |
+
Hyperparameters have type, which is inferred from the type of their value
|
363 |
+
passed at construction time. The currently supported types are: integer,
|
364 |
+
float, boolean, string, and list of integer, float, boolean, or string.
|
365 |
+
|
366 |
+
You can override hyperparameter values by calling the
|
367 |
+
[`parse()`](#HParams.parse) method, passing a string of comma separated
|
368 |
+
`name=value` pairs. This is intended to make it possible to override
|
369 |
+
any hyperparameter values from a single command-line flag to which
|
370 |
+
the user passes 'hyper-param=value' pairs. It avoids having to define
|
371 |
+
one flag for each hyperparameter.
|
372 |
+
|
373 |
+
The syntax expected for each value depends on the type of the parameter.
|
374 |
+
See `parse()` for a description of the syntax.
|
375 |
+
|
376 |
+
Example:
|
377 |
+
|
378 |
+
```python
|
379 |
+
# Define a command line flag to pass name=value pairs.
|
380 |
+
# For example using argparse:
|
381 |
+
import argparse
|
382 |
+
parser = argparse.ArgumentParser(description='Train my model.')
|
383 |
+
parser.add_argument('--hparams', type=str,
|
384 |
+
help='Comma separated list of "name=value" pairs.')
|
385 |
+
args = parser.parse_args()
|
386 |
+
...
|
387 |
+
def my_program():
|
388 |
+
# Create a HParams object specifying the names and values of the
|
389 |
+
# model hyperparameters:
|
390 |
+
hparams = tf.HParams(learning_rate=0.1, num_hidden_units=100,
|
391 |
+
activations=['relu', 'tanh'])
|
392 |
+
|
393 |
+
# Override hyperparameters values by parsing the command line
|
394 |
+
hparams.parse(args.hparams)
|
395 |
+
|
396 |
+
# If the user passed `--hparams=learning_rate=0.3` on the command line
|
397 |
+
# then 'hparams' has the following attributes:
|
398 |
+
hparams.learning_rate ==> 0.3
|
399 |
+
hparams.num_hidden_units ==> 100
|
400 |
+
hparams.activations ==> ['relu', 'tanh']
|
401 |
+
|
402 |
+
# If the hyperparameters are in json format use parse_json:
|
403 |
+
hparams.parse_json('{"learning_rate": 0.3, "activations": "relu"}')
|
404 |
+
```
|
405 |
+
"""
|
406 |
+
|
407 |
+
_HAS_DYNAMIC_ATTRIBUTES = True # Required for pytype checks.
|
408 |
+
|
409 |
+
def __init__(self, model_structure=None, **kwargs):
|
410 |
+
"""Create an instance of `HParams` from keyword arguments.
|
411 |
+
|
412 |
+
The keyword arguments specify name-value pairs for the hyperparameters.
|
413 |
+
The parameter types are inferred from the type of the values passed.
|
414 |
+
|
415 |
+
The parameter names are added as attributes of `HParams` object, so they
|
416 |
+
can be accessed directly with the dot notation `hparams._name_`.
|
417 |
+
|
418 |
+
Example:
|
419 |
+
|
420 |
+
```python
|
421 |
+
# Define 3 hyperparameters: 'learning_rate' is a float parameter,
|
422 |
+
# 'num_hidden_units' an integer parameter, and 'activation' a string
|
423 |
+
# parameter.
|
424 |
+
hparams = tf.HParams(
|
425 |
+
learning_rate=0.1, num_hidden_units=100, activation='relu')
|
426 |
+
|
427 |
+
hparams.activation ==> 'relu'
|
428 |
+
```
|
429 |
+
|
430 |
+
Note that a few names are reserved and cannot be used as hyperparameter
|
431 |
+
names. If you use one of the reserved names, the constructor raises a
|
432 |
+
`ValueError`.
|
433 |
+
|
434 |
+
Args:
|
435 |
+
model_structure: An instance of ModelStructure, defining the feature
|
436 |
+
crosses to be used in the Trial.
|
437 |
+
**kwargs: Key-value pairs where the key is the hyperparameter name and
|
438 |
+
the value is the value for the parameter.
|
439 |
+
|
440 |
+
Raises:
|
441 |
+
ValueError: If both `hparam_def` and initialization values are provided,
|
442 |
+
or if one of the arguments is invalid.
|
443 |
+
|
444 |
+
"""
|
445 |
+
# Register the hyperparameters and their type in _hparam_types.
|
446 |
+
# This simplifies the implementation of parse().
|
447 |
+
# _hparam_types maps the parameter name to a tuple (type, bool).
|
448 |
+
# The type value is the type of the parameter for scalar hyperparameters,
|
449 |
+
# or the type of the list elements for multidimensional hyperparameters.
|
450 |
+
# The bool value is True if the value is a list, False otherwise.
|
451 |
+
self._hparam_types = {}
|
452 |
+
self._model_structure = model_structure
|
453 |
+
for name, value in six.iteritems(kwargs):
|
454 |
+
self.add_hparam(name, value)
|
455 |
+
|
456 |
+
def __add__(self, other):
|
457 |
+
"""
|
458 |
+
Addition operation: return a new HParams combining both objects, keeping key order; keys from `other` override existing ones.
|
459 |
+
"""
|
460 |
+
out = HParams()
|
461 |
+
for key in self._hparam_types.keys():
|
462 |
+
out.add_hparam(key, getattr(self, key))
|
463 |
+
for key in other._hparam_types.keys():
|
464 |
+
if getattr(out, key, None) is None: # add new param
|
465 |
+
out.add_hparam(key, getattr(other, key))
|
466 |
+
else: # update existing param
|
467 |
+
out.set_hparam(key, getattr(other, key))
|
468 |
+
return out
|
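A small sketch of the `__add__` behaviour above: keys from the right-hand operand extend or override the left-hand one (import path assumed to be `tools.hparams`):

```python
from tools.hparams import HParams

base = HParams(lr=0.1, batch_size=32)
override = HParams(lr=0.01, secret_len=100)
merged = base + override
print(merged.values())  # {'lr': 0.01, 'batch_size': 32, 'secret_len': 100}
```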
469 |
+
|
470 |
+
def __str__(self):
|
471 |
+
s = 'HParams(\n'
|
472 |
+
for key, val in six.iteritems(self.values()):
|
473 |
+
s += f'\t{key} = {val}\n'
|
474 |
+
# print('%s = %s' % (key, str(val)))
|
475 |
+
s += ')'
|
476 |
+
return s
|
477 |
+
|
478 |
+
def __repr__(self):
|
479 |
+
return self.__str__()
|
480 |
+
|
481 |
+
def add_hparam(self, name, value):
|
482 |
+
"""Adds {name, value} pair to hyperparameters.
|
483 |
+
|
484 |
+
Args:
|
485 |
+
name: Name of the hyperparameter.
|
486 |
+
value: Value of the hyperparameter. Can be one of the following types:
|
487 |
+
int, float, string, int list, float list, or string list.
|
488 |
+
|
489 |
+
Raises:
|
490 |
+
ValueError: if one of the arguments is invalid.
|
491 |
+
"""
|
492 |
+
# Keys in kwargs are unique, but 'name' could be the name of a pre-existing
|
493 |
+
# attribute of this object. In that case we refuse to use it as a
|
494 |
+
# hyperparameter name.
|
495 |
+
if getattr(self, name, None) is not None:
|
496 |
+
raise ValueError('Hyperparameter name is reserved: %s' % name)
|
497 |
+
if isinstance(value, (list, tuple)):
|
498 |
+
if not value:
|
499 |
+
raise ValueError(
|
500 |
+
'Multi-valued hyperparameters cannot be empty: %s' % name)
|
501 |
+
self._hparam_types[name] = (type(value[0]), True)
|
502 |
+
else:
|
503 |
+
self._hparam_types[name] = (type(value), False)
|
504 |
+
setattr(self, name, value)
|
505 |
+
|
506 |
+
def set_hparam(self, name, value):
|
507 |
+
"""Set the value of an existing hyperparameter.
|
508 |
+
|
509 |
+
This function verifies that the type of the value matches the type of the
|
510 |
+
existing hyperparameter.
|
511 |
+
|
512 |
+
Args:
|
513 |
+
name: Name of the hyperparameter.
|
514 |
+
value: New value of the hyperparameter.
|
515 |
+
|
516 |
+
Raises:
|
517 |
+
KeyError: If the hyperparameter doesn't exist.
|
518 |
+
ValueError: If there is a type mismatch.
|
519 |
+
"""
|
520 |
+
param_type, is_list = self._hparam_types[name]
|
521 |
+
if isinstance(value, list):
|
522 |
+
if not is_list:
|
523 |
+
raise ValueError(
|
524 |
+
'Must not pass a list for single-valued parameter: %s' % name)
|
525 |
+
setattr(self, name, [
|
526 |
+
_cast_to_type_if_compatible(name, param_type, v) for v in value])
|
527 |
+
else:
|
528 |
+
if is_list:
|
529 |
+
raise ValueError(
|
530 |
+
'Must pass a list for multi-valued parameter: %s.' % name)
|
531 |
+
setattr(self, name, _cast_to_type_if_compatible(name, param_type, value))
|
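How `add_hparam` and `set_hparam` interact with the registered types, as a minimal sketch (assuming the `tools.hparams` import path):

```python
from tools.hparams import HParams

hps = HParams()
hps.add_hparam('lr', 0.1)               # registered as (float, is_list=False)
hps.add_hparam('layers', [64, 128])     # registered as (int, is_list=True)

hps.set_hparam('lr', 0.3)               # fine: float stays float
hps.set_hparam('layers', [32, 32, 32])  # fine: still a list of ints

try:
    hps.set_hparam('layers', 64)        # scalar for a multi-valued parameter
except ValueError as err:
    print(err)  # Must pass a list for multi-valued parameter: layers.
```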
532 |
+
|
533 |
+
def del_hparam(self, name):
|
534 |
+
"""Removes the hyperparameter with key 'name'.
|
535 |
+
|
536 |
+
Does nothing if it isn't present.
|
537 |
+
|
538 |
+
Args:
|
539 |
+
name: Name of the hyperparameter.
|
540 |
+
"""
|
541 |
+
if hasattr(self, name):
|
542 |
+
delattr(self, name)
|
543 |
+
del self._hparam_types[name]
|
544 |
+
|
545 |
+
def parse(self, values):
|
546 |
+
"""Override existing hyperparameter values, parsing new values from a string.
|
547 |
+
|
548 |
+
See parse_values for more detail on the allowed format for values.
|
549 |
+
|
550 |
+
Args:
|
551 |
+
values: String. Comma separated list of `name=value` pairs where 'value'
|
552 |
+
must follow the syntax described above.
|
553 |
+
|
554 |
+
Returns:
|
555 |
+
The `HParams` instance.
|
556 |
+
|
557 |
+
Raises:
|
558 |
+
ValueError: If `values` cannot be parsed or a hyperparameter in `values`
|
559 |
+
doesn't exist.
|
560 |
+
"""
|
561 |
+
type_map = {}
|
562 |
+
for name, t in self._hparam_types.items():
|
563 |
+
param_type, _ = t
|
564 |
+
type_map[name] = param_type
|
565 |
+
|
566 |
+
values_map = parse_values(values, type_map)
|
567 |
+
return self.override_from_dict(values_map)
|
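A `parse()` sketch showing how command-line style overrides are checked against the registered types (import path assumed):

```python
from tools.hparams import HParams

hps = HParams(lr=0.1, layers=[64, 128], use_ema=False, name='unet')
hps.parse('lr=0.01,layers=[32,32],use_ema=true,name=unet_v2')
print(hps.lr, hps.layers, hps.use_ema, hps.name)  # 0.01 [32, 32] True unet_v2

# hps.parse('layers=abc') would raise ValueError: 'abc' cannot be parsed as int.
```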
568 |
+
|
569 |
+
def override_from_dict(self, values_dict):
|
570 |
+
"""Override existing hyperparameter values, parsing new values from a dictionary.
|
571 |
+
|
572 |
+
Args:
|
573 |
+
values_dict: Dictionary of name:value pairs.
|
574 |
+
|
575 |
+
Returns:
|
576 |
+
The `HParams` instance.
|
577 |
+
|
578 |
+
Raises:
|
579 |
+
KeyError: If a hyperparameter in `values_dict` doesn't exist.
|
580 |
+
ValueError: If `values_dict` cannot be parsed.
|
581 |
+
"""
|
582 |
+
for name, value in values_dict.items():
|
583 |
+
self.set_hparam(name, value)
|
584 |
+
return self
|
585 |
+
|
586 |
+
def set_model_structure(self, model_structure):
|
587 |
+
self._model_structure = model_structure
|
588 |
+
|
589 |
+
def get_model_structure(self):
|
590 |
+
return self._model_structure
|
591 |
+
|
592 |
+
def to_json(self, indent=None, separators=None, sort_keys=False):
|
593 |
+
"""Serializes the hyperparameters into JSON.
|
594 |
+
|
595 |
+
Args:
|
596 |
+
indent: If a non-negative integer, JSON array elements and object members
|
597 |
+
will be pretty-printed with that indent level. An indent level of 0, or
|
598 |
+
negative, will only insert newlines. `None` (the default) selects the
|
599 |
+
most compact representation.
|
600 |
+
separators: Optional `(item_separator, key_separator)` tuple. Default is
|
601 |
+
`(', ', ': ')`.
|
602 |
+
sort_keys: If `True`, the output dictionaries will be sorted by key.
|
603 |
+
|
604 |
+
Returns:
|
605 |
+
A JSON string.
|
606 |
+
"""
|
607 |
+
def remove_callables(x):
|
608 |
+
"""Omit callable elements from input with arbitrary nesting."""
|
609 |
+
if isinstance(x, dict):
|
610 |
+
return {k: remove_callables(v) for k, v in six.iteritems(x)
|
611 |
+
if not callable(v)}
|
612 |
+
elif isinstance(x, list):
|
613 |
+
return [remove_callables(i) for i in x if not callable(i)]
|
614 |
+
return x
|
615 |
+
return json.dumps(
|
616 |
+
remove_callables(self.values()),
|
617 |
+
indent=indent,
|
618 |
+
separators=separators,
|
619 |
+
sort_keys=sort_keys)
|
620 |
+
|
621 |
+
def parse_json(self, values_json):
|
622 |
+
"""Override existing hyperparameter values, parsing new values from a json object.
|
623 |
+
|
624 |
+
Args:
|
625 |
+
values_json: String containing a json object of name:value pairs.
|
626 |
+
|
627 |
+
Returns:
|
628 |
+
The `HParams` instance.
|
629 |
+
|
630 |
+
Raises:
|
631 |
+
KeyError: If a hyperparameter in `values_json` doesn't exist.
|
632 |
+
ValueError: If `values_json` cannot be parsed.
|
633 |
+
"""
|
634 |
+
values_map = json.loads(values_json)
|
635 |
+
return self.override_from_dict(values_map)
|
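A JSON round-trip sketch for `to_json()` and `parse_json()` (import path assumed):

```python
from tools.hparams import HParams

hps = HParams(lr=0.1, layers=[64, 128])
print(hps.to_json(sort_keys=True))  # {"layers": [64, 128], "lr": 0.1}

hps.parse_json('{"lr": 0.3}')
print(hps.lr)                       # 0.3
```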
636 |
+
|
637 |
+
def values(self):
|
638 |
+
"""Return the hyperparameter values as a Python dictionary.
|
639 |
+
|
640 |
+
Returns:
|
641 |
+
A dictionary with hyperparameter names as keys. The values are the
|
642 |
+
hyperparameter values.
|
643 |
+
"""
|
644 |
+
return {n: getattr(self, n) for n in self._hparam_types.keys()}
|
645 |
+
|
646 |
+
def get(self, key, default=None):
|
647 |
+
"""Returns the value of `key` if it exists, else `default`."""
|
648 |
+
if key in self._hparam_types:
|
649 |
+
# Ensure that default is compatible with the parameter type.
|
650 |
+
if default is not None:
|
651 |
+
param_type, is_param_list = self._hparam_types[key]
|
652 |
+
type_str = 'list<%s>' % param_type if is_param_list else str(param_type)
|
653 |
+
fail_msg = ("Hparam '%s' of type '%s' is incompatible with "
|
654 |
+
'default=%s' % (key, type_str, default))
|
655 |
+
|
656 |
+
is_default_list = isinstance(default, list)
|
657 |
+
if is_param_list != is_default_list:
|
658 |
+
raise ValueError(fail_msg)
|
659 |
+
|
660 |
+
try:
|
661 |
+
if is_default_list:
|
662 |
+
for value in default:
|
663 |
+
_cast_to_type_if_compatible(key, param_type, value)
|
664 |
+
else:
|
665 |
+
_cast_to_type_if_compatible(key, param_type, default)
|
666 |
+
except ValueError as e:
|
667 |
+
raise ValueError('%s. %s' % (fail_msg, e))
|
668 |
+
|
669 |
+
return getattr(self, key)
|
670 |
+
|
671 |
+
return default
|
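`get()` sketch: the default value is checked against the registered type before being returned (import path assumed):

```python
from tools.hparams import HParams

hps = HParams(lr=0.1)
print(hps.get('lr'))             # 0.1
print(hps.get('momentum', 0.9))  # 0.9 (unknown key falls back to the default)

try:
    hps.get('lr', [0.1, 0.2])    # list default for a scalar hparam
except ValueError as err:
    print(err)
```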
672 |
+
|
673 |
+
def __contains__(self, key):
|
674 |
+
return key in self._hparam_types
|
675 |
+
|
676 |
+
@staticmethod
|
677 |
+
def _get_kind_name(param_type, is_list):
|
678 |
+
"""Returns the field name given parameter type and is_list.
|
679 |
+
|
680 |
+
Args:
|
681 |
+
param_type: Data type of the hparam.
|
682 |
+
is_list: Whether this is a list.
|
683 |
+
|
684 |
+
Returns:
|
685 |
+
A string representation of the field name.
|
686 |
+
|
687 |
+
Raises:
|
688 |
+
ValueError: If parameter type is not recognized.
|
689 |
+
"""
|
690 |
+
if issubclass(param_type, bool):
|
691 |
+
# This check must happen before issubclass(param_type, six.integer_types),
|
692 |
+
# since Python considers bool to be a subclass of int.
|
693 |
+
typename = 'bool'
|
694 |
+
elif issubclass(param_type, six.integer_types):
|
695 |
+
# Setting 'int' and 'long' types to be 'int64' to ensure the type is
|
696 |
+
# compatible with both Python2 and Python3.
|
697 |
+
typename = 'int64'
|
698 |
+
elif issubclass(param_type, (six.string_types, six.binary_type)):
|
699 |
+
# Setting 'string' and 'bytes' types to be 'bytes' to ensure the type is
|
700 |
+
# compatible with both Python2 and Python3.
|
701 |
+
typename = 'bytes'
|
702 |
+
elif issubclass(param_type, float):
|
703 |
+
typename = 'float'
|
704 |
+
else:
|
705 |
+
raise ValueError('Unsupported parameter type: %s' % str(param_type))
|
706 |
+
|
707 |
+
suffix = 'list' if is_list else 'value'
|
708 |
+
return '_'.join([typename, suffix])
|
709 |
+
|
710 |
+
|
711 |
+
def save_config(self, output_file, verbose=True):
|
712 |
+
def convert(o): # json cannot serialize integer in np.int64 format
|
713 |
+
if isinstance(o, np.int64):
|
714 |
+
return int(o)
|
715 |
+
raise TypeError
|
716 |
+
if verbose:
|
717 |
+
print(self)
|
718 |
+
with open(output_file, 'w') as f:
|
719 |
+
json.dump(self.values(), f, indent=True, default=convert)
|
720 |
+
|
721 |
+
@staticmethod
|
722 |
+
def load_config(config_file, verbose=True):
|
723 |
+
"""
|
724 |
+
parse hparams from config file
|
725 |
+
:param config_file: json config file
|
726 |
+
:param verbose: print out values
|
727 |
+
"""
|
728 |
+
try:
|
729 |
+
with open(config_file, 'r') as fin:
|
730 |
+
json_dict = json.loads(fin.read())
|
731 |
+
hps = HParams(**json_dict)
|
732 |
+
if verbose:
|
733 |
+
print_config(hps)
|
734 |
+
except Exception as e:
|
735 |
+
print('Error reading config file %s: %s.\nConfig will not be updated.' % (config_file, e))
|
736 |
+
return hps
|
737 |
+
|
738 |
+
|
739 |
+
def clone(self):
|
740 |
+
"""
|
741 |
+
return a deep copy of this object
|
742 |
+
"""
|
743 |
+
return HParams(**self.values())
|
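An end-to-end sketch of saving and restoring a config with the helpers above; the JSON path is a placeholder, and `print_config` (used inside `load_config` when `verbose=True`) is assumed to be a helper defined elsewhere in the repo, so `verbose=False` is used here:

```python
from tools.hparams import HParams

hps = HParams(lr=0.1, secret_len=100, ckpt='epoch70.ckpt')
hps.save_config('/tmp/hparams.json', verbose=False)

restored = HParams.load_config('/tmp/hparams.json', verbose=False)
print(restored.values())  # {'lr': 0.1, 'secret_len': 100, 'ckpt': 'epoch70.ckpt'}
```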
tools/image_dataset.py
ADDED
@@ -0,0 +1,184 @@
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
"""
|
4 |
+
imagefolder loader
|
5 |
+
inspired from https://github.com/adambielski/siamese-triplet/blob/master/datasets.py
|
6 |
+
@author: Tu Bui @surrey.ac.uk
|
7 |
+
"""
|
8 |
+
from __future__ import absolute_import
|
9 |
+
from __future__ import division
|
10 |
+
from __future__ import print_function
|
11 |
+
import os
|
12 |
+
import sys
|
13 |
+
import io
|
14 |
+
import time
|
15 |
+
import pandas as pd
|
16 |
+
import numpy as np
|
17 |
+
import random
|
18 |
+
from PIL import Image
|
19 |
+
from typing import Any, Callable, List, Optional, Tuple
|
20 |
+
import torch
|
21 |
+
from torchvision import transforms
|
22 |
+
from .base_lmdb import PILlmdb, ArrayDatabase
|
23 |
+
# from . import debug
|
24 |
+
|
25 |
+
|
26 |
+
def worker_init_fn(worker_id):
|
27 |
+
# to be passed to torch.utils.data.DataLoader to fix the
|
28 |
+
# random seed issue with numpy in multi-worker settings
|
29 |
+
torch_seed = torch.initial_seed()
|
30 |
+
random.seed(torch_seed + worker_id)
|
31 |
+
if torch_seed >= 2**30: # make sure torch_seed + worker_id < 2**32
|
32 |
+
torch_seed = torch_seed % 2**30
|
33 |
+
np.random.seed(torch_seed + worker_id)
|
34 |
+
|
35 |
+
|
36 |
+
def pil_loader(path: str) -> Image.Image:
|
37 |
+
# open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835)
|
38 |
+
with open(path, 'rb') as f:
|
39 |
+
img = Image.open(f)
|
40 |
+
return img.convert('RGB')
|
41 |
+
|
42 |
+
|
43 |
+
def dataset_wrapper(data_dir, data_list, **kwargs):
|
44 |
+
if os.path.exists(os.path.join(data_dir, 'data.mdb')):
|
45 |
+
return ImageDataset(data_dir, data_list, **kwargs)
|
46 |
+
else:
|
47 |
+
return ImageFolder(data_dir, data_list, **kwargs)
|
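A usage sketch for `dataset_wrapper`: it returns the lmdb-backed `ImageDataset` when `data.mdb` exists under `data_dir`, otherwise the plain `ImageFolder`. The paths below are placeholders:

```python
from torch.utils.data import DataLoader
from tools.image_dataset import dataset_wrapper, worker_init_fn  # import path assumed

dataset = dataset_wrapper('/path/to/images_or_lmdb', '/path/to/list.csv',
                          secret_len=100, resize=256)
loader = DataLoader(dataset, batch_size=8, shuffle=True,
                    num_workers=4, worker_init_fn=worker_init_fn)
batch = next(iter(loader))
print(batch['image'].shape, batch['secret'].shape)  # e.g. (8, 256, 256, 3) and (8, 100)
```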
48 |
+
|
49 |
+
|
50 |
+
class ImageFolder(torch.utils.data.Dataset):
|
51 |
+
_repr_indent = 4
|
52 |
+
def __init__(self, data_dir, data_list, secret_len=100, resize=256, transform=None, **kwargs):
|
53 |
+
super().__init__()
|
54 |
+
self.transform = transforms.RandomResizedCrop((resize, resize), scale=(0.8, 1.0), ratio=(0.75, 1.3333333333333333)) if transform is None else transform
|
55 |
+
self.build_data(data_dir, data_list, **kwargs)
|
56 |
+
self.kwargs = kwargs
|
57 |
+
self.secret_len = secret_len
|
58 |
+
|
59 |
+
def build_data(self, data_dir, data_list, **kwargs):
|
60 |
+
self.data_dir = data_dir
|
61 |
+
if isinstance(data_list, list):
|
62 |
+
self.data_list = data_list
|
63 |
+
elif isinstance(data_list, str):
|
64 |
+
self.data_list = pd.read_csv(data_list)['path'].tolist()
|
65 |
+
elif isinstance(data_list, pd.DataFrame):
|
66 |
+
self.data_list = data_list['path'].tolist()
|
67 |
+
else:
|
68 |
+
raise ValueError('data_list must be a list, str or pd.DataFrame')
|
69 |
+
self.N = len(self.data_list)
|
70 |
+
|
71 |
+
def __getitem__(self, index):
|
72 |
+
path = self.data_list[index]
|
73 |
+
img = pil_loader(os.path.join(self.data_dir, path))
|
74 |
+
img = self.transform(img)
|
75 |
+
img = np.array(img, dtype=np.float32)/127.5-1. # [-1, 1]
|
76 |
+
secret = torch.zeros(self.secret_len, dtype=torch.float).random_(0, 2)
|
77 |
+
return {'image': img, 'secret': secret} # {'img': x, 'index': index}
|
78 |
+
|
79 |
+
def __len__(self) -> int:
|
80 |
+
# raise NotImplementedError
|
81 |
+
return self.N
|
82 |
+
|
83 |
+
class ImageDataset(torch.utils.data.Dataset):
|
84 |
+
r"""
|
85 |
+
Customised Image Folder class for pytorch.
|
86 |
+
Accepts an lmdb database and a csv list as input.
|
87 |
+
Usage:
|
88 |
+
dataset = ImageDataset(img_dir, img_list)
|
89 |
+
dataset.set_transform(some_pytorch_transforms)
|
90 |
+
loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True,
|
91 |
+
num_workers=4, worker_init_fn=worker_init_fn)
|
92 |
+
for x,y in loader:
|
93 |
+
# x and y are the input and target (dict); the keys can be customised.
|
94 |
+
"""
|
95 |
+
_repr_indent = 4
|
96 |
+
def __init__(self, data_dir, data_list, secret_len=100, resize=None, transform=None, target_transform=None, **kwargs):
|
97 |
+
super().__init__()
|
98 |
+
if resize is not None:
|
99 |
+
self.resize = transforms.Resize((resize, resize))
|
100 |
+
self.set_transform(transform, target_transform)
|
101 |
+
self.build_data(data_dir, data_list, **kwargs)
|
102 |
+
self.secret_len = secret_len
|
103 |
+
self.kwargs = kwargs
|
104 |
+
|
105 |
+
def set_transform(self, transform, target_transform=None):
|
106 |
+
self.transform, self.target_transform = transform, target_transform
|
107 |
+
|
108 |
+
def build_data(self, data_dir, data_list, **kwargs):
|
109 |
+
"""
|
110 |
+
Args:
|
111 |
+
data_list (text file) must have at least 3 fields: id, path and label
|
112 |
+
|
113 |
+
This method must create an attribute self.samples containing ID, input and target samples; and another attribute N storing the dataset size
|
114 |
+
|
115 |
+
Optional attributes: classes (list of unique classes), group (useful for
|
116 |
+
metric learning)
|
117 |
+
"""
|
118 |
+
self.data_dir, self.list = data_dir, data_list
|
119 |
+
if ('dtype' in kwargs) and (kwargs['dtype'].lower() == 'array'):
|
120 |
+
data = ArrayDatabase(data_dir, data_list)
|
121 |
+
else:
|
122 |
+
data = PILlmdb(data_dir, data_list, **kwargs)
|
123 |
+
self.N = len(data)
|
124 |
+
self.classes = np.unique(data.labels)
|
125 |
+
self.samples = {'x': data, 'y': data.labels}
|
126 |
+
# assert isinstance(data_list, str) or isinstance(data_list, pd.DataFrame)
|
127 |
+
# df = pd.read_csv(data_list) if isinstance(data_list, str) else data_list
|
128 |
+
# assert 'id' in df and 'label' in df, f'[DATA] Error! {data_list} must contains "id" and "label".'
|
129 |
+
# ids = df['id'].tolist()
|
130 |
+
# labels = np.array(df['label'].tolist())
|
131 |
+
# data = PILlmdb(data_dir)
|
132 |
+
# assert set(ids).issubset(set(data.keys)) # ids should exist in lmdb
|
133 |
+
# self.N = len(ids)
|
134 |
+
# self.classes, inds = np.unique(labels, return_index=True)
|
135 |
+
# self.samples = {'id': ids, 'x': data, 'y': labels}
|
136 |
+
|
137 |
+
def set_ids(self, ids):
|
138 |
+
self.samples['x'].set_ids(ids)
|
139 |
+
self.samples['y'] = [self.samples['y'][i] for i in ids]
|
140 |
+
self.N = len(self.samples['x'])
|
141 |
+
|
142 |
+
def __getitem__(self, index: int) -> Any:
|
143 |
+
"""
|
144 |
+
Args:
|
145 |
+
index (int): Index
|
146 |
+
Returns:
|
147 |
+
dict: (x: sample, y: target, **kwargs)
|
148 |
+
"""
|
149 |
+
x, y = self.samples['x'][index], self.samples['y'][index]
|
150 |
+
if hasattr(self, 'resize'):
|
151 |
+
x = self.resize(x)
|
152 |
+
if self.transform is not None:
|
153 |
+
x = self.transform(x)
|
154 |
+
if self.target_transform is not None:
|
155 |
+
y = self.target_transform(y)
|
156 |
+
x = np.array(x, dtype=np.float32)/127.5-1.
|
157 |
+
secret = torch.zeros(self.secret_len, dtype=torch.float).random_(0, 2)
|
158 |
+
return {'image': x, 'secret': secret} # {'img': x, 'index': index}
|
159 |
+
|
160 |
+
def __len__(self) -> int:
|
161 |
+
# raise NotImplementedError
|
162 |
+
return self.N
|
163 |
+
|
164 |
+
def __repr__(self) -> str:
|
165 |
+
head = "\nDataset " + self.__class__.__name__
|
166 |
+
body = ["Number of datapoints: {}".format(self.__len__())]
|
167 |
+
if hasattr(self, 'data_dir') and self.data_dir is not None:
|
168 |
+
body.append("data_dir location: {}".format(self.data_dir))
|
169 |
+
if hasattr(self, 'kwargs'):
|
170 |
+
body.append(f'kwargs: {self.kwargs}')
|
171 |
+
body += self.extra_repr().splitlines()
|
172 |
+
if hasattr(self, "transform") and self.transform is not None:
|
173 |
+
body += [repr(self.transform)]
|
174 |
+
lines = [head] + [" " * self._repr_indent + line for line in body]
|
175 |
+
return '\n'.join(lines)
|
176 |
+
|
177 |
+
def _format_transform_repr(self, transform: Callable, head: str) -> List[str]:
|
178 |
+
lines = transform.__repr__().splitlines()
|
179 |
+
return (["{}{}".format(head, lines[0])] +
|
180 |
+
["{}{}".format(" " * len(head), line) for line in lines[1:]])
|
181 |
+
|
182 |
+
def extra_repr(self) -> str:
|
183 |
+
return ""
|
184 |
+
|
tools/image_dataset_generic.py
ADDED
@@ -0,0 +1,157 @@
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
"""
|
4 |
+
imagefolder loader
|
5 |
+
inspired from https://github.com/adambielski/siamese-triplet/blob/master/datasets.py
|
6 |
+
@author: Tu Bui @surrey.ac.uk
|
7 |
+
"""
|
8 |
+
from __future__ import absolute_import
|
9 |
+
from __future__ import division
|
10 |
+
from __future__ import print_function
|
11 |
+
import os
|
12 |
+
import sys
|
13 |
+
import io
|
14 |
+
import time
|
15 |
+
import pandas as pd
|
16 |
+
import numpy as np
|
17 |
+
import random
|
18 |
+
from PIL import Image
|
19 |
+
from typing import Any, Callable, List, Optional, Tuple
|
20 |
+
import torch
|
21 |
+
from .base_lmdb import PILlmdb, ArrayDatabase
|
22 |
+
from torchvision import transforms
|
23 |
+
# from . import debug
|
24 |
+
|
25 |
+
|
26 |
+
def worker_init_fn(worker_id):
|
27 |
+
# to be passed to torch.utils.data.DataLoader to fix the
|
28 |
+
# random seed issue with numpy in multi-worker settings
|
29 |
+
torch_seed = torch.initial_seed()
|
30 |
+
random.seed(torch_seed + worker_id)
|
31 |
+
if torch_seed >= 2**30: # make sure torch_seed + worker_id < 2**32
|
32 |
+
torch_seed = torch_seed % 2**30
|
33 |
+
np.random.seed(torch_seed + worker_id)
|
34 |
+
|
35 |
+
|
36 |
+
def pil_loader(path: str) -> Image.Image:
|
37 |
+
# open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835)
|
38 |
+
with open(path, 'rb') as f:
|
39 |
+
img = Image.open(f)
|
40 |
+
return img.convert('RGB')
|
41 |
+
|
42 |
+
|
43 |
+
class ImageDataset(torch.utils.data.Dataset):
|
44 |
+
r"""
|
45 |
+
Customised Image Folder class for pytorch.
|
46 |
+
Accepts an lmdb database and a csv list as input.
|
47 |
+
Usage:
|
48 |
+
dataset = ImageDataset(img_dir, img_list)
|
49 |
+
dataset.set_transform(some_pytorch_transforms)
|
50 |
+
loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True,
|
51 |
+
num_workers=4, worker_init_fn=worker_init_fn)
|
52 |
+
for x,y in loader:
|
53 |
+
# x and y are the input and target (dict); the keys can be customised.
|
54 |
+
"""
|
55 |
+
_repr_indent = 4
|
56 |
+
def __init__(self, data_dir, data_list, secret_len=100, transform=None, target_transform=None, **kwargs):
|
57 |
+
super().__init__()
|
58 |
+
self.set_transform(transform, target_transform)
|
59 |
+
self.build_data(data_dir, data_list, **kwargs)
|
60 |
+
self.secret_len = secret_len
|
61 |
+
self.kwargs = kwargs
|
62 |
+
|
63 |
+
def set_transform(self, transform, target_transform=None):
|
64 |
+
self.transform, self.target_transform = transform, target_transform
|
65 |
+
|
66 |
+
def build_data(self, data_dir, data_list, **kwargs):
|
67 |
+
"""
|
68 |
+
Args:
|
69 |
+
data_list (text file) must have at least 3 fields: id, path and label
|
70 |
+
|
71 |
+
This method must create an attribute self.samples containing ID, input and target samples; and another attribute N storing the dataset size
|
72 |
+
|
73 |
+
Optional attributes: classes (list of unique classes), group (useful for
|
74 |
+
metric learning)
|
75 |
+
"""
|
76 |
+
self.data_dir, self.list = data_dir, data_list
|
77 |
+
if ('dtype' in kwargs) and (kwargs['dtype'].lower() == 'array'):
|
78 |
+
data = ArrayDatabase(data_dir, data_list)
|
79 |
+
else:
|
80 |
+
data = PILlmdb(data_dir, data_list, **kwargs)
|
81 |
+
self.N = len(data)
|
82 |
+
self.classes = np.unique(data.labels)
|
83 |
+
self.samples = {'x': data, 'y': data.labels}
|
84 |
+
|
85 |
+
def __getitem__(self, index: int) -> Any:
|
86 |
+
"""
|
87 |
+
Args:
|
88 |
+
index (int): Index
|
89 |
+
Returns:
|
90 |
+
dict: (x: sample, y: target, **kwargs)
|
91 |
+
"""
|
92 |
+
x, y = self.samples['x'][index], self.samples['y'][index]
|
93 |
+
if self.transform is not None:
|
94 |
+
x = self.transform(x)
|
95 |
+
if self.target_transform is not None:
|
96 |
+
y = self.target_transform(y)
|
97 |
+
x = np.array(x, dtype=np.float32)/127.5-1.
|
98 |
+
secret = torch.zeros(self.secret_len, dtype=torch.float).random_(0, 2)
|
99 |
+
return {'image': x, 'secret': secret} # {'img': x, 'index': index}
|
100 |
+
|
101 |
+
def __len__(self) -> int:
|
102 |
+
# raise NotImplementedError
|
103 |
+
return self.N
|
104 |
+
|
105 |
+
def __repr__(self) -> str:
|
106 |
+
head = "\nDataset " + self.__class__.__name__
|
107 |
+
body = ["Number of datapoints: {}".format(self.__len__())]
|
108 |
+
if hasattr(self, 'data_dir') and self.data_dir is not None:
|
109 |
+
body.append("data_dir location: {}".format(self.data_dir))
|
110 |
+
if hasattr(self, 'kwargs'):
|
111 |
+
body.append(f'kwargs: {self.kwargs}')
|
112 |
+
body += self.extra_repr().splitlines()
|
113 |
+
if hasattr(self, "transform") and self.transform is not None:
|
114 |
+
body += [repr(self.transform)]
|
115 |
+
lines = [head] + [" " * self._repr_indent + line for line in body]
|
116 |
+
return '\n'.join(lines)
|
117 |
+
|
118 |
+
def _format_transform_repr(self, transform: Callable, head: str) -> List[str]:
|
119 |
+
lines = transform.__repr__().splitlines()
|
120 |
+
return (["{}{}".format(head, lines[0])] +
|
121 |
+
["{}{}".format(" " * len(head), line) for line in lines[1:]])
|
122 |
+
|
123 |
+
def extra_repr(self) -> str:
|
124 |
+
return ""
|
125 |
+
|
126 |
+
class ImageFolder(torch.utils.data.Dataset):
|
127 |
+
_repr_indent = 4
|
128 |
+
def __init__(self, data_dir, data_list, secret_len=100, resize=256, transform=None, **kwargs):
|
129 |
+
super().__init__()
|
130 |
+
self.transform = transforms.Resize((resize, resize)) if transform is None else transform
|
131 |
+
self.build_data(data_dir, data_list, **kwargs)
|
132 |
+
self.kwargs = kwargs
|
133 |
+
self.secret_len = secret_len
|
134 |
+
|
135 |
+
def build_data(self, data_dir, data_list, **kwargs):
|
136 |
+
self.data_dir = data_dir
|
137 |
+
if isinstance(data_list, list):
|
138 |
+
self.data_list = data_list
|
139 |
+
elif isinstance(data_list, str):
|
140 |
+
self.data_list = pd.read_csv(data_list)['path'].tolist()
|
141 |
+
elif isinstance(data_list, pd.DataFrame):
|
142 |
+
self.data_list = data_list['path'].tolist()
|
143 |
+
else:
|
144 |
+
raise ValueError('data_list must be a list, str or pd.DataFrame')
|
145 |
+
self.N = len(self.data_list)
|
146 |
+
|
147 |
+
def __getitem__(self, index):
|
148 |
+
path = self.data_list[index]
|
149 |
+
img = pil_loader(os.path.join(self.data_dir, path))
|
150 |
+
img = self.transform(img)
|
151 |
+
img = np.array(img, dtype=np.float32)/127.5-1. # [-1, 1]
|
152 |
+
secret = torch.zeros(self.secret_len, dtype=torch.float).random_(0, 2) # not used
|
153 |
+
return {'image': img, 'secret': secret} # {'img': x, 'index': index}
|
154 |
+
|
155 |
+
def __len__(self) -> int:
|
156 |
+
# raise NotImplementedError
|
157 |
+
return self.N
|
tools/image_tools.py
ADDED
@@ -0,0 +1,164 @@
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
"""
|
4 |
+
|
5 |
+
@author: Tu Bui @surrey.ac.uk
|
6 |
+
"""
|
7 |
+
from __future__ import absolute_import
|
8 |
+
from __future__ import division
|
9 |
+
from __future__ import print_function
|
10 |
+
from scipy import fftpack
|
11 |
+
import sys, os
|
12 |
+
from pathlib import Path
|
13 |
+
import numpy as np
|
14 |
+
import random
|
15 |
+
import glob
|
16 |
+
import json
|
17 |
+
import time
|
18 |
+
import importlib
|
19 |
+
import pandas as pd
|
20 |
+
from tqdm import tqdm
|
21 |
+
# from IPython.display import display
|
22 |
+
# import seaborn as sns
|
23 |
+
import matplotlib
|
24 |
+
# matplotlib.use('Agg') # headless run
|
25 |
+
import matplotlib.pyplot as plt
|
26 |
+
import matplotlib.patches as mpatches
|
27 |
+
from PIL import Image, ImageDraw, ImageFont
|
28 |
+
cmap = plt.get_cmap("tab10") # cmap as function
|
29 |
+
cmap = plt.rcParams['axes.prop_cycle'].by_key()['color'] # cmap
|
30 |
+
|
31 |
+
FONT = '/vol/research/tubui1/_base/utils/FreeSans.ttf'
|
32 |
+
|
33 |
+
# def imshow(im):
|
34 |
+
# if type(im) is np.ndarray:
|
35 |
+
# im = Image.fromarray(im)
|
36 |
+
# display(im)
|
37 |
+
|
38 |
+
def make_grid(array_list, gsize=(3,3)):
|
39 |
+
"""
|
40 |
+
make a grid image from a list of image array (RGB)
|
41 |
+
return: array RGB
|
42 |
+
"""
|
43 |
+
assert len(gsize)==2 and gsize[0]*gsize[1]==len(array_list)
|
44 |
+
h,w,c = array_list[0].shape
|
45 |
+
out = np.array(array_list).reshape(gsize[0], gsize[1], h, w, c).transpose(0, 2, 1, 3, 4).reshape(gsize[0]*h, gsize[1]*w, c)
|
46 |
+
return out
|
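A quick sketch of `make_grid`, tiling nine random RGB arrays into a 3x3 grid (import path assumed):

```python
import numpy as np
from PIL import Image
from tools.image_tools import make_grid

tiles = [np.random.randint(0, 255, (64, 64, 3), dtype=np.uint8) for _ in range(9)]
grid = make_grid(tiles, gsize=(3, 3))
print(grid.shape)  # (192, 192, 3)
Image.fromarray(grid).save('/tmp/grid.png')
```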
47 |
+
|
48 |
+
def collage(im_list, size=None, pad=0, color=255):
|
49 |
+
"""
|
50 |
+
generalised function of make_grid()
|
51 |
+
work on PIL/numpy images of arbitrary size
|
52 |
+
"""
|
53 |
+
if size is None:
|
54 |
+
size=(1, len(im_list))
|
55 |
+
assert len(size)==2
|
56 |
+
if isinstance(im_list[0], np.ndarray):
|
57 |
+
im_list = [Image.fromarray(im) for im in im_list]
|
58 |
+
h, w = size
|
59 |
+
n = len(im_list)
|
60 |
+
canvas = []
|
61 |
+
for i in range(h):
|
62 |
+
start, end = i*w, min((i+1)*w, n)
|
63 |
+
row = combine_horz(im_list[start:end], pad, color)
|
64 |
+
canvas.append(row)
|
65 |
+
canvas = combine_vert(canvas, pad, color)
|
66 |
+
return canvas
|
67 |
+
|
68 |
+
def combine_horz(pil_ims, pad=0, c=255):
|
69 |
+
"""
|
70 |
+
Combines multiple pil_ims into a single side-by-side PIL image object.
|
71 |
+
"""
|
72 |
+
widths, heights = zip(*(i.size for i in pil_ims))
|
73 |
+
total_width = sum(widths) + (len(pil_ims)-1) * pad
|
74 |
+
max_height = max(heights)
|
75 |
+
color = (c,c,c)
|
76 |
+
new_im = Image.new('RGB', (total_width, max_height), color)
|
77 |
+
x_offset = 0
|
78 |
+
for im in pil_ims:
|
79 |
+
new_im.paste(im, (x_offset,0))
|
80 |
+
x_offset += (im.size[0] + pad)
|
81 |
+
return new_im
|
82 |
+
|
83 |
+
|
84 |
+
def combine_vert(pil_ims, pad=0, c=255):
|
85 |
+
"""
|
86 |
+
Combines multiple pil_ims into a single vertical PIL image object.
|
87 |
+
"""
|
88 |
+
widths, heights = zip(*(i.size for i in pil_ims))
|
89 |
+
max_width = max(widths)
|
90 |
+
total_height = sum(heights) + (len(pil_ims)-1)*pad
|
91 |
+
color = (c,c,c)
|
92 |
+
new_im = Image.new('RGB', (max_width, total_height), color)
|
93 |
+
y_offset = 0
|
94 |
+
for im in pil_ims:
|
95 |
+
new_im.paste(im, (0,y_offset))
|
96 |
+
y_offset += (im.size[1] + pad)
|
97 |
+
return new_im
|
98 |
+
|
99 |
+
def make_text_image(img_shape=(100,20), text='hello', font_path=FONT, offset=(0,0), font_size=16):
|
100 |
+
"""
|
101 |
+
make a text image with given width/height and font size
|
102 |
+
Args:
|
103 |
+
img_shape, offset tuple (width, height)
|
104 |
+
font_path path to font file (TrueType)
|
105 |
+
font_size max font size, actual may smaller
|
106 |
+
|
107 |
+
Return:
|
108 |
+
pil image
|
109 |
+
"""
|
110 |
+
im = Image.new('RGB', tuple(img_shape), (255,255,255))
|
111 |
+
draw = ImageDraw.Draw(im)
|
112 |
+
|
113 |
+
def get_font_size(max_font_size):
|
114 |
+
font = ImageFont.truetype(font_path, max_font_size)
|
115 |
+
text_size = font.getsize(text) # (w,h)
|
116 |
+
start_w = int((img_shape[0] - text_size[0]) / 2)
|
117 |
+
start_h = int((img_shape[1] - text_size[1])/2)
|
118 |
+
if start_h <0 or start_w < 0:
|
119 |
+
return get_font_size(max_font_size-2)
|
120 |
+
else:
|
121 |
+
return font, (start_w, start_h)
|
122 |
+
font, pos = get_font_size(font_size)
|
123 |
+
pos = (pos[0]+offset[0], pos[1]+offset[1])
|
124 |
+
draw.text(pos, text, font=font, fill=0)
|
125 |
+
return im
|
126 |
+
|
127 |
+
|
128 |
+
def log_scale(array, epsilon=1e-12):
|
129 |
+
"""Log scale the input array.
|
130 |
+
"""
|
131 |
+
array = np.abs(array)
|
132 |
+
array += epsilon # no zero in log
|
133 |
+
array = np.log(array)
|
134 |
+
return array
|
135 |
+
|
136 |
+
def dct2(array):
|
137 |
+
"""2D DCT"""
|
138 |
+
array = fftpack.dct(array, type=2, norm="ortho", axis=0)
|
139 |
+
array = fftpack.dct(array, type=2, norm="ortho", axis=1)
|
140 |
+
return array
|
141 |
+
|
142 |
+
def idct2(array):
|
143 |
+
"""inverse 2D DCT"""
|
144 |
+
array = fftpack.idct(array, type=2, norm="ortho", axis=0)
|
145 |
+
array = fftpack.idct(array, type=2, norm="ortho", axis=1)
|
146 |
+
return array
|
147 |
+
|
148 |
+
|
149 |
+
class DCT(object):
|
150 |
+
def __init__(self, log=True):
|
151 |
+
self.log = log
|
152 |
+
|
153 |
+
def __call__(self, x):
|
154 |
+
x = np.array(x)
|
155 |
+
x = dct2(x)
|
156 |
+
if self.log:
|
157 |
+
x = log_scale(x)
|
158 |
+
# normalize
|
159 |
+
x = np.clip((x - x.min())/(x.max() - x.min()) * 255, 0, 255).astype(np.uint8)
|
160 |
+
return Image.fromarray(x)
|
161 |
+
|
162 |
+
def __repr__(self):
|
163 |
+
s = f'(Discrete Cosine Transform, logarithm={self.log})'
|
164 |
+
return self.__class__.__name__ + s
|
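The `DCT` callable above can drop into a torchvision transform pipeline; a minimal sketch with a random image (import path assumed):

```python
import numpy as np
from PIL import Image
from torchvision import transforms
from tools.image_tools import DCT

tform = transforms.Compose([
    transforms.Resize((256, 256)),
    DCT(log=True),   # log-scaled 2D DCT, re-normalised to a uint8 PIL image
])
img = Image.fromarray(np.random.randint(0, 255, (300, 300, 3), dtype=np.uint8))
spec = tform(img)
print(spec.size)     # (256, 256)
```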
tools/imgcap_dataset.py
ADDED
@@ -0,0 +1,163 @@
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
"""
|
4 |
+
Dataset class for image-caption
|
5 |
+
@author: Tu Bui @University of Surrey
|
6 |
+
"""
|
7 |
+
import json
|
8 |
+
from PIL import Image
|
9 |
+
import numpy as np
|
10 |
+
from pathlib import Path
|
11 |
+
import torch
|
12 |
+
from torch.utils.data import Dataset, DataLoader
|
13 |
+
from functools import partial
|
14 |
+
import pytorch_lightning as pl
|
15 |
+
from ldm.util import instantiate_from_config
|
16 |
+
import pandas as pd
|
17 |
+
|
18 |
+
|
19 |
+
def worker_init_fn(_):
|
20 |
+
worker_info = torch.utils.data.get_worker_info()
|
21 |
+
worker_id = worker_info.id
|
22 |
+
return np.random.seed(np.random.get_state()[1][0] + worker_id)
|
23 |
+
|
24 |
+
|
25 |
+
class WrappedDataset(Dataset):
|
26 |
+
"""Wraps an arbitrary object with __len__ and __getitem__ into a pytorch dataset"""
|
27 |
+
|
28 |
+
def __init__(self, dataset):
|
29 |
+
self.data = dataset
|
30 |
+
|
31 |
+
def __len__(self):
|
32 |
+
return len(self.data)
|
33 |
+
|
34 |
+
def __getitem__(self, idx):
|
35 |
+
return self.data[idx]
|
36 |
+
|
37 |
+
|
38 |
+
class DataModuleFromConfig(pl.LightningDataModule):
|
39 |
+
def __init__(self, batch_size, train=None, validation=None, test=None, predict=None, wrap=False, num_workers=None, shuffle_test_loader=False, use_worker_init_fn=False,
|
40 |
+
shuffle_val_dataloader=False):
|
41 |
+
super().__init__()
|
42 |
+
self.batch_size = batch_size
|
43 |
+
self.dataset_configs = dict()
|
44 |
+
self.num_workers = num_workers if num_workers is not None else batch_size * 2
|
45 |
+
self.use_worker_init_fn = use_worker_init_fn
|
46 |
+
if train is not None:
|
47 |
+
self.dataset_configs["train"] = train
|
48 |
+
self.train_dataloader = self._train_dataloader
|
49 |
+
if validation is not None:
|
50 |
+
self.dataset_configs["validation"] = validation
|
51 |
+
self.val_dataloader = partial(self._val_dataloader, shuffle=shuffle_val_dataloader)
|
52 |
+
if test is not None:
|
53 |
+
self.dataset_configs["test"] = test
|
54 |
+
self.test_dataloader = partial(self._test_dataloader, shuffle=shuffle_test_loader)
|
55 |
+
if predict is not None:
|
56 |
+
self.dataset_configs["predict"] = predict
|
57 |
+
self.predict_dataloader = self._predict_dataloader
|
58 |
+
self.wrap = wrap
|
59 |
+
|
60 |
+
def prepare_data(self):
|
61 |
+
for data_cfg in self.dataset_configs.values():
|
62 |
+
instantiate_from_config(data_cfg)
|
63 |
+
|
64 |
+
def setup(self, stage=None):
|
65 |
+
self.datasets = dict(
|
66 |
+
(k, instantiate_from_config(self.dataset_configs[k]))
|
67 |
+
for k in self.dataset_configs)
|
68 |
+
if self.wrap:
|
69 |
+
for k in self.datasets:
|
70 |
+
self.datasets[k] = WrappedDataset(self.datasets[k])
|
71 |
+
|
72 |
+
def _train_dataloader(self):
|
73 |
+
if self.use_worker_init_fn:
|
74 |
+
init_fn = worker_init_fn
|
75 |
+
else:
|
76 |
+
init_fn = None
|
77 |
+
return DataLoader(self.datasets["train"], batch_size=self.batch_size,
|
78 |
+
num_workers=self.num_workers, shuffle=True,
|
79 |
+
worker_init_fn=init_fn)
|
80 |
+
|
81 |
+
def _val_dataloader(self, shuffle=False):
|
82 |
+
if self.use_worker_init_fn:
|
83 |
+
init_fn = worker_init_fn
|
84 |
+
else:
|
85 |
+
init_fn = None
|
86 |
+
return DataLoader(self.datasets["validation"],
|
87 |
+
batch_size=self.batch_size,
|
88 |
+
num_workers=self.num_workers,
|
89 |
+
worker_init_fn=init_fn,
|
90 |
+
shuffle=shuffle)
|
91 |
+
|
92 |
+
def _test_dataloader(self, shuffle=False):
|
93 |
+
if self.use_worker_init_fn:
|
94 |
+
init_fn = worker_init_fn
|
95 |
+
else:
|
96 |
+
init_fn = None
|
97 |
+
|
98 |
+
return DataLoader(self.datasets["test"], batch_size=self.batch_size,
|
99 |
+
num_workers=self.num_workers, worker_init_fn=init_fn, shuffle=shuffle)
|
100 |
+
|
101 |
+
def _predict_dataloader(self, shuffle=False):
|
102 |
+
if self.use_worker_init_fn:
|
103 |
+
init_fn = worker_init_fn
|
104 |
+
else:
|
105 |
+
init_fn = None
|
106 |
+
return DataLoader(self.datasets["predict"], batch_size=self.batch_size,
|
107 |
+
num_workers=self.num_workers, worker_init_fn=init_fn)
|
108 |
+
|
109 |
+
|
110 |
+
class ImageCaptionRaw(Dataset):
|
111 |
+
def __init__(self, image_dir, caption_file, secret_len=100, transform=None):
|
112 |
+
super().__init__()
|
113 |
+
self.image_dir = Path(image_dir)
|
114 |
+
self.data = []
|
115 |
+
with open(caption_file, 'rt') as f:
|
116 |
+
for line in f:
|
117 |
+
self.data.append(json.loads(line))
|
118 |
+
self.secret_len = secret_len
|
119 |
+
self.transform = transform
|
120 |
+
|
121 |
+
def __len__(self):
|
122 |
+
return len(self.data)
|
123 |
+
|
124 |
+
def __getitem__(self, idx):
|
125 |
+
item = self.data[idx]
|
126 |
+
image = Image.open(self.image_dir/item['image']).convert('RGB').resize((512,512))
|
127 |
+
caption = item['captions']
|
128 |
+
cid = torch.randint(0, len(caption), (1,)).item()
|
129 |
+
caption = caption[cid]
|
130 |
+
if self.transform is not None:
|
131 |
+
image = self.transform(image)
|
132 |
+
|
133 |
+
image = np.array(image, dtype=np.float32)/ 255.0 # normalize to [0, 1]
|
134 |
+
target = image * 2.0 - 1.0 # normalize to [-1, 1]
|
135 |
+
secret = torch.zeros(self.secret_len, dtype=torch.float).random_(0, 2)
|
136 |
+
return dict(image=image, caption=caption, target=target, secret=secret)
|
137 |
+
|
138 |
+
|
139 |
+
class BAMFG(Dataset):
|
140 |
+
def __init__(self, style_dir, gt_dir, data_list, transform=None):
|
141 |
+
super().__init__()
|
142 |
+
self.style_dir = Path(style_dir)
|
143 |
+
self.gt_dir = Path(gt_dir)
|
144 |
+
self.data = pd.read_csv(data_list)
|
145 |
+
self.transform = transform
|
146 |
+
|
147 |
+
def __len__(self):
|
148 |
+
return len(self.data)
|
149 |
+
|
150 |
+
def __getitem__(self, idx):
|
151 |
+
item = self.data.iloc[idx]
|
152 |
+
gt_img = Image.open(self.gt_dir/item['gt_img']).convert('RGB').resize((512,512))
|
153 |
+
style_img = Image.open(self.style_dir/item['style_img']).convert('RGB').resize((512,512))
|
154 |
+
txt = item['prompt']
|
155 |
+
if self.transform is not None:
|
156 |
+
gt_img = self.transform(gt_img)
|
157 |
+
style_img = self.transform(style_img)
|
158 |
+
|
159 |
+
gt_img = np.array(gt_img, dtype=np.float32)/ 255.0 # normalize to [0, 1]
|
160 |
+
style_img = np.array(style_img, dtype=np.float32)/ 255.0 # normalize to [0, 1]
|
161 |
+
target = gt_img * 2.0 - 1.0 # normalize to [-1, 1]
|
162 |
+
|
163 |
+
return dict(image=gt_img, txt=txt, hint=style_img)
|
tools/sifid.py
ADDED
@@ -0,0 +1,246 @@
1 |
+
import numpy as np
|
2 |
+
import torch
|
3 |
+
from scipy import linalg
|
4 |
+
import torchvision
|
5 |
+
from torchvision import transforms
|
6 |
+
import torch.nn as nn
|
7 |
+
import torch.nn.functional as F
|
8 |
+
from PIL import Image
|
9 |
+
|
10 |
+
|
11 |
+
def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
|
12 |
+
"""Numpy implementation of the Frechet Distance.
|
13 |
+
The Frechet distance between two multivariate Gaussians X_1 ~ N(mu_1, C_1)
|
14 |
+
and X_2 ~ N(mu_2, C_2) is
|
15 |
+
d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)).
|
16 |
+
Stable version by Dougal J. Sutherland.
|
17 |
+
Params:
|
18 |
+
-- mu1 : Numpy array containing the activations of a layer of the
|
19 |
+
inception net (like returned by the function 'get_predictions')
|
20 |
+
for generated samples.
|
21 |
+
-- mu2 : The sample mean over activations, precalculated on an
|
22 |
+
representative data set.
|
23 |
+
-- sigma1: The covariance matrix over activations for generated samples.
|
24 |
+
-- sigma2: The covariance matrix over activations, precalculated on an
|
25 |
+
representative data set.
|
26 |
+
Returns:
|
27 |
+
-- : The Frechet Distance.
|
28 |
+
"""
|
29 |
+
|
30 |
+
mu1 = np.atleast_1d(mu1)
|
31 |
+
mu2 = np.atleast_1d(mu2)
|
32 |
+
|
33 |
+
sigma1 = np.atleast_2d(sigma1)
|
34 |
+
sigma2 = np.atleast_2d(sigma2)
|
35 |
+
|
36 |
+
assert mu1.shape == mu2.shape, \
|
37 |
+
'Training and test mean vectors have different lengths'
|
38 |
+
assert sigma1.shape == sigma2.shape, \
|
39 |
+
'Training and test covariances have different dimensions'
|
40 |
+
|
41 |
+
diff = mu1 - mu2
|
42 |
+
|
43 |
+
# Product might be almost singular
|
44 |
+
covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
|
45 |
+
if not np.isfinite(covmean).all():
|
46 |
+
msg = ('fid calculation produces singular product; '
|
47 |
+
'adding %s to diagonal of cov estimates') % eps
|
48 |
+
print(msg)
|
49 |
+
offset = np.eye(sigma1.shape[0]) * eps
|
50 |
+
covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))
|
51 |
+
|
52 |
+
# Numerical error might give slight imaginary component
|
53 |
+
if np.iscomplexobj(covmean):
|
54 |
+
if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
|
55 |
+
m = np.max(np.abs(covmean.imag))
|
56 |
+
raise ValueError('Imaginary component {}'.format(m))
|
57 |
+
covmean = covmean.real
|
58 |
+
|
59 |
+
tr_covmean = np.trace(covmean)
|
60 |
+
|
61 |
+
return (diff.dot(diff) + np.trace(sigma1) +
|
62 |
+
np.trace(sigma2) - 2 * tr_covmean)
|
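A numerical sanity-check sketch for `calculate_frechet_distance` using synthetic activation statistics (import path assumed):

```python
import numpy as np
from tools.sifid import calculate_frechet_distance

rng = np.random.default_rng(0)
feats_a = rng.normal(size=(500, 64))
feats_b = rng.normal(loc=0.5, size=(500, 64))

mu_a, sig_a = feats_a.mean(axis=0), np.cov(feats_a, rowvar=False)
mu_b, sig_b = feats_b.mean(axis=0), np.cov(feats_b, rowvar=False)

print(calculate_frechet_distance(mu_a, sig_a, mu_a, sig_a))  # ~0 for identical stats
print(calculate_frechet_distance(mu_a, sig_a, mu_b, sig_b))  # grows with the mean shift
```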
63 |
+
|
64 |
+
|
65 |
+
class SIFID(object):
|
66 |
+
def __init__(self, dims=64) -> None:
|
67 |
+
block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims]
|
68 |
+
self.model = InceptionV3([block_idx]).cuda()
|
69 |
+
self.model.eval()
|
70 |
+
self.dims = dims
|
71 |
+
|
72 |
+
def calculate_activation_statistics(self, x):
|
73 |
+
act = self.get_activations(x)
|
74 |
+
mu = np.mean(act, axis=0)
|
75 |
+
sigma = np.cov(act, rowvar=False)
|
76 |
+
return mu, sigma
|
77 |
+
|
78 |
+
def get_activations(self, x):
|
79 |
+
# x tensor (B, C, H, W) in range [0, 1]
|
80 |
+
batch_size = x.shape[0]
|
81 |
+
with torch.no_grad():
|
82 |
+
pred = self.model(x)[0]
|
83 |
+
pred = pred.cpu().numpy()
|
84 |
+
pred = pred.transpose(0, 2, 3, 1).reshape(batch_size*pred.shape[2]*pred.shape[3],-1)
|
85 |
+
return pred
|
86 |
+
|
87 |
+
def __call__(self, x1, x2):
|
88 |
+
# x1, x2: single image tensors (C, H, W) in range [-1, 1]; a batch dim is added below
|
89 |
+
x1, x2 = (x1 + 1.)/2, (x2 + 1.)/2 # [-1, 1] -> [0, 1]
|
90 |
+
m1, s1 = self.calculate_activation_statistics(x1.unsqueeze(0).cuda())
|
91 |
+
m2, s2 = self.calculate_activation_statistics(x2.unsqueeze(0).cuda())
|
92 |
+
return calculate_frechet_distance(m1, s1, m2, s2)
|
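A SIFID usage sketch; note the class above moves the Inception model to CUDA, so a GPU (and the pretrained InceptionV3 weights, downloaded on first use) is required, and inputs are single images in [-1, 1]:

```python
import torch
from tools.sifid import SIFID  # import path assumed

sifid = SIFID(dims=64)
x1 = torch.rand(3, 256, 256) * 2 - 1                  # (C, H, W) in [-1, 1]
x2 = (x1 + 0.05 * torch.randn_like(x1)).clamp(-1, 1)  # slightly perturbed copy
print(sifid(x1, x2))                                  # small score for similar images
```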
93 |
+
|
94 |
+
|
95 |
+
class InceptionV3(nn.Module):
|
96 |
+
"""Pretrained InceptionV3 network returning feature maps"""
|
97 |
+
|
98 |
+
# Index of default block of inception to return,
|
99 |
+
# corresponds to output of final average pooling
|
100 |
+
DEFAULT_BLOCK_INDEX = 3
|
101 |
+
|
102 |
+
# Maps feature dimensionality to their output blocks indices
|
103 |
+
BLOCK_INDEX_BY_DIM = {
|
104 |
+
64: 0, # First max pooling features
|
105 |
+
192: 1, # Second max pooling features
|
106 |
+
768: 2, # Pre-aux classifier features
|
107 |
+
2048: 3 # Final average pooling features
|
108 |
+
}
|
109 |
+
|
+    def __init__(self,
+                 output_blocks=[DEFAULT_BLOCK_INDEX],
+                 resize_input=False,
+                 normalize_input=True,
+                 requires_grad=False):
+        """Build pretrained InceptionV3
+        Parameters
+        ----------
+        output_blocks : list of int
+            Indices of blocks to return features of. Possible values are:
+                - 0: corresponds to output of first max pooling
+                - 1: corresponds to output of second max pooling
+                - 2: corresponds to output which is fed to aux classifier
+                - 3: corresponds to output of final average pooling
+        resize_input : bool
+            If true, bilinearly resizes input to width and height 299 before
+            feeding input to model. As the network without fully connected
+            layers is fully convolutional, it should be able to handle inputs
+            of arbitrary size, so resizing might not be strictly needed
+        normalize_input : bool
+            If true, scales the input from range (0, 1) to the range the
+            pretrained Inception network expects, namely (-1, 1)
+        requires_grad : bool
+            If true, parameters of the model require gradient. Possibly useful
+            for finetuning the network
+        """
+        super(InceptionV3, self).__init__()
+
+        self.resize_input = resize_input
+        self.normalize_input = normalize_input
+        self.output_blocks = sorted(output_blocks)
+        self.last_needed_block = max(output_blocks)
+
+        assert self.last_needed_block <= 3, \
+            'Last possible output block index is 3'
+
+        self.blocks = nn.ModuleList()
+
+        inception = torchvision.models.inception_v3(pretrained=True)
+
+        # Block 0: input to maxpool1
+        block0 = [
+            inception.Conv2d_1a_3x3,
+            inception.Conv2d_2a_3x3,
+            inception.Conv2d_2b_3x3,
+        ]
+
+        self.blocks.append(nn.Sequential(*block0))
+
+        # Block 1: maxpool1 to maxpool2
+        if self.last_needed_block >= 1:
+            block1 = [
+                nn.MaxPool2d(kernel_size=3, stride=2),
+                inception.Conv2d_3b_1x1,
+                inception.Conv2d_4a_3x3,
+            ]
+            self.blocks.append(nn.Sequential(*block1))
+
+        # Block 2: maxpool2 to aux classifier
+        if self.last_needed_block >= 2:
+            block2 = [
+                nn.MaxPool2d(kernel_size=3, stride=2),
+                inception.Mixed_5b,
+                inception.Mixed_5c,
+                inception.Mixed_5d,
+                inception.Mixed_6a,
+                inception.Mixed_6b,
+                inception.Mixed_6c,
+                inception.Mixed_6d,
+                inception.Mixed_6e,
+            ]
+            self.blocks.append(nn.Sequential(*block2))
+
+        # Block 3: aux classifier to final avgpool
+        if self.last_needed_block >= 3:
+            block3 = [
+                inception.Mixed_7a,
+                inception.Mixed_7b,
+                inception.Mixed_7c,
+            ]
+            self.blocks.append(nn.Sequential(*block3))
+
+        if self.last_needed_block >= 4:
+            block4 = [
+                nn.AdaptiveAvgPool2d(output_size=(1, 1))
+            ]
+            self.blocks.append(nn.Sequential(*block4))
+
+        for param in self.parameters():
+            param.requires_grad = requires_grad
+
+    def forward(self, inp):
+        """Get Inception feature maps
+        Parameters
+        ----------
+        inp : torch.autograd.Variable
+            Input tensor of shape Bx3xHxW. Values are expected to be in
+            range (0, 1)
+        Returns
+        -------
+        List of torch.autograd.Variable, corresponding to the selected output
+        block, sorted ascending by index
+        """
+        outp = []
+        x = inp
+
+        if self.resize_input:
+            x = F.upsample(x,
+                           size=(299, 299),
+                           mode='bilinear',
+                           align_corners=False)
+
+        if self.normalize_input:
+            x = 2 * x - 1  # Scale from range (0, 1) to range (-1, 1)
+
+        for idx, block in enumerate(self.blocks):
+            x = block(x)
+            if idx in self.output_blocks:
+                outp.append(x)
+
+            if idx == self.last_needed_block:
+                break
+
+        return outp
+
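forward() returns one tensor per requested block, sorted by block index, and stops as soon as the last needed block has been computed. A small sketch of requesting two feature levels at once (editor's example, assuming the pretrained weights are available):

    net = InceptionV3(output_blocks=[0, 2]).eval()
    with torch.no_grad():
        f0, f2 = net(torch.rand(2, 3, 299, 299))  # list unpacks in ascending block order
    print(f0.shape, f2.shape)  # 64-channel map from block 0, 768-channel map from block 2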
+if __name__ == '__main__':
+    tform = transforms.Compose([transforms.Resize((256, 256)),
+                                transforms.ToTensor(),
+                                transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])])
+    im1 = Image.open('test1.jpg')
+    im2 = Image.open('test2.jpg')
+    im1 = tform(im1)  # 3xHxW in [-1, 1]
+    im2 = tform(im2)
+    sifid_model = SIFID()
+    sifid_score = sifid_model(im1, im2)
+    print(sifid_score)
tools/slack_bot.py
ADDED
@@ -0,0 +1,157 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+slack_bot.py
+Created on May 02 2020 11:02
+a bot to send message/image during program run
+@author: Tu Bui tu@surrey.ac.uk
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+import sys
+import requests
+import socket
+from slack import WebClient
+from slack.errors import SlackApiError
+import threading
+
+
+SLACK_MAX_PRINT_ERROR = 3
+SLACK_ERROR_CODE = {'not_active': 1,
+                    'API': 2}
+
+
+
def welcome_message():
|
28 |
+
hostname = socket.gethostname()
|
29 |
+
all_args = ' '.join(sys.argv)
|
30 |
+
out_text = 'On server {}: {}\n'.format(hostname, all_args)
|
31 |
+
return out_text
|
32 |
+
|
33 |
+
|
34 |
+
class Notifier(object):
|
35 |
+
"""
|
36 |
+
A slack bot to send text/image to a given workspace channel.
|
37 |
+
This class initializes with a text file as input, the text file should contain 2 lines:
|
38 |
+
slack token
|
39 |
+
slack channel
|
40 |
+
|
41 |
+
Usage:
|
42 |
+
msg = Notifier(token_file)
|
43 |
+
msg.send_initial_text(' '.join(sys.argv))
|
44 |
+
msg.send_text('hi, this text is inside slack thread')
|
45 |
+
msg.send_file(your_file, 'file title')
|
46 |
+
"""
|
+    def __init__(self, token_file):
+        """
+        setup slack
+        :param token_file: path to slack token file
+        """
+        self.active = True
+        self.thread_id = None
+        self.counter = 0  # count number of errors during Web API call
+        if not os.path.exists(token_file):
+            print('[SLACK] token file not found. You will not be notified.')
+            self.active = False
+        else:
+            try:
+                with open(token_file, 'r') as f:
+                    lines = f.readlines()
+                self.token = lines[0].strip()
+                self.channel = lines[1].strip()
+            except Exception as e:
+                print(e)
+                print('[SLACK] fail to read token file. You will not be notified.')
+                self.active = False
+
+    def _handel_error(self, e):
+        assert e.response["ok"] is False
+        assert e.response["error"]  # str like 'invalid_auth', 'channel_not_found'
+        self.counter += 1
+        if self.counter <= SLACK_MAX_PRINT_ERROR:
+            print(f"Got the following error, you will not be notified: {e.response['error']}")
+
+    def send_init_text(self, text=None):
+        """
+        start a new thread with a main message and register the thread id
+        :param text: initial message for this thread
+        :return: 0 if success, error code otherwise
+        """
+        if not self.active:
+            return SLACK_ERROR_CODE['not_active']
+        try:
+            if text is None:
+                text = welcome_message()
+            sc = WebClient(self.token)
+            response = sc.chat_postMessage(channel=self.channel, text=text)
+            self.thread_id = response['ts']
+        except SlackApiError as e:
+            self._handel_error(e)
+            return SLACK_ERROR_CODE['API']
+        print('[SLACK] sent initial text. Chat ID %s. Message %s' % (self.thread_id, text))
+        return 0
+
+    def send_init_file(self, file_path, title=''):
+        """
+        start a new thread with a file and register thread id
+        :param file_path: path to file
+        :param title: title of this file
+        :return: 0 if success otherwise error code
+        """
+        if not self.active:
+            return SLACK_ERROR_CODE['not_active']
+        try:
+            sc = WebClient(self.token)
+            response = sc.files_upload(title=title, channels=self.channel, file=file_path)
+            self.thread_id = response['ts']
+        except SlackApiError as e:
+            self._handel_error(e)
+            return SLACK_ERROR_CODE['API']
+        print('[SLACK] sent initial file. Chat ID %s.' % self.thread_id)
+        return 0
+
+    def send_text(self, text, reply_broadcast=False):
+        """
+        send text as a thread if one is registered in self.thread_id.
+        Otherwise send as a new message
+        :param text: message to send.
+        :return: 0 if success, error code otherwise
+        """
+        print(text)
+        if not self.active:
+            return SLACK_ERROR_CODE['not_active']
+        if self.thread_id is None:
+            self.send_init_text(text)
+        else:
+            try:
+                sc = WebClient(self.token)
+                response = sc.chat_postMessage(channel=self.channel, text=text,
+                                               thread_ts=self.thread_id, as_user=True,
+                                               reply_broadcast=reply_broadcast)
+            except SlackApiError as e:
+                self._handel_error(e)
+                return SLACK_ERROR_CODE['API']
+        return 0
+
+    def _send_file(self, file_path, title='', reply_broadcast=False):
+        """can be multithread target"""
+        try:
+            sc = WebClient(self.token)
+            sc.files_upload(title=title, channels=self.channel,
+                            thread_ts=self.thread_id, file=file_path,
+                            reply_broadcast=reply_broadcast)
+        except SlackApiError as e:
+            self._handel_error(e)
+            return SLACK_ERROR_CODE['API']
+        return 0
+
+    def send_file(self, file_path, title='', reply_broadcast=False):
+        if not self.active:
+            return SLACK_ERROR_CODE['not_active']
+        if self.thread_id is None:
+            return self.send_init_file(file_path, title)
+        else:
+            os_thread = threading.Thread(target=self._send_file, args=(file_path, title, reply_broadcast))
+            os_thread.start()
+            return 0  # may still return 0 even if _send_file() later fails in its background thread
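For reference, a hypothetical end-to-end use of Notifier (the file names and channel below are made up, not from the repo). The token file is plain text with two lines, the Slack bot token and the target channel:

    # slack_token.txt contains two lines, e.g.
    #   xoxb-...your-bot-token...
    #   #experiments
    from tools.slack_bot import Notifier

    bot = Notifier('slack_token.txt')
    bot.send_init_text()                           # opens a thread; returns 0 on success
    bot.send_text('epoch 10: loss=0.123')          # replies inside that thread
    bot.send_file('loss_curve.png', 'loss plot')   # uploads from a background thread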