Spaces:

WindVChen
/

INR-Harmon

Running

File size: 16,073 Bytes

6710c89

import argparse

import torch.backends.cudnn as cudnn
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

from model.build_model import build_model

import torch
import cv2
import numpy as np
import torchvision
import os
import tqdm
import time

from utils.misc import prepare_cooridinate_input, customRandomCrop

from datasets.build_INR_dataset import Implicit2DGenerator
import albumentations
from albumentations import Resize
from torch.utils.data import DataLoader
from utils.misc import normalize

import math


class single_image_dataset(torch.utils.data.Dataset):
    """Dataset whose items are the square patches of one composite image.

    The image is covered by a grid of ``split_resolution``-sized patches. When
    the image size is not a multiple of the patch size, the last row/column is
    aligned to the far image border, so it overlaps its neighbour instead of
    running out of the image.
    """

    def __init__(self, opt, composite_image=None, mask=None):
        """Load (or accept) the image/mask pair and pre-compute the patch grid.

        Args:
            opt: Options namespace. Reads ``composite_image`` and ``mask``
                (file paths, only used when the arrays are not given) and
                ``split_resolution``; ``base_size`` is read in ``__getitem__``.
                ``opt`` is also forwarded to ``Implicit2DGenerator``.
            composite_image: Optional image array. When omitted, the image is
                read from ``opt.composite_image`` and converted BGR -> RGB.
                # NOTE(review): a caller-supplied array is used as-is and is
                # presumably expected to be RGB already - confirm.
            mask: Optional mask array, either ``H x W x C`` (first channel is
                used) or ``H x W``. When omitted, read from ``opt.mask``.
                Values are divided by 255.

        Raises:
            FileNotFoundError: If an image has to be read from disk and
                ``cv2.imread`` cannot load it.
            AssertionError: If the image is not square, its side is not a
                multiple of 256, or ``opt.split_resolution`` is not a multiple
                of ``side // 16``.
        """
        super().__init__()

        self.opt = opt

        if composite_image is None:
            composite_image = cv2.imread(opt.composite_image)
            # cv2.imread reports failure by returning None instead of raising.
            if composite_image is None:
                raise FileNotFoundError(f"Cannot read composite image: {opt.composite_image}")
            composite_image = cv2.cvtColor(composite_image, cv2.COLOR_BGR2RGB)
        self.composite_image = composite_image

        height, width = composite_image.shape[:2]

        assert height == width, "This faster script only supports square images."
        assert height % 256 == 0, "This faster script only supports images with resolution multiples of 256."
        assert opt.split_resolution % (height // 16) == 0, \
            f"The image resolution is {height}, " \
            f"you should set `split_resolution` (currently {opt.split_resolution}) " \
            f"to a multiple of {height // 16}"

        if mask is None:
            mask = cv2.imread(opt.mask)
            if mask is None:
                raise FileNotFoundError(f"Cannot read mask image: {opt.mask}")
        # Keep a single channel; a 2-D mask is already single-channel.
        if mask.ndim == 3:
            mask = mask[:, :, 0]
        mask = mask.astype(np.float32) / 255.
        self.mask = mask

        self.torch_transforms = transforms.Compose([transforms.ToTensor(),
                                                    transforms.Normalize([.5, .5, .5], [.5, .5, .5])])
        self.INR_dataset = Implicit2DGenerator(opt, 'Val')

        self.split_width_resolution = self.split_height_resolution = opt.split_resolution

        self.num_w = math.ceil(width / self.split_width_resolution)
        self.num_h = math.ceil(height / self.split_height_resolution)

        # Split the image into several parts (row-major list of top-left corners).
        self.split_start_point = self._compute_split_start_points(
            height, width, self.split_height_resolution, self.split_width_resolution)

        assert len(self.split_start_point) == self.num_w * self.num_h

        print(
            f"The image will be split into {self.num_h} pieces in height, and {self.num_w} pieces in width. Totally {self.num_h * self.num_w} patches.")
        print(f"The final resolution of each patch is {self.split_height_resolution} x {self.split_width_resolution}")

    @staticmethod
    def _compute_split_start_points(height, width, split_height, split_width):
        """Return the (row, col) top-left corner of every patch, row by row.

        A patch that would start at ``i * split`` but overrun the image is
        instead started at ``size - split`` so that it ends on the border.
        """
        num_h = math.ceil(height / split_height)
        num_w = math.ceil(width / split_width)
        return [(min(i * split_height, height - split_height),
                 min(j * split_width, width - split_width))
                for i in range(num_h)
                for j in range(num_w)]

    def __len__(self):
        """Number of patches in the grid."""
        return self.num_w * self.num_h

    def __getitem__(self, idx):
        """Build the multi-scale inputs for patch ``idx``.

        Returns a dict with, among others:
            * ``composite_image{0..3}`` / ``mask{0..3}``: level 0 is the whole
              image resized to ``opt.base_size``; level 1 is the patch cropped
              from the full-resolution image; levels 2 and 3 are crops (of 1/2
              and 1/4 of the patch size) taken from the image downscaled by 2
              and by 4.
            * ``coordinate_map{0..3}``: coordinate maps of the crops. There is
              no map for the base-size level, so entry 0 repeats entry 1 to
              keep the four lists index-aligned.
            * ``start_point``: the patch's (row, col) top-left corner.
            * ``start_proportion``: ``[top, left, bottom, right]`` of the patch
              as fractions of the image height/width.
        """
        composite_image = self.composite_image
        mask = self.mask
        img_h, img_w = composite_image.shape[:2]
        start_h, start_w = self.split_start_point[idx]

        full_coord = prepare_cooridinate_input(mask).transpose(1, 2, 0)

        # Level 0: the whole image and mask resized to the base size.
        tmp_transform = albumentations.Compose([Resize(self.opt.base_size, self.opt.base_size)],
                                               additional_targets={'object_mask': 'image'})
        transform_out = tmp_transform(image=composite_image, object_mask=mask)
        compos_list = [self.torch_transforms(transform_out['image'])]
        mask_list = [
            torchvision.transforms.ToTensor()(transform_out['object_mask'][..., np.newaxis].astype(np.float32))]
        coord_map_list = []

        # Start offset expressed as a fraction of the available slack
        # (0 when the patch spans the whole axis, which would divide by zero).
        # NOTE(review): customRandomCrop is a project helper; it presumably
        # places the crop at these relative positions - confirm.
        if img_h != self.split_height_resolution:
            c_h = start_h / (img_h - self.split_height_resolution)
        else:
            c_h = 0
        if img_w != self.split_width_resolution:
            c_w = start_w / (img_w - self.split_width_resolution)
        else:
            c_w = 0

        # Level 1: crop from the full-resolution image.
        transform_out, c_h, c_w = customRandomCrop([composite_image, mask, full_coord],
                                                   self.split_height_resolution, self.split_width_resolution, c_h, c_w)

        compos_list.append(self.torch_transforms(transform_out[0]))
        mask_list.append(
            torchvision.transforms.ToTensor()(transform_out[1][..., np.newaxis].astype(np.float32)))
        # Appended twice on purpose: slot 0 is a placeholder so that the
        # coordinate list has the same length as the image and mask lists.
        coord_map_list.append(torchvision.transforms.ToTensor()(transform_out[2]))
        coord_map_list.append(torchvision.transforms.ToTensor()(transform_out[2]))

        # Levels 2 and 3: crops from the image downscaled by 2 and by 4.
        for n in range(2):
            scale = 2 ** (n + 1)
            tmp_comp = cv2.resize(composite_image, (img_w // scale, img_h // scale))
            tmp_mask = cv2.resize(mask, (mask.shape[1] // scale, mask.shape[0] // scale))
            tmp_coord = prepare_cooridinate_input(tmp_mask).transpose(1, 2, 0)

            transform_out, c_h, c_w = customRandomCrop([tmp_comp, tmp_mask, tmp_coord],
                                                       self.split_height_resolution // scale,
                                                       self.split_width_resolution // scale, c_h, c_w)
            compos_list.append(self.torch_transforms(transform_out[0]))
            mask_list.append(
                torchvision.transforms.ToTensor()(transform_out[1][..., np.newaxis].astype(np.float32)))
            coord_map_list.append(torchvision.transforms.ToTensor()(transform_out[2]))

        # NOTE(review): at this point transform_out[0] is the last (smallest)
        # crop while `mask` is the full-resolution mask; this pairing is kept
        # exactly as it was - confirm it is intended.
        fg_INR_coordinates, bg_INR_coordinates, fg_INR_RGB, fg_transfer_INR_RGB, bg_INR_RGB = self.INR_dataset.generator(
            self.torch_transforms, transform_out[0], transform_out[0], mask)

        return {
            'composite_image': compos_list,
            'mask': mask_list,
            'coordinate_map': coord_map_list,
            'composite_image0': compos_list[0],
            'mask0': mask_list[0],
            'coordinate_map0': coord_map_list[0],
            'composite_image1': compos_list[1],
            'mask1': mask_list[1],
            'coordinate_map1': coord_map_list[1],
            'composite_image2': compos_list[2],
            'mask2': mask_list[2],
            'coordinate_map2': coord_map_list[2],
            'composite_image3': compos_list[3],
            'mask3': mask_list[3],
            'coordinate_map3': coord_map_list[3],
            'fg_INR_coordinates': fg_INR_coordinates,
            'bg_INR_coordinates': bg_INR_coordinates,
            'fg_INR_RGB': fg_INR_RGB,
            'fg_transfer_INR_RGB': fg_transfer_INR_RGB,
            'bg_INR_RGB': bg_INR_RGB,
            'start_point': self.split_start_point[idx],
            'start_proportion': [start_h / img_h,
                                 start_w / img_w,
                                 (start_h + self.split_height_resolution) / img_h,
                                 (start_w + self.split_width_resolution) / img_w],
        }


def parse_args(args=None):
    """Build the command-line parser and return the parsed options.

    Args:
        args: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is the original behaviour.

    Returns:
        The ``argparse.Namespace`` holding every option defined below.

    Raises:
        AssertionError: If ``--batch_size`` is anything other than 1.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('--split_resolution', type=int, default=2048,
                        help='The resolution of the patch split.')

    parser.add_argument('--composite_image', type=str, default=r'./demo/demo_2k_composite.jpg',
                        help='composite image path')

    parser.add_argument('--mask', type=str, default=r'./demo/demo_2k_mask.jpg',
                        help='mask path')

    parser.add_argument('--save_path', type=str, default=r'./demo/',
                        help='save path')

    parser.add_argument('--workers', type=int, default=8,
                        metavar='N', help='Dataloader threads.')

    parser.add_argument('--batch_size', type=int, default=1,
                        help='You can override model batch size by specify positive number.')

    parser.add_argument('--device', type=str, default='cuda',
                        help="Whether use cuda, 'cuda' or 'cpu'.")

    parser.add_argument('--base_size', type=int, default=256,
                        help='Base size. Resolution of the image input into the Encoder')

    parser.add_argument('--input_size', type=int, default=256,
                        help='Input size. Resolution of the image that want to be generated by the Decoder')

    parser.add_argument('--INR_input_size', type=int, default=256,
                        help='INR input size. Resolution of the image that want to be generated by the Decoder. '
                             'Should be the same as `input_size`')

    parser.add_argument('--INR_MLP_dim', type=int, default=32,
                        help='Number of channels for INR linear layer.')

    parser.add_argument('--LUT_dim', type=int, default=7,
                        help='Dim of the output LUT. Refer to https://ieeexplore.ieee.org/abstract/document/9206076')

    parser.add_argument('--activation', type=str, default='leakyrelu_pe',
                        help='INR activation layer type: leakyrelu_pe, sine')

    # Forward slashes are accepted on Windows as well, whereas the previous
    # backslash-separated default named a single (nonexistent) file on POSIX.
    parser.add_argument('--pretrained', type=str,
                        default=r'./pretrained_models/Resolution_RAW_iHarmony4.pth',
                        help='Pretrained weight path')

    parser.add_argument('--param_factorize_dim', type=int,
                        default=10,
                        help='The intermediate dimensions of the factorization of the predicted MLP parameters. '
                             'Refer to https://arxiv.org/abs/2011.12026')

    parser.add_argument('--embedding_type', type=str,
                        default="CIPS_embed",
                        help='Which embedding_type to use.')

    # The next three flags are "store_false": they default to True and
    # passing the flag on the command line switches the feature OFF.
    parser.add_argument('--INRDecode', action="store_false",
                        help='Whether INR decoder. Set it to False if you want to test the baseline '
                             '(https://github.com/SamsungLabs/image_harmonization)')

    parser.add_argument('--isMoreINRInput', action="store_false",
                        help='Whether to cat RGB and mask. See Section 3.4 in the paper.')

    parser.add_argument('--hr_train', action="store_false",
                        help='Whether use hr_train. See section 3.4 in the paper.')

    parser.add_argument('--isFullRes', action="store_true",
                        help='Whether for original resolution. See section 3.4 in the paper.')

    opt = parser.parse_args(args)

    # Raised explicitly (same exception type as the former `assert`) so the
    # check is not stripped when Python runs with -O.
    if opt.batch_size != 1:
        raise AssertionError('This faster script only supports batch size 1 for inference.')

    return opt


@torch.no_grad()
def inference(model, opt, composite_image=None, mask=None):
    """Harmonize one image patch by patch and stitch the result together.

    Args:
        model: Network called as ``model(images, masks, coordinates,
            start_proportion)``; the last element of its first output is used
            as the predicted foreground for each patch.
        opt: Options namespace. Reads ``batch_size``, ``workers``, ``device``,
            ``split_resolution`` and ``save_path`` here, and is forwarded to
            ``single_image_dataset`` and ``normalize``.
        composite_image: Optional image array forwarded to the dataset; when
            ``None`` the dataset loads it from ``opt.composite_image``.
        mask: Optional mask array forwarded to the dataset.

    Returns:
        The stitched image as an array with the shape and dtype of the
        dataset's composite image, in BGR channel order (it is what gets
        written with ``cv2.imwrite``).

    Raises:
        Exception: If anything fails while running the model on a patch. The
            message suggests lowering ``split_resolution``; the original error
            is attached as ``__cause__``.
    """
    model.eval()

    # The "dataset" is really the set of patches cut from one single image.
    singledataset = single_image_dataset(opt, composite_image, mask)

    # DataLoader rejects persistent_workers=True when num_workers == 0.
    persistent_workers = composite_image is None and opt.workers > 0
    single_data_loader = DataLoader(singledataset, opt.batch_size, shuffle=False, drop_last=False, pin_memory=True,
                                    num_workers=opt.workers, persistent_workers=persistent_workers)

    # Start from a pure black canvas with the same size as the input image.
    init_img = np.zeros_like(singledataset.composite_image)

    # Also matches indexed devices such as "cuda:0".
    use_cuda = str(opt.device).startswith("cuda")

    time_all = 0

    for step, batch in tqdm.tqdm(enumerate(single_data_loader), total=len(single_data_loader)):
        patch_images = [batch[f'composite_image{name}'].to(opt.device) for name in range(4)]
        patch_masks = [batch[f'mask{name}'].to(opt.device) for name in range(4)]
        coordinate_map = [batch[f'coordinate_map{name}'].to(opt.device) for name in range(4)]
        start_points = batch['start_point']
        start_proportion = batch['start_proportion']

        if opt.batch_size == 1:
            # The default collate turns each tuple/list field into a list of
            # one-element tensors; merge them into a single tensor per sample.
            start_points = [torch.cat(start_points)]
            start_proportion = [torch.cat(start_proportion)]

        # Entry 0 of the coordinate maps is only a placeholder for the
        # base-size level, so the model receives entries 1..3.
        fg_INR_coordinates = coordinate_map[1:]

        try:
            if step == 0:  # This is for CUDA Kernel Warm-up, or the first inference step will be quite slow.
                fg_content_bg_appearance_construct, _, _ = model(
                    patch_images,
                    patch_masks,
                    fg_INR_coordinates, start_proportion[0]
                )
            if use_cuda:
                torch.cuda.reset_peak_memory_stats()
                # Drain pending work (e.g. the warm-up pass) before the clock
                # starts, otherwise it is billed to the timed forward pass.
                torch.cuda.synchronize()
            start_time = time.time()
            fg_content_bg_appearance_construct, _, _ = model(
                patch_images,
                patch_masks,
                fg_INR_coordinates, start_proportion[0]
            )
            if use_cuda:
                torch.cuda.synchronize()
            time_all += (time.time() - start_time)
            if use_cuda:
                end_max_memory = torch.cuda.max_memory_allocated() // 1024 ** 2
                end_memory = torch.cuda.memory_allocated() // 1024 ** 2

                print(f'GPU max memory usage: {end_max_memory} MB')
                print(f'GPU memory usage: {end_memory} MB')
            print(f'progress: {step} / {len(single_data_loader)}')
        except Exception as err:
            raise Exception(
                f'The image resolution is large. Please reduce the `split_resolution` value. Your current set is {opt.split_resolution}') from err

        # Assemble every patch's harmonized result into the final whole image.
        for patch_idx in range(len(fg_INR_coordinates[0])):
            pred_fg_image = fg_content_bg_appearance_construct[-1][patch_idx]
            # Prediction inside the (thresholded) mask, original pixels outside.
            fg_region = patch_masks[1][patch_idx] > 100 / 255.
            pred_harmonized_image = pred_fg_image * fg_region + patch_images[1][patch_idx] * (~fg_region)

            pred_harmonized_tmp = cv2.cvtColor(
                normalize(pred_harmonized_image.unsqueeze(0), opt, 'inv')[0].permute(1, 2, 0).cpu().mul_(255.).clamp_(
                    0., 255.).numpy().astype(np.uint8), cv2.COLOR_RGB2BGR)

            top = start_points[patch_idx][0]
            left = start_points[patch_idx][1]
            init_img[top:top + singledataset.split_height_resolution,
                     left:left + singledataset.split_width_resolution] = pred_harmonized_tmp

    print(f'Inference time: {time_all}')
    if opt.save_path is not None:
        os.makedirs(opt.save_path, exist_ok=True)
        cv2.imwrite(os.path.join(opt.save_path, "pred_harmonized_image.jpg"), init_img)
    return init_img


def main_process(opt, composite_image=None, mask=None):
    """Build the model, load the pretrained weights and run inference.

    Args:
        opt: Options namespace. Reads ``device`` and ``pretrained`` here and is
            forwarded to ``build_model`` and ``inference``.
        composite_image: Optional image array forwarded to ``inference``.
        mask: Optional mask array forwarded to ``inference``.

    Returns:
        Whatever ``inference`` returns (the stitched harmonized image).
    """
    cudnn.benchmark = True

    model = build_model(opt).to(opt.device)

    # map_location="cpu" lets a checkpoint saved from a GPU be opened on a
    # machine without CUDA; load_state_dict then copies the values into the
    # model's own parameters on opt.device.
    # NOTE: torch.load unpickles the file - only load checkpoints you trust.
    load_dict = torch.load(opt.pretrained, map_location="cpu")['model']

    # Compute the model's key set once instead of once per checkpoint entry.
    model_keys = set(model.state_dict().keys())
    for k in load_dict:
        if k not in model_keys:
            print(f"Skip {k}")
    # strict=False: checkpoint entries without a matching parameter are ignored.
    model.load_state_dict(load_dict, strict=False)

    return inference(model, opt, composite_image, mask)


if __name__ == '__main__':
    # Script entry point: read the CLI options, attach the normalisation
    # statistics, then run the full load-and-infer pipeline.
    opt = parse_args()
    # Same 0.5 / 0.5 values as the Normalize transform built by the dataset in
    # this file; presumably read back by utils.misc.normalize - confirm.
    opt.transform_mean, opt.transform_var = [.5, .5, .5], [.5, .5, .5]
    main_process(opt)