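# Trainer for flow-guided video inpainting: ground-truth optical flow is
# estimated with RAFT, completed inside the masked region by a frozen
# RecurrentFlowCompleteNet, used to propagate pixels into the holes, and the
# result is refined by a feature-propagation + Transformer generator,
# optionally trained adversarially against a (2D or video) discriminator.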
import os
import glob
import logging
import importlib
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
import torchvision
from torch.utils.tensorboard import SummaryWriter

from core.prefetch_dataloader import PrefetchDataLoader, CPUPrefetcher
from core.lr_scheduler import MultiStepRestartLR, CosineAnnealingRestartLR
from core.loss import AdversarialLoss, PerceptualLoss, LPIPSLoss
from core.dataset import TrainDataset
from model.modules.flow_comp_raft import RAFT_bi, FlowLoss, EdgeLoss
from model.recurrent_flow_completion import RecurrentFlowCompleteNet
from RAFT.utils.flow_viz_pt import flow_to_image


class Trainer:
    def __init__(self, config):
        self.config = config
        self.epoch = 0
        self.iteration = 0
        self.num_local_frames = config['train_data_loader']['num_local_frames']
        self.num_ref_frames = config['train_data_loader']['num_ref_frames']

        # set up dataset and dataloader
        self.train_dataset = TrainDataset(config['train_data_loader'])

        self.train_sampler = None
        self.train_args = config['trainer']
        if config['distributed']:
            self.train_sampler = DistributedSampler(
                self.train_dataset,
                num_replicas=config['world_size'],
                rank=config['global_rank'])

        dataloader_args = dict(
            dataset=self.train_dataset,
            batch_size=self.train_args['batch_size'] // config['world_size'],
            shuffle=(self.train_sampler is None),
            num_workers=self.train_args['num_workers'],
            sampler=self.train_sampler,
            drop_last=True)
        self.train_loader = PrefetchDataLoader(self.train_args['num_prefetch_queue'], **dataloader_args)
        self.prefetcher = CPUPrefetcher(self.train_loader)
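        # NOTE: 'batch_size' in the config is the global batch size; each rank
        # loads batch_size // world_size samples, and the prefetch wrappers
        # keep the next batch queued while the current step runs.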

        # set loss functions
        self.adversarial_loss = AdversarialLoss(type=self.config['losses']['GAN_LOSS'])
        self.adversarial_loss = self.adversarial_loss.to(self.config['device'])
        self.l1_loss = nn.L1Loss()
        # self.perc_loss = PerceptualLoss(
        #     layer_weights={'conv3_4': 0.25, 'conv4_4': 0.25, 'conv5_4': 0.5},
        #     use_input_norm=True,
        #     range_norm=True,
        #     criterion='l1'
        # ).to(self.config['device'])
        if self.config['losses']['perceptual_weight'] > 0:
            self.perc_loss = LPIPSLoss(use_input_norm=True, range_norm=True).to(self.config['device'])
        # self.flow_comp_loss = FlowCompletionLoss().to(self.config['device'])
        # self.flow_comp_loss = FlowCompletionLoss(self.config['device'])

        # set RAFT (GT flow estimation) and the frozen flow-completion network
        self.fix_raft = RAFT_bi(device=self.config['device'])
        # NOTE: hardcoded checkpoint path; adjust for your environment
        self.fix_flow_complete = RecurrentFlowCompleteNet('/mnt/lustre/sczhou/VQGANs/CodeMOVI/experiments_model/recurrent_flow_completion_v5_train_flowcomp_v5/gen_760000.pth')
        for p in self.fix_flow_complete.parameters():
            p.requires_grad = False
        self.fix_flow_complete.to(self.config['device'])
        self.fix_flow_complete.eval()
        # self.flow_loss = FlowLoss()

        # setup models including generator and discriminator
        net = importlib.import_module('model.' + config['model']['net'])
        self.netG = net.InpaintGenerator()
        # print(self.netG)
        self.netG = self.netG.to(self.config['device'])

        if not self.config['model'].get('no_dis', False):
            if self.config['model'].get('dis_2d', False):
                self.netD = net.Discriminator_2D(
                    in_channels=3,
                    use_sigmoid=config['losses']['GAN_LOSS'] != 'hinge')
            else:
                self.netD = net.Discriminator(
                    in_channels=3,
                    use_sigmoid=config['losses']['GAN_LOSS'] != 'hinge')
            self.netD = self.netD.to(self.config['device'])

        self.interp_mode = self.config['model']['interp_mode']

        # setup optimizers and schedulers
        self.setup_optimizers()
        self.setup_schedulers()
        self.load()

        if config['distributed']:
            self.netG = DDP(self.netG,
                            device_ids=[self.config['local_rank']],
                            output_device=self.config['local_rank'],
                            broadcast_buffers=True,
                            find_unused_parameters=True)
            if not self.config['model']['no_dis']:
                self.netD = DDP(self.netD,
                                device_ids=[self.config['local_rank']],
                                output_device=self.config['local_rank'],
                                broadcast_buffers=True,
                                find_unused_parameters=False)
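            # find_unused_parameters=True on netG because, depending on the
            # config, some generator branches may not produce gradients every
            # step; netD uses all of its parameters, so the check is disabled.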

        # set summary writer
        self.dis_writer = None
        self.gen_writer = None
        self.summary = {}
        if self.config['global_rank'] == 0 or (not config['distributed']):
            if not self.config['model']['no_dis']:
                self.dis_writer = SummaryWriter(os.path.join(config['save_dir'], 'dis'))
            self.gen_writer = SummaryWriter(os.path.join(config['save_dir'], 'gen'))

    def setup_optimizers(self):
        """Set up optimizers."""
        backbone_params = []
        for name, param in self.netG.named_parameters():
            if param.requires_grad:
                backbone_params.append(param)
            else:
                print(f'Params {name} will not be optimized.')

        optim_params = [
            {
                'params': backbone_params,
                'lr': self.config['trainer']['lr']
            },
        ]

        self.optimG = torch.optim.Adam(optim_params,
                                       betas=(self.config['trainer']['beta1'],
                                              self.config['trainer']['beta2']))

        if not self.config['model']['no_dis']:
            self.optimD = torch.optim.Adam(
                self.netD.parameters(),
                lr=self.config['trainer']['lr'],
                betas=(self.config['trainer']['beta1'],
                       self.config['trainer']['beta2']))

    def setup_schedulers(self):
        """Set up schedulers."""
        scheduler_opt = self.config['trainer']['scheduler']
        scheduler_type = scheduler_opt.pop('type')

        if scheduler_type in ['MultiStepLR', 'MultiStepRestartLR']:
            self.scheG = MultiStepRestartLR(
                self.optimG,
                milestones=scheduler_opt['milestones'],
                gamma=scheduler_opt['gamma'])
            if not self.config['model']['no_dis']:
                self.scheD = MultiStepRestartLR(
                    self.optimD,
                    milestones=scheduler_opt['milestones'],
                    gamma=scheduler_opt['gamma'])
        elif scheduler_type == 'CosineAnnealingRestartLR':
            self.scheG = CosineAnnealingRestartLR(
                self.optimG,
                periods=scheduler_opt['periods'],
                restart_weights=scheduler_opt['restart_weights'],
                eta_min=scheduler_opt['eta_min'])
            if not self.config['model']['no_dis']:
                self.scheD = CosineAnnealingRestartLR(
                    self.optimD,
                    periods=scheduler_opt['periods'],
                    restart_weights=scheduler_opt['restart_weights'],
                    eta_min=scheduler_opt['eta_min'])
        else:
            raise NotImplementedError(
                f'Scheduler {scheduler_type} is not implemented yet.')

    def update_learning_rate(self):
        """Update learning rate."""
        self.scheG.step()
        if not self.config['model']['no_dis']:
            self.scheD.step()

    def get_lr(self):
        """Get current learning rate."""
        return self.optimG.param_groups[0]['lr']

    def add_summary(self, writer, name, val):
        """Add tensorboard summary."""
        # accumulate values and, every `log_freq` iterations, write their
        # mean to tensorboard and reset the accumulator
        if name not in self.summary:
            self.summary[name] = 0
        self.summary[name] += val
        n = self.train_args['log_freq']
        if writer is not None and self.iteration % n == 0:
            writer.add_scalar(name, self.summary[name] / n, self.iteration)
            self.summary[name] = 0

    def load(self):
        """Load netG (and netD)."""
        # get the latest checkpoint
        model_path = self.config['save_dir']
        # TODO: add resume name
        if os.path.isfile(os.path.join(model_path, 'latest.ckpt')):
            with open(os.path.join(model_path, 'latest.ckpt'), 'r') as f:
                latest_epoch = f.read().splitlines()[-1]
        else:
            ckpts = [
                os.path.basename(i).split('.pth')[0]
                for i in glob.glob(os.path.join(model_path, '*.pth'))
            ]
            ckpts.sort()
            # strip the 4-char prefix ('gen_'/'dis_'/'opt_') to get the iteration
            latest_epoch = ckpts[-1][4:] if len(ckpts) > 0 else None

        if latest_epoch is not None:
            gen_path = os.path.join(model_path, f'gen_{int(latest_epoch):06d}.pth')
            dis_path = os.path.join(model_path, f'dis_{int(latest_epoch):06d}.pth')
            opt_path = os.path.join(model_path, f'opt_{int(latest_epoch):06d}.pth')

            if self.config['global_rank'] == 0:
                print(f'Loading model from {gen_path}...')
            dataG = torch.load(gen_path, map_location=self.config['device'])
            self.netG.load_state_dict(dataG)
            if not self.config['model']['no_dis'] and self.config['model']['load_d']:
                dataD = torch.load(dis_path, map_location=self.config['device'])
                self.netD.load_state_dict(dataD)

            data_opt = torch.load(opt_path, map_location=self.config['device'])
            self.optimG.load_state_dict(data_opt['optimG'])
            # self.scheG.load_state_dict(data_opt['scheG'])
            if not self.config['model']['no_dis'] and self.config['model']['load_d']:
                self.optimD.load_state_dict(data_opt['optimD'])
                # self.scheD.load_state_dict(data_opt['scheD'])
            self.epoch = data_opt['epoch']
            self.iteration = data_opt['iteration']
        else:
            gen_path = self.config['trainer'].get('gen_path', None)
            dis_path = self.config['trainer'].get('dis_path', None)
            opt_path = self.config['trainer'].get('opt_path', None)
            if gen_path is not None:
                if self.config['global_rank'] == 0:
                    print(f'Loading Gen-Net from {gen_path}...')
                dataG = torch.load(gen_path, map_location=self.config['device'])
                self.netG.load_state_dict(dataG)

                if dis_path is not None and not self.config['model']['no_dis'] and self.config['model']['load_d']:
                    if self.config['global_rank'] == 0:
                        print(f'Loading Dis-Net from {dis_path}...')
                    dataD = torch.load(dis_path, map_location=self.config['device'])
                    self.netD.load_state_dict(dataD)
                if opt_path is not None:
                    data_opt = torch.load(opt_path, map_location=self.config['device'])
                    self.optimG.load_state_dict(data_opt['optimG'])
                    self.scheG.load_state_dict(data_opt['scheG'])
                    if not self.config['model']['no_dis'] and self.config['model']['load_d']:
                        self.optimD.load_state_dict(data_opt['optimD'])
                        self.scheD.load_state_dict(data_opt['scheD'])
            else:
                if self.config['global_rank'] == 0:
                    print('Warning: no trained model found. '
                          'An initialized model will be used.')

    def save(self, it):
        """Save networks and optimizer state (called every save_freq iterations)."""
        if self.config['global_rank'] == 0:
            # configure paths
            gen_path = os.path.join(self.config['save_dir'], f'gen_{it:06d}.pth')
            dis_path = os.path.join(self.config['save_dir'], f'dis_{it:06d}.pth')
            opt_path = os.path.join(self.config['save_dir'], f'opt_{it:06d}.pth')
            print(f'\nsaving model to {gen_path} ...')

            # remove .module for saving
            if isinstance(self.netG, (torch.nn.DataParallel, DDP)):
                netG = self.netG.module
                if not self.config['model']['no_dis']:
                    netD = self.netD.module
            else:
                netG = self.netG
                if not self.config['model']['no_dis']:
                    netD = self.netD

            # save checkpoints
            torch.save(netG.state_dict(), gen_path)
            if not self.config['model']['no_dis']:
                torch.save(netD.state_dict(), dis_path)
                torch.save(
                    {
                        'epoch': self.epoch,
                        'iteration': self.iteration,
                        'optimG': self.optimG.state_dict(),
                        'optimD': self.optimD.state_dict(),
                        'scheG': self.scheG.state_dict(),
                        'scheD': self.scheD.state_dict()
                    }, opt_path)
            else:
                torch.save(
                    {
                        'epoch': self.epoch,
                        'iteration': self.iteration,
                        'optimG': self.optimG.state_dict(),
                        'scheG': self.scheG.state_dict()
                    }, opt_path)

            # record the latest iteration so load() can resume from it
            latest_path = os.path.join(self.config['save_dir'], 'latest.ckpt')
            with open(latest_path, 'w') as f:
                f.write(f"{it:06d}")

    def train(self):
        """Training entry point."""
        pbar = range(int(self.train_args['iterations']))
        if self.config['global_rank'] == 0:
            pbar = tqdm(pbar,
                        initial=self.iteration,
                        dynamic_ncols=True,
                        smoothing=0.01)

        os.makedirs('logs', exist_ok=True)

        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s %(filename)s[line:%(lineno)d] "
                   "%(levelname)s %(message)s",
            datefmt="%a, %d %b %Y %H:%M:%S",
            filename=f"logs/{self.config['save_dir'].split('/')[-1]}.log",
            filemode='w')

        while True:
            self.epoch += 1
            self.prefetcher.reset()
            if self.config['distributed']:
                self.train_sampler.set_epoch(self.epoch)
            self._train_epoch(pbar)
            if self.iteration > self.train_args['iterations']:
                break
        print('\nEnd training....')

    def _train_epoch(self, pbar):
        """Process input and calculate loss every training epoch."""
        device = self.config['device']
        train_data = self.prefetcher.next()
        while train_data is not None:
            self.iteration += 1
            frames, masks, flows_f, flows_b, _ = train_data
            frames, masks = frames.to(device), masks.to(device).float()
            l_t = self.num_local_frames
            b, t, c, h, w = frames.size()
            gt_local_frames = frames[:, :l_t, ...]
            local_masks = masks[:, :l_t, ...].contiguous()

            masked_frames = frames * (1 - masks)
            masked_local_frames = masked_frames[:, :l_t, ...]

            # get gt optical flow (the dataset yields the string 'None' when
            # flows were not precomputed, so estimate them with RAFT instead)
            if flows_f[0] == 'None' or flows_b[0] == 'None':
                gt_flows_bi = self.fix_raft(gt_local_frames)
            else:
                gt_flows_bi = (flows_f.to(device), flows_b.to(device))

            # ---- complete flow ----
            pred_flows_bi, _ = self.fix_flow_complete.forward_bidirect_flow(gt_flows_bi, local_masks)
            pred_flows_bi = self.fix_flow_complete.combine_flow(gt_flows_bi, pred_flows_bi, local_masks)
            # pred_flows_bi = gt_flows_bi
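            # The flow-completion net is frozen; judging by its name,
            # combine_flow keeps the GT flow outside the mask and the
            # completed flow inside it, so only hole regions are synthesized.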

            # ---- image propagation ----
            # netG may or may not be DDP-wrapped; unwrap .module conditionally
            # so img_propagation also works in single-process runs
            netG = self.netG.module if hasattr(self.netG, 'module') else self.netG
            prop_imgs, updated_local_masks = netG.img_propagation(
                masked_local_frames, pred_flows_bi, local_masks, interpolation=self.interp_mode)
            updated_masks = masks.clone()
            updated_masks[:, :l_t, ...] = updated_local_masks.view(b, l_t, 1, h, w)
            updated_frames = masked_frames.clone()
            prop_local_frames = gt_local_frames * (1 - local_masks) + prop_imgs.view(b, l_t, 3, h, w) * local_masks  # merge
            updated_frames[:, :l_t, ...] = prop_local_frames

            # ---- feature propagation + Transformer ----
            pred_imgs = self.netG(updated_frames, pred_flows_bi, masks, updated_masks, l_t)
            pred_imgs = pred_imgs.view(b, -1, c, h, w)

            # get the local frames
            pred_local_frames = pred_imgs[:, :l_t, ...]
            comp_local_frames = gt_local_frames * (1. - local_masks) + pred_local_frames * local_masks
            comp_imgs = frames * (1. - masks) + pred_imgs * masks

            gen_loss = 0
            dis_loss = 0

            # optimize net_g: freeze the discriminator during the generator step
            if not self.config['model']['no_dis']:
                for p in self.netD.parameters():
                    p.requires_grad = False
            self.optimG.zero_grad()

            # generator l1 loss
            hole_loss = self.l1_loss(pred_imgs * masks, frames * masks)
            hole_loss = hole_loss / torch.mean(masks) * self.config['losses']['hole_weight']
            gen_loss += hole_loss
            self.add_summary(self.gen_writer, 'loss/hole_loss', hole_loss.item())

            valid_loss = self.l1_loss(pred_imgs * (1 - masks), frames * (1 - masks))
            valid_loss = valid_loss / torch.mean(1 - masks) * self.config['losses']['valid_weight']
            gen_loss += valid_loss
            self.add_summary(self.gen_writer, 'loss/valid_loss', valid_loss.item())
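            # nn.L1Loss averages over all pixels, so dividing by mean(masks)
            # (resp. mean(1 - masks)) rescales each term to a per-pixel
            # average over just the hole (resp. valid) region.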

            # perceptual loss
            if self.config['losses']['perceptual_weight'] > 0:
                perc_loss = self.perc_loss(pred_imgs.view(-1, 3, h, w),
                                           frames.view(-1, 3, h, w))[0] * self.config['losses']['perceptual_weight']
                gen_loss += perc_loss
                self.add_summary(self.gen_writer, 'loss/perc_loss', perc_loss.item())

            # gan loss
            if not self.config['model']['no_dis']:
                # generator adversarial loss
                gen_clip = self.netD(comp_imgs)
                gan_loss = self.adversarial_loss(gen_clip, True, False)
                gan_loss = gan_loss * self.config['losses']['adversarial_weight']
                gen_loss += gan_loss
                self.add_summary(self.gen_writer, 'loss/gan_loss', gan_loss.item())

            gen_loss.backward()
            self.optimG.step()

            if not self.config['model']['no_dis']:
                # optimize net_d
                for p in self.netD.parameters():
                    p.requires_grad = True
                self.optimD.zero_grad()

                # discriminator adversarial loss
                real_clip = self.netD(frames)
                fake_clip = self.netD(comp_imgs.detach())
                dis_real_loss = self.adversarial_loss(real_clip, True, True)
                dis_fake_loss = self.adversarial_loss(fake_clip, False, True)
                dis_loss += (dis_real_loss + dis_fake_loss) / 2
                self.add_summary(self.dis_writer, 'loss/dis_vid_real', dis_real_loss.item())
                self.add_summary(self.dis_writer, 'loss/dis_vid_fake', dis_fake_loss.item())
                dis_loss.backward()
                self.optimD.step()
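                # Standard alternating GAN update: netD was frozen during the
                # generator step above, and comp_imgs is detached here so the
                # discriminator loss does not backpropagate into netG.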

            self.update_learning_rate()

            # write images to tensorboard every 200 iterations
            if self.iteration % 200 == 0:
                # move images to cpu; frames are in [-1, 1], map to [0, 1]
                t = 0  # reused here as the index of the visualized frame
                gt_local_frames_cpu = ((gt_local_frames.view(b, -1, 3, h, w) + 1) / 2.0).cpu()
                masked_local_frames = ((masked_local_frames.view(b, -1, 3, h, w) + 1) / 2.0).cpu()
                prop_local_frames_cpu = ((prop_local_frames.view(b, -1, 3, h, w) + 1) / 2.0).cpu()
                pred_local_frames_cpu = ((pred_local_frames.view(b, -1, 3, h, w) + 1) / 2.0).cpu()
                img_results = torch.cat([masked_local_frames[0][t], gt_local_frames_cpu[0][t],
                                         prop_local_frames_cpu[0][t], pred_local_frames_cpu[0][t]], 1)
                img_results = torchvision.utils.make_grid(img_results, nrow=1, normalize=True)
                if self.gen_writer is not None:
                    self.gen_writer.add_image(f'img/img:inp-gt-res-{t}', img_results, self.iteration)

                t = 5
                if masked_local_frames.shape[1] > 5:
                    img_results = torch.cat([masked_local_frames[0][t], gt_local_frames_cpu[0][t],
                                             prop_local_frames_cpu[0][t], pred_local_frames_cpu[0][t]], 1)
                    img_results = torchvision.utils.make_grid(img_results, nrow=1, normalize=True)
                    if self.gen_writer is not None:
                        self.gen_writer.add_image(f'img/img:inp-gt-res-{t}', img_results, self.iteration)

                # flow to cpu
                gt_flows_forward_cpu = flow_to_image(gt_flows_bi[0][0]).cpu()
                masked_flows_forward_cpu = (gt_flows_forward_cpu[0] * (1 - local_masks[0][0].cpu())).to(gt_flows_forward_cpu)
                pred_flows_forward_cpu = flow_to_image(pred_flows_bi[0][0]).cpu()
                flow_results = torch.cat([gt_flows_forward_cpu[0], masked_flows_forward_cpu, pred_flows_forward_cpu[0]], 1)
                if self.gen_writer is not None:
                    self.gen_writer.add_image('img/flow:gt-pred', flow_results, self.iteration)

            # console logs
            if self.config['global_rank'] == 0:
                pbar.update(1)
                if not self.config['model']['no_dis']:
                    pbar.set_description((f"d: {dis_loss.item():.3f}; "
                                          f"hole: {hole_loss.item():.3f}; "
                                          f"valid: {valid_loss.item():.3f}"))
                else:
                    pbar.set_description((f"hole: {hole_loss.item():.3f}; "
                                          f"valid: {valid_loss.item():.3f}"))

                if self.iteration % self.train_args['log_freq'] == 0:
                    if not self.config['model']['no_dis']:
                        logging.info(f"[Iter {self.iteration}] "
                                     f"d: {dis_loss.item():.4f}; "
                                     f"hole: {hole_loss.item():.4f}; "
                                     f"valid: {valid_loss.item():.4f}")
                    else:
                        logging.info(f"[Iter {self.iteration}] "
                                     f"hole: {hole_loss.item():.4f}; "
                                     f"valid: {valid_loss.item():.4f}")

            # saving models
            if self.iteration % self.train_args['save_freq'] == 0:
                self.save(int(self.iteration))

            if self.iteration > self.train_args['iterations']:
                break
            train_data = self.prefetcher.next()
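

# ------------------------------------------------------------------------------
# Minimal single-process usage sketch (hypothetical). The real entry point
# lives in a separate launch script; the config file path and the flat
# rank/device keys set below are assumptions inferred from the keys this
# class reads, not part of this file.
# ------------------------------------------------------------------------------
if __name__ == '__main__':
    import json

    with open('configs/train.json', 'r') as f:  # assumed config location
        config = json.load(f)

    # single-GPU defaults: no DDP, so world_size leaves the batch size intact
    config['distributed'] = False
    config['world_size'] = 1
    config['global_rank'] = 0   # rank 0 owns tqdm, tensorboard and checkpoints
    config['local_rank'] = 0
    config['device'] = 'cuda' if torch.cuda.is_available() else 'cpu'

    trainer = Trainer(config)
    trainer.train()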