Spaces:

gyrojeff
/

YuzuMarker.FontDetection

Running

App Files Files Community

YuzuMarker.FontDetection / detector /data.py

gyrojeff

fix: data augmentation

b06784f over 1 year ago

raw

history blame

10.9 kB

	from font_dataset.fontlabel import FontLabel
	from font_dataset.font import DSFont, load_font_with_exclusion
	from . import config


	import math
	import os
	import random
	import pickle
	import torch
	import torchvision.transforms as transforms
	import torchvision.transforms.functional as TF
	from typing import List, Dict, Tuple
	from torch.utils.data import Dataset, DataLoader
	from pytorch_lightning import LightningDataModule
	from PIL import Image


	class RandomColorJitter(object):
	    """Jitter brightness, contrast, saturation and hue of an (image, label) pair.

	    The same randomly drawn factor is applied to the image and to the two
	    RGB triples stored in the label at indices 2:5 and 7:10 (filled in this
	    file with the text colour and the stroke colour), so the colour targets
	    follow the augmented image.

	    Args:
	        brightness: half-width of the brightness factor range around 1.
	        contrast: half-width of the contrast factor range around 1.
	        saturation: half-width of the saturation factor range around 1.
	        hue: half-width of the hue shift range around 0.
	        preserve: probability of returning the sample untouched.
	    """

	    def __init__(
	        self, brightness=0.5, contrast=0.5, saturation=0.5, hue=0.05, preserve=0.2
	    ):
	        self.preserve = preserve
	        self.brightness = brightness
	        self.contrast = contrast
	        self.saturation = saturation
	        self.hue = hue

	    def __call__(self, batch):
	        """Return the jittered ``(image, label)``; ``label`` is modified in place."""
	        # With probability `preserve` the sample passes through unchanged.
	        if random.random() < self.preserve:
	            return batch

	        picture, target = batch
	        # Each colour is viewed as a 3x1x1 tensor so the same functional ops
	        # used on the picture can be applied to it.
	        text_rgb = target[2:5].clone().view(3, 1, 1)
	        stroke_rgb = target[7:10].clone().view(3, 1, 1)

	        # (operation, lower bound, upper bound) in the order they are applied.
	        # NOTE(review): for contrast, a single-pixel colour is presumably
	        # adjusted against its own grey level rather than the picture's mean,
	        # so the colour labels may drift from the image — confirm.
	        steps = (
	            (TF.adjust_brightness, 1 - self.brightness, 1 + self.brightness),
	            (TF.adjust_contrast, 1 - self.contrast, 1 + self.contrast),
	            (TF.adjust_saturation, 1 - self.saturation, 1 + self.saturation),
	            (TF.adjust_hue, -self.hue, self.hue),
	        )
	        for adjust, low, high in steps:
	            factor = random.uniform(low, high)
	            picture = adjust(picture, factor)
	            text_rgb = adjust(text_rgb, factor)
	            stroke_rgb = adjust(stroke_rgb, factor)

	        # Write the adjusted colours back into the label.
	        target[2:5] = text_rgb.view(3)
	        target[7:10] = stroke_rgb.view(3)
	        return picture, target


	class RandomCrop(object):
	    """Randomly trim the borders of an image and rescale the size labels.

	    Up to ``crop_factor`` of the width and of the height is removed; the
	    removed amount is split randomly between the two opposite borders.

	    Label entries 5, 6 and 10 are rescaled to the new image width. In this
	    file they are filled with text size, stroke width and line spacing,
	    each divided by the image width.

	    Args:
	        crop_factor: upper bound of the fraction removed along each axis.
	        preserve: probability of returning the sample untouched.
	    """

	    def __init__(self, crop_factor: float = 0.1, preserve: float = 0.2):
	        self.crop_factor = crop_factor
	        self.preserve = preserve

	    def __call__(self, batch):
	        """Return the cropped ``(image, label)``; ``label`` is modified in place."""
	        if random.random() < self.preserve:
	            return batch

	        image, label = batch
	        width, height = image.size

	        # fraction of the height / width that is cut away
	        random_height = random.random() * self.crop_factor
	        random_width = random.random() * self.crop_factor
	        # share of that fraction which is taken from the top / left border
	        random_top = random.random() * random_height
	        random_left = random.random() * random_width
	        # pixel position and size of the crop window
	        top = int(random_top * height)
	        left = int(random_left * width)
	        new_height = int(height - random_height * height)
	        new_width = int(width - random_width * width)
	        # crop image
	        image = TF.crop(image, top, left, new_height, new_width)

	        # These entries are fractions of the image width. The pixel sizes are
	        # unchanged by cropping while the width shrinks by (1 - random_width),
	        # so the fractions grow by the inverse of that factor. (The previous
	        # code multiplied by (1 - random_height), which shrank them and used
	        # the wrong axis.)
	        label[[5, 6, 10]] = label[[5, 6, 10]] / (1 - random_width)
	        return image, label


	class RandomRotate(object):
	    """Rotate the image by a random angle and shift label entry 11 accordingly.

	    Args:
	        max_angle: the angle is drawn uniformly from ``[-max_angle, max_angle]``
	            and passed to ``TF.rotate``.
	        preserve: probability of returning the sample untouched.
	    """

	    def __init__(self, max_angle: int = 15, preserve: float = 0.2):
	        self.preserve = preserve
	        self.max_angle = max_angle

	    def __call__(self, batch):
	        """Return the rotated ``(image, label)``; ``label`` is modified in place."""
	        untouched = random.random() < self.preserve
	        if untouched:
	            return batch

	        picture, target = batch
	        degrees = random.uniform(-self.max_angle, self.max_angle)
	        rotated = TF.rotate(picture, degrees)
	        # Entry 11 is stored in units of 180 degrees (this file encodes it as
	        # angle / 180 + 0.5), so the drawn angle is added on the same scale.
	        target[11] += degrees / 180
	        return rotated, target


	class FontDataset(Dataset):
	    """Dataset of ``.jpg`` text images, each paired with a pickled label.

	    Every image ``<name>.jpg`` in ``path`` is expected to have a sibling
	    ``<name>.bin`` holding a pickled ``FontLabel``. Items are returned as
	    ``(image tensor, 12-element label tensor)``.
	    """

	    def __init__(
	        self,
	        path: str,
	        config_path: str = "configs/font.yml",
	        regression_use_tanh: bool = False,
	        transforms: str = None,
	        crop_roi_bbox: bool = False,
	    ):
	        """Font dataset

	        Args:
	            path (str): path to the dataset
	            config_path (str, optional): path to font config file. Defaults to "configs/font.yml".
	            regression_use_tanh (bool, optional): whether use tanh as regression normalization. Defaults to False.
	            transforms (str, optional): choose from None, 'v1', 'v2'. Defaults to None.
	            crop_roi_bbox (bool, optional): whether to crop text roi bbox, must be true when transform='v2'. Defaults to False.
	        """
	        self.path = path
	        self.fonts = load_font_with_exclusion(config_path)
	        self.regression_use_tanh = regression_use_tanh
	        self.transforms = transforms
	        self.crop_roi_bbox = crop_roi_bbox

	        # Sorted so that an index always refers to the same file.
	        self.images = sorted(
	            os.path.join(path, f) for f in os.listdir(path) if f.endswith(".jpg")
	        )

	        if transforms == "v2":
	            assert crop_roi_bbox, "crop_roi_bbox must be true when transform='v2'"

	    def __len__(self):
	        """Return the number of ``.jpg`` images found in ``path``."""
	        return len(self.images)

	    @staticmethod
	    def _label_path(image_path: str) -> str:
	        """Return the ``.bin`` label path that belongs to ``image_path``.

	        Only the file extension is swapped, so a ``.jpg`` occurring elsewhere
	        in the path (e.g. in a directory name) is left alone.
	        """
	        return os.path.splitext(image_path)[0] + ".bin"

	    def fontlabel2tensor(self, label: "FontLabel", label_path) -> torch.Tensor:
	        """Encode a ``FontLabel`` as a 12-element float tensor.

	        Layout: 0 font index, 1 text direction (0 for "ltr", 1 otherwise),
	        2-4 text colour / 255, 5 text size / image width, 6 stroke width /
	        image width, 7-9 stroke colour / 255 (text colour when there is no
	        stroke colour), 10 line spacing / image width, 11 angle / 180 + 0.5.

	        Args:
	            label: the label to encode.
	            label_path: path of the label file, only used in the error report.

	        Raises:
	            KeyError: if the label's font path is not in ``self.fonts``.
	        """
	        out = torch.zeros(12, dtype=torch.float)
	        try:
	            out[0] = self.fonts[label.font.path]
	        except KeyError:
	            print(f"Unqualified font: {label.font.path}")
	            print(f"Label path: {label_path}")
	            # Re-raise the original error so the missing key is preserved.
	            raise
	        out[1] = 0 if label.text_direction == "ltr" else 1
	        # [0, 1]
	        out[2] = label.text_color[0] / 255.0
	        out[3] = label.text_color[1] / 255.0
	        out[4] = label.text_color[2] / 255.0
	        out[5] = label.text_size / label.image_width
	        out[6] = label.stroke_width / label.image_width
	        if label.stroke_color:
	            out[7] = label.stroke_color[0] / 255.0
	            out[8] = label.stroke_color[1] / 255.0
	            out[9] = label.stroke_color[2] / 255.0
	        else:
	            out[7:10] = out[2:5]
	        out[10] = label.line_spacing / label.image_width
	        out[11] = label.angle / 180.0 + 0.5

	        return out

	    def __getitem__(self, index: int) -> Tuple[torch.Tensor, torch.Tensor]:
	        """Load, augment and encode the sample at ``index``.

	        Returns:
	            The image resized to ``config.INPUT_SIZE`` squared as a tensor,
	            and the encoded label (entries 2-11 mapped to [-1, 1] scale when
	            ``regression_use_tanh`` is set).
	        """
	        # Load image; the context manager closes the underlying file, the
	        # converted copy stays usable afterwards.
	        image_path = self.images[index]
	        with Image.open(image_path) as raw_image:
	            image = raw_image.convert("RGB")

	        # Load label
	        # NOTE: pickle can execute arbitrary code while loading; only use
	        # label files from a trusted source.
	        label_path = self._label_path(image_path)
	        with open(label_path, "rb") as f:
	            label = pickle.load(f)

	        if (self.transforms == "v1") or (self.transforms is None):
	            if self.crop_roi_bbox:
	                left, top, width, height = label.bbox
	                image = TF.crop(image, top, left, height, width)
	                label.image_width = width
	                label.image_height = height

	            # encode label
	            label = self.fontlabel2tensor(label, label_path)

	            # data augmentation
	            if self.transforms is not None:
	                transform = transforms.Compose(
	                    [
	                        RandomColorJitter(preserve=0.2),
	                        RandomCrop(preserve=0.2),
	                    ]
	                )
	                image, label = transform((image, label))
	        elif self.transforms == "v2":
	            # grow the bbox by 7% of its width/height on every side,
	            # clamped to the image borders, and crop to that window
	            left, top, width, height = label.bbox

	            right = left + width
	            bottom = top + height

	            width_delta = width * 0.07
	            height_delta = height * 0.07

	            left = max(0, int(left - width_delta))
	            top = max(0, int(top - height_delta))

	            right = min(image.width, int(right + width_delta))
	            bottom = min(image.height, int(bottom + height_delta))

	            width = right - left
	            height = bottom - top

	            image = TF.crop(image, top, left, height, width)
	            label.image_width = width
	            label.image_height = height

	            # encode label
	            label = self.fontlabel2tensor(label, label_path)

	            transform = transforms.Compose(
	                [
	                    RandomColorJitter(preserve=0.2),
	                    RandomCrop(crop_factor=0.54, preserve=0),
	                    RandomRotate(preserve=0.2),
	                ]
	            )
	            image, label = transform((image, label))

	            # blur with a random odd kernel size of 1, 3 or 5
	            transform = transforms.GaussianBlur(
	                random.randint(1, 3) * 2 - 1, sigma=(0.1, 5.0)
	            )

	            image = transform(image)

	        # resize and to tensor
	        transform = transforms.Compose(
	            [
	                transforms.Resize((config.INPUT_SIZE, config.INPUT_SIZE)),
	                transforms.ToTensor(),
	            ]
	        )
	        image = transform(image)

	        if self.transforms == "v2":
	            # additive gaussian noise with a random strength below 0.05,
	            # applied to 90% of the samples
	            if random.random() < 0.9:
	                image = image + torch.randn_like(image) * random.random() * 0.05

	        # normalize label: map entries 2-11 from [0, 1] scale to [-1, 1] scale
	        if self.regression_use_tanh:
	            label[2:12] = label[2:12] * 2 - 1

	        return image, label


	class FontDataModule(LightningDataModule):
	    """Lightning data module bundling the train, val and test ``FontDataset``s.

	    Args:
	        config_path: font config file handed to every dataset.
	        train_path: directory of the training split.
	        val_path: directory of the validation split.
	        test_path: directory of the test split.
	        train_shuffle: ``shuffle`` flag of the training loader.
	        val_shuffle: ``shuffle`` flag of the validation loader.
	        test_shuffle: ``shuffle`` flag of the test loader.
	        train_transforms: ``transforms`` option of the training dataset
	            (None, 'v1' or 'v2').
	        val_transforms: ``transforms`` option of the validation dataset.
	        test_transforms: ``transforms`` option of the test dataset.
	        crop_roi_bbox: handed to every dataset.
	        regression_use_tanh: handed to every dataset.
	        **kwargs: forwarded unchanged to every ``DataLoader``
	            (e.g. ``batch_size``, ``num_workers``).
	    """

	    def __init__(
	        self,
	        config_path: str = "configs/font.yml",
	        train_path: str = "./dataset/font_img/train",
	        val_path: str = "./dataset/font_img/val",
	        test_path: str = "./dataset/font_img/test",
	        train_shuffle: bool = True,
	        val_shuffle: bool = False,
	        test_shuffle: bool = False,
	        train_transforms: str = None,
	        val_transforms: str = None,
	        test_transforms: str = None,
	        crop_roi_bbox: bool = False,
	        regression_use_tanh: bool = False,
	        **kwargs,
	    ):
	        super().__init__()
	        self.dataloader_args = kwargs
	        self.train_shuffle = train_shuffle
	        self.val_shuffle = val_shuffle
	        self.test_shuffle = test_shuffle
	        self.train_dataset = FontDataset(
	            train_path,
	            config_path,
	            regression_use_tanh,
	            train_transforms,
	            crop_roi_bbox,
	        )
	        self.val_dataset = FontDataset(
	            val_path, config_path, regression_use_tanh, val_transforms, crop_roi_bbox
	        )
	        self.test_dataset = FontDataset(
	            test_path, config_path, regression_use_tanh, test_transforms, crop_roi_bbox
	        )

	    def get_train_num_iter(self, num_device: int) -> int:
	        """Return the number of training batches per device, rounded up.

	        Args:
	            num_device: number of devices the training set is split across.
	        """
	        # Fall back to 1, the DataLoader default, when no batch_size was
	        # passed, instead of failing with a KeyError.
	        batch_size = self.dataloader_args.get("batch_size", 1)
	        return math.ceil(len(self.train_dataset) / (batch_size * num_device))

	    def train_dataloader(self):
	        """Build the loader over the training dataset."""
	        return DataLoader(
	            self.train_dataset,
	            shuffle=self.train_shuffle,
	            **self.dataloader_args,
	        )

	    def val_dataloader(self):
	        """Build the loader over the validation dataset."""
	        return DataLoader(
	            self.val_dataset,
	            shuffle=self.val_shuffle,
	            **self.dataloader_args,
	        )

	    def test_dataloader(self):
	        """Build the loader over the test dataset."""
	        return DataLoader(
	            self.test_dataset,
	            shuffle=self.test_shuffle,
	            **self.dataloader_args,
	        )