# Copyright (c) 2023-2024 DeepSeek.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

from typing import List, Tuple, Union

import numpy as np
import torch
import torchvision
import torchvision.transforms.functional
from PIL import Image
from transformers import AutoImageProcessor, PretrainedConfig
from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
from transformers.image_utils import to_numpy_array
from transformers.utils import logging

logger = logging.get_logger(__name__)

ImageType = Union[np.ndarray, torch.Tensor, Image.Image]

IMAGENET_MEAN = (0.48145466, 0.4578275, 0.40821073)
IMAGENET_STD = (0.26862954, 0.26130258, 0.27577711)
IMAGENET_INCEPTION_MEAN = (0.5, 0.5, 0.5)
IMAGENET_INCEPTION_STD = (0.5, 0.5, 0.5)


def expand2square(pil_img, background_color):
    """Pad a PIL image with `background_color` so it becomes square,
    centering the original content along the shorter axis."""
    width, height = pil_img.size
    if width == height:
        return pil_img
    elif width > height:
        # Landscape: pad equally above and below.
        result = Image.new(pil_img.mode, (width, width), background_color)
        result.paste(pil_img, (0, (width - height) // 2))
        return result
    else:
        # Portrait: pad equally left and right.
        result = Image.new(pil_img.mode, (height, height), background_color)
        result.paste(pil_img, ((height - width) // 2, 0))
        return result
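
# Example (a sketch, not exercised anywhere in this module): padding a 200x100
# landscape image with mid-gray should yield a 200x200 square, with 50 rows of
# background above and 50 below the original content:
#
#     img = Image.new("RGB", (200, 100))
#     squared = expand2square(img, (127, 127, 127))
#     assert squared.size == (200, 200)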


class VLMImageProcessorConfig(PretrainedConfig):
    model_type = "deepseek_vlm"
    image_size: int
    min_size: int
    image_mean: Union[Tuple[float, float, float], List[float]]
    image_std: Union[Tuple[float, float, float], List[float]]
    rescale_factor: float
    do_normalize: bool

    def __init__(
        self,
        image_size: int,
        min_size: int = 14,
        image_mean: Union[Tuple[float, float, float], List[float]] = IMAGENET_MEAN,
        image_std: Union[Tuple[float, float, float], List[float]] = IMAGENET_STD,
        rescale_factor: float = 1.0 / 255.0,
        do_normalize: bool = True,
        **kwargs,
    ):
        self.image_size = image_size
        self.min_size = min_size
        self.image_mean = image_mean
        self.image_std = image_std
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        super().__init__(**kwargs)


class VLMImageProcessor(BaseImageProcessor):
    model_input_names = ["pixel_values"]

    def __init__(
        self,
        image_size: int,
        min_size: int = 14,
        image_mean: Union[Tuple[float, float, float], List[float]] = IMAGENET_MEAN,
        image_std: Union[Tuple[float, float, float], List[float]] = IMAGENET_STD,
        rescale_factor: float = 1.0 / 255.0,
        do_normalize: bool = True,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.image_size = image_size
        self.rescale_factor = rescale_factor
        self.image_mean = image_mean
        self.image_std = image_std
        self.min_size = min_size
        self.do_normalize = do_normalize

        # Pad with the per-channel mean (scaled back to [0, 255]) so padded
        # pixels normalize to roughly zero; fall back to mid-gray if no mean
        # is given.
        if image_mean is None:
            self.background_color = (127, 127, 127)
        else:
            self.background_color = tuple(int(x * 255) for x in image_mean)

    def resize(self, pil_img: Image.Image) -> np.ndarray:
        """Resize so the longer side equals `self.image_size` (preserving the
        aspect ratio, with each side at least `self.min_size`), pad to a
        square with the background color, and return a channels-first array.

        Args:
            pil_img (PIL.Image.Image): RGB image of shape [H, W, 3].

        Returns:
            np.ndarray: array of shape [3, self.image_size, self.image_size].
        """
        width, height = pil_img.size
        max_size = max(width, height)

        # torchvision's resize expects (height, width).
        size = [
            max(int(height / max_size * self.image_size), self.min_size),
            max(int(width / max_size * self.image_size), self.min_size),
        ]

        if width <= 0 or height <= 0 or size[0] <= 0 or size[1] <= 0:
            logger.error(f"orig size = {pil_img.size}, new size = {size}")
            raise ValueError("Invalid size!")

        pil_img = torchvision.transforms.functional.resize(
            pil_img,
            size,
            interpolation=torchvision.transforms.functional.InterpolationMode.BICUBIC,
            antialias=True,
        )
        pil_img = expand2square(pil_img, self.background_color)
        x = to_numpy_array(pil_img)

        # [H, W, 3] -> [3, H, W]
        x = np.transpose(x, (2, 0, 1))
        return x
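
    # Worked example (hypothetical numbers): with image_size=1024, a 640x480
    # input resizes to 1024x768 (W x H), is padded to 1024x1024, and comes
    # back as a (3, 1024, 1024) uint8 array, prior to rescaling.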

    def preprocess(self, images, return_tensors: str = "pt", **kwargs) -> BatchFeature:
        # Resize and pad to [self.image_size, self.image_size],
        # then convert from [H, W, 3] to [3, H, W].
        images: List[np.ndarray] = [self.resize(image) for image in images]

        # Rescale from [0, 255] to [0, 1].
        images = [
            self.rescale(
                image=image,
                scale=self.rescale_factor,
                input_data_format="channels_first",
            )
            for image in images
        ]

        # Normalize with the configured per-channel mean and std.
        if self.do_normalize:
            images = [
                self.normalize(
                    image=image,
                    mean=self.image_mean,
                    std=self.image_std,
                    input_data_format="channels_first",
                )
                for image in images
            ]

        data = {"pixel_values": images}
        return BatchFeature(data=data, tensor_type=return_tensors)

    def default_shape(self):
        return [3, self.image_size, self.image_size]


AutoImageProcessor.register(VLMImageProcessorConfig, VLMImageProcessor)


if __name__ == "__main__":
    image_processor = VLMImageProcessor(
        image_size=1024,
        image_mean=IMAGENET_INCEPTION_MEAN,
        image_std=IMAGENET_INCEPTION_STD,
        do_normalize=True,
    )
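
    # Quick smoke test: a minimal sketch on a synthetic RGB image (the
    # 640x480 sample size is arbitrary, chosen only for illustration).
    sample = Image.new("RGB", (640, 480), (255, 0, 0))
    outputs = image_processor.preprocess([sample], return_tensors="pt")
    # Expected: torch.Size([1, 3, 1024, 1024])
    print(outputs["pixel_values"].shape)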