Upload any_res.py

src/data/any_res.py (ADDED, +257 lines)
import ast
import base64
import math
from io import BytesIO

import torch
from PIL import Image

def select_best_resolution(original_size, possible_resolutions):
    """
    Selects the best resolution from a list of possible resolutions based on the original size.

    Args:
        original_size (tuple): The original size of the image in the format (width, height).
        possible_resolutions (list): A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].

    Returns:
        tuple: The best fit resolution in the format (width, height).
    """
    original_width, original_height = original_size
    best_fit = None
    max_effective_resolution = 0
    min_wasted_resolution = float('inf')

    for width, height in possible_resolutions:
        # Scale the original image to fit inside the candidate resolution.
        scale = min(width / original_width, height / original_height)
        downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
        # Effective resolution: how many original pixels survive the downscale.
        effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
        wasted_resolution = (width * height) - effective_resolution

        # Prefer the candidate preserving the most pixels; break ties by least padding waste.
        if effective_resolution > max_effective_resolution or (effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution):
            max_effective_resolution = effective_resolution
            min_wasted_resolution = wasted_resolution
            best_fit = (width, height)

    return best_fit
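
# Example (a hedged sketch with hypothetical pinpoints): for a 1000x600 image,
# (672, 672) wins because it preserves the most original pixels after downscaling.
# >>> select_best_resolution((1000, 600), [(336, 672), (672, 336), (672, 672)])
# (672, 672)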

def select_best_resolution_v2(original_size, possible_resolutions):
    """
    Selects the best resolution from a list of possible resolutions based on the original size and aspect ratio.

    Args:
        original_size (tuple): The original size of the image in the format (width, height).
        possible_resolutions (list): A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].

    Returns:
        tuple: The best fit resolution in the format (width, height).
    """
    original_width, original_height = original_size
    original_aspect_ratio = original_height / original_width
    original_area = original_width * original_height
    best_fit = None
    min_aspect_ratio_diff = float('inf')
    min_area_ratio = float('inf')

    for width, height in possible_resolutions:
        aspect_ratio = height / width
        area = width * height
        # Ratios >= 1 measuring how far the candidate is from the original in shape and in size.
        aspect_ratio_diff = max(aspect_ratio, original_aspect_ratio) / min(aspect_ratio, original_aspect_ratio)
        area_ratio = max(area, original_area) / min(area, original_area)

        # Prefer the closest aspect ratio; break ties by the closest area.
        if aspect_ratio_diff < min_aspect_ratio_diff or (aspect_ratio_diff == min_aspect_ratio_diff and area_ratio < min_area_ratio):
            min_aspect_ratio_diff = aspect_ratio_diff
            min_area_ratio = area_ratio
            best_fit = (width, height)

    return best_fit
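
# Example (a hedged sketch, same hypothetical pinpoints as above): the v2 strategy
# favors shape over pixel count, so the landscape candidate wins for a 1000x600 image.
# >>> select_best_resolution_v2((1000, 600), [(336, 672), (672, 336), (672, 672)])
# (672, 336)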

def resize_and_pad_image(image, target_resolution, keep_ratio=False):
    """
    Resize and pad an image to a target resolution.

    Args:
        image (PIL.Image.Image): The input image.
        target_resolution (tuple): The target resolution (width, height) of the image.
        keep_ratio (bool): Whether to preserve the aspect ratio and pad the remainder with black.

    Returns:
        PIL.Image.Image: The resized and padded image.
    """
    original_width, original_height = image.size
    target_width, target_height = target_resolution

    if keep_ratio:
        # Maintain the aspect ratio: scale by the tighter dimension, then center on a black canvas.
        scale_w = target_width / original_width
        scale_h = target_height / original_height

        if scale_w < scale_h:
            new_width = target_width
            new_height = min(math.ceil(original_height * scale_w), target_height)
        else:
            new_height = target_height
            new_width = min(math.ceil(original_width * scale_h), target_width)

        # Resize the image
        resized_image = image.resize((new_width, new_height))

        new_image = Image.new('RGB', (target_width, target_height), (0, 0, 0))
        paste_x = (target_width - new_width) // 2
        paste_y = (target_height - new_height) // 2
        new_image.paste(resized_image, (paste_x, paste_y))
    else:
        # Stretch to the target resolution without preserving the aspect ratio.
        new_image = image.resize((target_width, target_height))

    return new_image
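
# Example (a hedged sketch): padding a wide image onto a square canvas. The image is
# scaled to 672x404 to preserve its ratio, then centered on a 672x672 black canvas.
# >>> img = Image.new('RGB', (1000, 600))
# >>> resize_and_pad_image(img, (672, 672), keep_ratio=True).size
# (672, 672)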

def divide_to_patches(image, patch_size):
    """
    Divides an image into patches of a specified size.

    Args:
        image (PIL.Image.Image): The input image.
        patch_size (int): The size of each patch.

    Returns:
        list: A list of PIL.Image.Image objects representing the patches.
    """
    patches = []
    width, height = image.size
    for i in range(0, height, patch_size):
        for j in range(0, width, patch_size):
            box = (j, i, j + patch_size, i + patch_size)
            patch = image.crop(box)
            patches.append(patch)

    return patches
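
# Example (a hedged sketch): a 672x672 canvas with patch_size=336 yields a 2x2 grid,
# i.e. four 336x336 tiles in row-major order.
# >>> canvas = Image.new('RGB', (672, 672))
# >>> len(divide_to_patches(canvas, 336))
# 4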

def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
    """
    Calculate the shape of the image patch grid after the preprocessing for images of any resolution.

    Args:
        image_size (tuple): The size of the input image in the format (width, height).
        grid_pinpoints (str): A string representation of a list of possible resolutions.
        patch_size (int): The size of each image patch.

    Returns:
        tuple: The shape of the image patch grid in the format (width, height).
    """
    if isinstance(grid_pinpoints, list):
        possible_resolutions = grid_pinpoints
    else:
        possible_resolutions = ast.literal_eval(grid_pinpoints)
    # Run both selection strategies and keep whichever proposes the smaller canvas.
    width1, height1 = select_best_resolution(image_size, possible_resolutions)
    width2, height2 = select_best_resolution_v2(image_size, possible_resolutions)
    if width1 * height1 > width2 * height2:
        width, height = width2, height2
    else:
        width, height = width1, height1
    return width // patch_size, height // patch_size
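
# Example (a hedged sketch): grid_pinpoints may arrive as a literal string, e.g. from a
# config. For a 1000x600 image, v1 picks (672, 672) and v2 picks (672, 336); the smaller
# canvas wins, giving a 2x1 grid of 336-pixel patches.
# >>> get_anyres_image_grid_shape((1000, 600), '[(336, 672), (672, 336), (672, 672)]', 336)
# (2, 1)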

def process_anyres_image(image, image_transform, grid_pinpoints, base_image_size):
    """
    Process an image with variable resolutions.

    Args:
        image (PIL.Image.Image): The input image to be processed.
        image_transform: The image processor object.
        grid_pinpoints (str): A string representation of a list of possible resolutions.
        base_image_size (int): The side length of each square patch.

    Returns:
        tuple: A tensor containing the processed image patches (n_patches+1, ...) and a
            tensor of normalized patch-center positions (n_patches+1, 2).
    """
    if isinstance(grid_pinpoints, list):
        possible_resolutions = grid_pinpoints
    else:
        possible_resolutions = ast.literal_eval(grid_pinpoints)
    # Run both selection strategies and keep whichever proposes the smaller canvas.
    width1, height1 = select_best_resolution(image.size, possible_resolutions)
    width2, height2 = select_best_resolution_v2(image.size, possible_resolutions)
    if width1 * height1 > width2 * height2:
        width, height = width2, height2
    else:
        width, height = width1, height1
    best_resolution = [width, height]

    image_padded = resize_and_pad_image(image, best_resolution)

    patches = divide_to_patches(image_padded, base_image_size)

    image_original_resize = image.resize((base_image_size, base_image_size))

    image_patches = patches + [image_original_resize]  # add the original image as the last patch
    image_patches = [image_transform(image_patch) for image_patch in image_patches]

    # Normalized (x, y) center coordinates of every grid patch, in [0, 1].
    patch_grid = (best_resolution[0] // base_image_size, best_resolution[1] // base_image_size)
    x_index = (torch.arange(patch_grid[0]).repeat(patch_grid[1], 1) + 0.5) / patch_grid[0]
    y_index = (torch.arange(patch_grid[1]).unsqueeze(1).repeat(1, patch_grid[0]) + 0.5) / patch_grid[1]
    patch_pos = torch.stack([x_index, y_index], dim=-1).flatten(0, 1)  # h*w, 2

    # The global (whole-image) patch sits at the canvas center.
    origin_pos = torch.tensor([[0.5, 0.5]])
    patch_pos = torch.cat([patch_pos, origin_pos], dim=0)  # h*w+1, 2

    return torch.stack(image_patches, dim=0), patch_pos
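
# Example (a hedged sketch, assuming torchvision is available): with a plain ToTensor()
# transform and base_image_size=336, a 1000x600 image maps to a (672, 336) canvas,
# i.e. 2 grid patches plus the global resize, so 3 patches in total.
# >>> from torchvision.transforms import ToTensor
# >>> img = Image.new('RGB', (1000, 600))
# >>> patches, pos = process_anyres_image(img, ToTensor(), '[(336, 672), (672, 336), (672, 672)]', 336)
# >>> patches.shape, pos.shape
# (torch.Size([3, 3, 336, 336]), torch.Size([3, 2]))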

def load_image_from_base64(image):
    return Image.open(BytesIO(base64.b64decode(image)))
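
# Example (a hedged sketch): round-tripping a PIL image through a base64-encoded PNG.
# >>> buf = BytesIO(); Image.new('RGB', (8, 8)).save(buf, format='PNG')
# >>> load_image_from_base64(base64.b64encode(buf.getvalue())).size
# (8, 8)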

def anyres_data_collate(batch, tokenizer, dataset_name=None):
    results = {}
    keys = batch[0].keys()

    for key in keys:
        cur = [batch[i][key] for i in range(len(batch)) if batch[i][key] is not None]
        if len(cur) == 0:
            results[key] = None
        elif isinstance(cur[0], torch.Tensor):
            if key in ['embeds_gen_mask', 'embeds_cmp_mask', 'images', 'images_patch_length', 'patch_position', 'image_size']:
                # Per-image tensors have a variable first dimension, so concatenate instead of stacking.
                results[key] = torch.cat(cur, dim=0)
            else:
                # Variable-length sequences are right-padded to the batch maximum.
                if key in ['input_ids']:
                    results[key] = torch.nn.utils.rnn.pad_sequence(cur, batch_first=True, padding_value=tokenizer.pad_token_id)
                elif key in ['attention_mask']:
                    results[key] = torch.nn.utils.rnn.pad_sequence(cur, batch_first=True, padding_value=0)
                elif key in ['labels']:
                    # -100 is ignored by PyTorch cross-entropy loss.
                    results[key] = torch.nn.utils.rnn.pad_sequence(cur, batch_first=True, padding_value=-100)
                elif key in ['ids_gen_mask', 'ids_cmp_mask']:
                    results[key] = torch.nn.utils.rnn.pad_sequence(cur, batch_first=True, padding_value=False)
                else:
                    results[key] = torch.stack(cur, dim=0)
        else:
            results[key] = cur

    results['dataset_name'] = dataset_name

    return results
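
# Example (a hedged sketch with hypothetical field values and a stub tokenizer): two
# samples whose input_ids differ in length are right-padded, while per-image tensors
# are concatenated along the first dimension.
# >>> class Tok: pad_token_id = 0
# >>> batch = [{'input_ids': torch.tensor([1, 2]), 'images': torch.zeros(2, 4)},
# ...          {'input_ids': torch.tensor([3, 4, 5]), 'images': torch.zeros(1, 4)}]
# >>> out = anyres_data_collate(batch, Tok(), dataset_name='demo')
# >>> out['input_ids'].shape, out['images'].shape
# (torch.Size([2, 3]), torch.Size([3, 4]))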

def anyres_data_collate_old(batch, dataset_name=None):
    results = {}
    keys = batch[0].keys()

    for key in keys:
        cur = [batch[i][key] for i in range(len(batch)) if batch[i][key] is not None]
        if len(cur) == 0:
            results[key] = None
        elif isinstance(cur[0], torch.Tensor):
            if key in ['embeds_gen_mask', 'embeds_cmp_mask', 'images', 'images_patch_length', 'patch_position', 'image_size']:
                results[key] = torch.cat(cur, dim=0)
            else:
                results[key] = torch.stack(cur, dim=0)
        else:
            results[key] = cur

    results['dataset_name'] = dataset_name

    return results