princepride committed on
Commit 14c75cc
1 Parent(s): 45fe1ae

Upload 3 files

Files changed (2)
  1. preprocessor_config.json +24 -0
  2. processing_minicpmv.py +216 -0
preprocessor_config.json ADDED
@@ -0,0 +1,24 @@
+ {
+     "image_processor_type": "MiniCPMVImageProcessor",
+     "auto_map": {
+         "AutoProcessor": "processing_minicpmv.MiniCPMVProcessor",
+         "AutoImageProcessor": "image_processing_minicpmv.MiniCPMVImageProcessor"
+     },
+     "processor_class": "MiniCPMVProcessor",
+     "max_slice_nums": 9,
+     "scale_resolution": 448,
+     "patch_size": 14,
+     "use_image_id": true,
+     "image_feature_size": 64,
+     "im_start": "<image>",
+     "im_end": "</image>",
+     "slice_start": "<slice>",
+     "slice_end": "</slice>",
+     "unk": "<unk>",
+     "im_id_start": "<image_id>",
+     "im_id_end": "</image_id>",
+     "slice_mode": true,
+     "norm_mean": [0.5, 0.5, 0.5],
+     "norm_std": [0.5, 0.5, 0.5],
+     "version": 2.6
+ }
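
The `auto_map` block above is what lets the generic Auto classes resolve to the custom classes shipped in this repository. A minimal loading sketch, assuming the repository id below is a placeholder and that `image_processing_minicpmv.py` is also present in the repo (`trust_remote_code=True` is needed because the processor code lives in the repository rather than in `transformers`):

from transformers import AutoProcessor, AutoImageProcessor

repo_id = "your-namespace/MiniCPM-V"  # placeholder repository id

# "AutoProcessor" resolves to processing_minicpmv.MiniCPMVProcessor and
# "AutoImageProcessor" to image_processing_minicpmv.MiniCPMVImageProcessor
# via the auto_map above.
processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)
image_processor = AutoImageProcessor.from_pretrained(repo_id, trust_remote_code=True)

print(type(processor).__name__)        # MiniCPMVProcessor
print(type(image_processor).__name__)  # MiniCPMVImageProcessor
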
processing_minicpmv.py ADDED
@@ -0,0 +1,216 @@
+ # coding=utf-8
+ # Copyright 2024 The HuggingFace Inc. team.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """
+ Processor class for MiniCPMV.
+ """
+
+ from typing import List, Optional, Union, Dict, Any
+ import torch
+ import re
+
+ from transformers.image_processing_utils import BatchFeature
+ from transformers.image_utils import ImageInput
+ from transformers.processing_utils import ProcessorMixin
+ from transformers.tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
+ from transformers.utils import TensorType, requires_backends, is_torch_dtype, is_torch_device
+
+ from .image_processing_minicpmv import MiniCPMVBatchFeature
+
+
+ class MiniCPMVProcessor(ProcessorMixin):
+     r"""
+     Constructs a MiniCPMV processor which wraps a MiniCPMV image processor and a MiniCPMV tokenizer into a single processor.
+
+     [`MiniCPMVProcessor`] offers all the functionalities of [`MiniCPMVImageProcessor`] and [`LlamaTokenizerWrapper`]. See the
+     [`~MiniCPMVProcessor.__call__`] and [`~MiniCPMVProcessor.decode`] for more information.
+
+     Args:
+         image_processor ([`MiniCPMVImageProcessor`], *optional*):
+             The image processor is a required input.
+         tokenizer ([`LlamaTokenizerWrapper`], *optional*):
+             The tokenizer is a required input.
+     """
+     attributes = ["image_processor", "tokenizer"]
+     image_processor_class = "AutoImageProcessor"
+     tokenizer_class = "AutoTokenizer"
+
+     def __init__(self, image_processor=None, tokenizer=None):
+         super().__init__(image_processor, tokenizer)
+         self.version = image_processor.version
+
+     def __call__(
+         self,
+         images: ImageInput,
+         max_length: Optional[int] = None,
+         do_pad: Optional[bool] = True,
+         max_slice_nums: Optional[int] = None,
+         use_image_id: Optional[bool] = None,
+         return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
+         **kwargs
+     ) -> MiniCPMVBatchFeature:
+
+         if images is not None:
+             image_inputs = self.image_processor(images, do_pad=do_pad, max_slice_nums=max_slice_nums, return_tensors=return_tensors)
+             return self._convert_images_texts_to_inputs(image_inputs, max_slice_nums=max_slice_nums, use_image_id=use_image_id, max_length=max_length, **kwargs)
+
+     # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
+     def batch_decode(self, *args, **kwargs):
+         """
+         This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+         refer to the docstring of this method for more information.
+         """
+         output_ids = args[0]
+         result_text = []
+         for result in output_ids:
+             result = result[result != 0]  # drop padding ids (0)
+             if result[0] == self.tokenizer.bos_id:
+                 result = result[1:]
+             if result[-1] == self.tokenizer.eos_id:
+                 result = result[:-1]
+             result_text.append(self.tokenizer.decode(result, *args[1:], **kwargs).strip())
+         return result_text
+         # return self.tokenizer.batch_decode(*args, **kwargs)
+
+     # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama
+     def decode(self, *args, **kwargs):
+         """
+         This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+         the docstring of this method for more information.
+         """
+         result = args[0]
+         result = result[result != 0]  # drop padding ids (0)
+         if result[0] == self.tokenizer.bos_id:
+             result = result[1:]
+         if result[-1] == self.tokenizer.eos_id or (hasattr(self.tokenizer, "eot_id") and result[-1] == self.tokenizer.eot_id):
+             result = result[:-1]
+         return self.tokenizer.decode(result, *args[1:], **kwargs).strip()
+
+     def _convert(
+         self, input_str, max_inp_length: Optional[int] = None
+     ):
+         if self.version > 2.5 or not getattr(self.tokenizer, "add_bos_token", False):
+             input_ids = self.tokenizer.encode(input_str)
+         else:
+             input_ids = [self.tokenizer.bos_id] + self.tokenizer.encode(input_str)
+         if max_inp_length is not None:
+             input_ids = input_ids[:max_inp_length]
+         input_ids = torch.tensor(input_ids, dtype=torch.int32)
+
+         start_cond = (input_ids == self.tokenizer.im_start_id) | (input_ids == self.tokenizer.slice_start_id)
+         end_cond = (input_ids == self.tokenizer.im_end_id) | (input_ids == self.tokenizer.slice_end_id)
+
+         image_start_tokens = torch.where(start_cond)[0]
+         image_start_tokens += 1  # bounds start just after the <image>/<slice> token
+         image_end_tokens = torch.where(end_cond)[0]
+
+         valid_image_nums = max(len(image_start_tokens), len(image_end_tokens))
+
+         image_bounds = torch.hstack(
+             [
+                 image_start_tokens[:valid_image_nums].unsqueeze(-1),
+                 image_end_tokens[:valid_image_nums].unsqueeze(-1),
+             ]
+         )
+         return input_ids, image_bounds
+
+     def _convert_images_texts_to_inputs(
+         self,
+         images,
+         truncation=None,
+         max_length=None,
+         max_slice_nums=None,
+         use_image_id=None,
+         return_tensors=None,
+         **kwargs
+     ):
+
+         pattern = "(<image>./</image>)"
+         images, image_sizes, tgt_sizes = images["pixel_values"], images["image_sizes"], images["tgt_sizes"]
+
+         input_ids_list = []
+         image_bounds_list = []
+         padded_input_ids, padding_lengths = self.pad(
+             input_ids_list,
+             padding_side="left"
+         )
+         for i, length in enumerate(padding_lengths):
+             image_bounds_list[i] = image_bounds_list[i] + length  # shift bounds by the left padding
+         attention_mask = padded_input_ids.ne(0)
+
+         return MiniCPMVBatchFeature(data={
+             "input_ids": padded_input_ids,
+             "attention_mask": attention_mask,
+             "pixel_values": images,
+             "image_sizes": image_sizes,
+             "image_bound": image_bounds_list,
+             "tgt_sizes": tgt_sizes
+         })
+
+     @property
+     # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names
+     def model_input_names(self):
+         tokenizer_input_names = self.tokenizer.model_input_names
+         image_processor_input_names = self.image_processor.model_input_names
+         return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+
+
+     def pad(self, inputs, max_length=None, padding_value=0, padding_side="left"):
+         items = []
+         if isinstance(inputs[0], list):
+             assert isinstance(inputs[0][0], torch.Tensor)
+             for it in inputs:
+                 for tr in it:
+                     items.append(tr)
+         else:
+             assert isinstance(inputs[0], torch.Tensor)
+             items = inputs
+
+         batch_size = len(items)
+         shape = items[0].shape
+         dim = len(shape)
+         assert dim <= 2
+         if max_length is None:
+             max_length = 0
+         max_length = max(max_length, max(item.shape[-1] for item in items))
+         min_length = min(item.shape[-1] for item in items)
+         dtype = items[0].dtype
+
+         if dim == 0:
+             return torch.stack([item for item in items], dim=0), [0]
+         elif dim == 1:
+             if max_length == min_length:
+                 return torch.stack([item for item in items], dim=0), [0] * batch_size
+             tensor = torch.zeros((batch_size, max_length), dtype=dtype) + padding_value
+         else:
+             tensor = (
+                 torch.zeros((batch_size, max_length, shape[-1]), dtype=dtype)
+                 + padding_value
+             )
+
+         padding_length = []
+         for i, item in enumerate(items):
+             if dim == 1:
+                 if padding_side == "left":
+                     tensor[i, -len(item) :] = item.clone()
+                 else:
+                     tensor[i, : len(item)] = item.clone()
+             elif dim == 2:
+                 if padding_side == "left":
+                     tensor[i, -len(item) :, :] = item.clone()
+                 else:
+                     tensor[i, : len(item), :] = item.clone()
+             padding_length.append(tensor.shape[-1] - len(item))  # amount of padding added to this item
+
+         return tensor, padding_length
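
The `pad` helper above never touches `self`, so its padding behavior can be checked in isolation. A small sketch (calling the method unbound purely for illustration):

import torch

a = torch.tensor([1, 2, 3], dtype=torch.int32)
b = torch.tensor([4, 5], dtype=torch.int32)

# Left-pad both sequences with 0 to the longest length and report how much
# padding each item received.
padded, pad_lengths = MiniCPMVProcessor.pad(None, [a, b], padding_value=0, padding_side="left")

print(padded)       # tensor([[1, 2, 3], [0, 4, 5]], dtype=torch.int32)
print(pad_lengths)  # [0, 1]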