mpatel57 committed on
Commit
56ef57e
1 Parent(s): c6551da

Upload folder using huggingface_hub

Browse files
text-encoder/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "CLIPTextEncoderOnly"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "auto_map": {
7
+ "AutoConfig": "utils.CLIPTextEncoderOnlyConfig",
8
+ "AutoModel": "utils.CLIPTextEncoderOnly"
9
+ },
10
+ "bos_token_id": 49406,
11
+ "eos_token_id": 49407,
12
+ "frozen": false,
13
+ "hidden_act": "quick_gelu",
14
+ "hidden_size": 512,
15
+ "initializer_factor": 1.0,
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 2048,
18
+ "layer_norm_eps": 1e-05,
19
+ "lora": null,
20
+ "max_position_embeddings": 77,
21
+ "model_name": "openai/clip-vit-base-patch32",
22
+ "model_type": "clip_custom_text_model",
23
+ "num_attention_heads": 8,
24
+ "num_hidden_layers": 12,
25
+ "pad_token_id": 1,
26
+ "pretrained": false,
27
+ "projection_dim": 512,
28
+ "torch_dtype": "float32",
29
+ "transformers_version": "4.40.1",
30
+ "vocab_size": 49408
31
+ }
text-encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2fa57103fa2a3587ead3775b9c64fccc9edf97ff95dc59c836c410633fa3808b
3
+ size 253736912
text-encoder/utils.py ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoConfig, AutoModel, PretrainedConfig, CLIPTextConfig, CLIPVisionConfig, PreTrainedModel, CLIPTextModelWithProjection, CLIPVisionModelWithProjection
2
+ from transformers.utils import ModelOutput
3
+ import torch
4
+ import open_clip
5
+ from dataclasses import dataclass
6
+ import safetensors.torch
7
+ from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
8
+ import os
9
+
10
+ HF_SAFE_WEIGHTS_NAME = "open_clip_model.safetensors"
11
+ HF_SAFE_WEIGHTS_NAME_PRIOR = "prior_model.safetensors"
12
+
13
@dataclass
class PriorTransformerOutput(ModelOutput):
    """
    The output of [`PriorTransformer`].

    In this file the container is returned by `CustomPriorModel.forward`.

    Args:
        predicted_image_embedding (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`):
            The predicted CLIP image embedding conditioned on the CLIP text embedding input.
    """

    # Required field (no default): callers must always supply the prediction.
    predicted_image_embedding: torch.FloatTensor
24
+
25
@dataclass
class TextEncoderOutput(ModelOutput):
    """
    Output container shared by the text encoders in this file, in Hugging Face `ModelOutput` style.

    Attributes:
        text_embeds (torch.FloatTensor): The text embedding of the input prompts
            (`outputs.text_embeds` for `CLIPTextEncoderOnly`; the `fc1` projection of
            `outputs[1]` for `CustomTextEncoderOnly`).
        last_hidden_state (torch.FloatTensor): The last hidden state returned by the wrapped
            model (optionally passed through `fc2` in `CustomTextEncoderOnly`).
    """
    text_embeds: torch.FloatTensor = None
    last_hidden_state: torch.FloatTensor = None
36
+
37
class CLIPTextEncoderOnlyConfig(CLIPTextConfig):
    """
    Configuration for `CLIPTextEncoderOnly`.

    Extends `CLIPTextConfig` with the options of the wrapper itself:

    :param model_name: Checkpoint name or path handed to `from_pretrained` by the model class.
    :param pretrained: Whether the model class loads pretrained weights or only the architecture.
    :param frozen: Stored on the config; not consulted by `CLIPTextEncoderOnly` in this file.
    :param lora: Optional LoRA settings (`lora_r`, `lora_alpha`, `lora_dropout`); falsy disables LoRA.
    :param kwargs: Forwarded untouched to `CLIPTextConfig.__init__`.
    """

    model_type = "clip_custom_text_model"

    def __init__(
        self,
        model_name: str = None,
        pretrained: bool = True,
        frozen: bool = False,
        lora: dict = None,
        **kwargs,
    ):
        # Wrapper-specific fields are recorded before the parent constructor runs,
        # in the same order as they appear in the signature.
        self.model_name, self.pretrained = model_name, pretrained
        self.frozen, self.lora = frozen, lora
        super().__init__(**kwargs)
46
+
47
class CLIPTextEncoderOnly(PreTrainedModel):
    """
    `CLIPTextModelWithProjection` wrapped as a `PreTrainedModel`, with optional LoRA adapters.
    """

    config_class = CLIPTextEncoderOnlyConfig

    def __init__(self, config):
        """
        Build the wrapped CLIP text model described by ``config``.

        :param config: A `CLIPTextEncoderOnlyConfig`. ``config.model_name`` selects the checkpoint,
            ``config.pretrained`` chooses between loading weights and instantiating the bare
            architecture, and a truthy ``config.lora`` enables LoRA.
        """
        super().__init__(config)

        if config.pretrained:
            self.model = CLIPTextModelWithProjection.from_pretrained(config.model_name)
        else:
            # Architecture only: fetch the base config, weights stay randomly initialised.
            base_cfg = CLIPTextConfig.from_pretrained(config.model_name)
            self.model = CLIPTextModelWithProjection(base_cfg)

        # NOTE(review): config.frozen is never read in this class.
        if config.lora:
            l_config = LoraConfig(
                r=self._lora_option(config.lora, "lora_r"),
                lora_alpha=self._lora_option(config.lora, "lora_alpha"),
                target_modules=[
                    "k_proj",
                    "v_proj",
                    "q_proj",
                    "out_proj",
                    "fc1",
                    "fc2",
                    "visual_projection",
                    "text_projection"
                ],
                lora_dropout=self._lora_option(config.lora, "lora_dropout"),
                bias="lora_only",
            )
            self.model = get_peft_model(self.model, l_config)

    @staticmethod
    def _lora_option(lora, key):
        """Read one LoRA setting from a plain dict or from an attribute-style object."""
        # The config stores `lora` as a dict (and JSON round-trips yield a dict), while
        # callers may also pass an attribute-style config object; support both.
        if isinstance(lora, dict):
            return lora[key]
        return getattr(lora, key)

    def forward(self, input_ids, attention_mask=None, position_ids=None):
        """
        Forward pass of the model.

        :param input_ids: Indices of input sequence tokens in the vocabulary.
        :param attention_mask: Mask to avoid performing attention on padding token indices.
        :param position_ids: Optional position indices, passed through to the wrapped model.
        :return: A `TextEncoderOutput` holding ``text_embeds`` and ``last_hidden_state``.
        """
        # Hidden states are requested from the wrapped model but are not part of the returned output.
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, output_hidden_states=True)
        return TextEncoderOutput(text_embeds=outputs.text_embeds, last_hidden_state=outputs.last_hidden_state)
96
+
97
class CustomTextEncoderOnly(PreTrainedModel):
    """
    Text encoder built on any `AutoModel` checkpoint, with linear heads that project the
    encoder outputs to ``output_hidden_size``.
    """

    def __init__(self, model_name: str, output_hidden_size: int, pretrained: bool = True, frozen: bool = True, last_hidden_state: bool = False, lora: dict = None):
        """
        Build the wrapped encoder and its projection layers.

        :param model_name: The name or path of the pretrained model.
        :param output_hidden_size: Output width of the ``fc1`` / ``fc2`` projections.
        :param pretrained: Whether to load the pretrained weights.
        :param frozen: When loading pretrained weights, disable gradients on the encoder.
        :param last_hidden_state: Whether to also project the last hidden state through ``fc2``.
        :param lora: Optional LoRA settings (``lora_r``, ``lora_alpha``, ``lora_dropout``).
        """
        # Only the configuration is needed here; avoid loading the full weights a second time.
        config = AutoConfig.from_pretrained(model_name)
        super().__init__(config)
        self.last_hidden_state = last_hidden_state

        if pretrained:
            self.model = AutoModel.from_pretrained(model_name)
            if frozen:
                for param in self.model.parameters():
                    param.requires_grad = False
        else:
            # AutoModel cannot be instantiated directly; from_config builds the bare architecture.
            self.model = AutoModel.from_config(config)

        self.fc1 = torch.nn.Linear(self.model.config.hidden_size, output_hidden_size)
        if last_hidden_state:
            self.fc2 = torch.nn.Linear(self.model.config.hidden_size, output_hidden_size)

        if lora:
            l_config = LoraConfig(
                task_type=TaskType.FEATURE_EXTRACTION,
                r=self._lora_option(lora, "lora_r"),
                lora_alpha=self._lora_option(lora, "lora_alpha"),
                lora_dropout=self._lora_option(lora, "lora_dropout"),
                bias="lora_only",
            )
            self.model = get_peft_model(self.model, l_config)

    @staticmethod
    def _lora_option(lora, key):
        """Read one LoRA setting from a plain dict or from an attribute-style object."""
        if isinstance(lora, dict):
            return lora[key]
        return getattr(lora, key)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """
        Forward pass of the model.

        :param input_ids: Indices of input sequence tokens in the vocabulary.
        :param attention_mask: Mask to avoid performing attention on padding token indices.
        :param token_type_ids: Segment token indices to indicate first and second portions of the inputs.
        :return: A `TextEncoderOutput` holding ``text_embeds`` and ``last_hidden_state``.
        """
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, output_hidden_states=True)
        # NOTE(review): assumes outputs[0] is the last hidden state and outputs[1] a pooled
        # representation, which depends on the wrapped architecture; confirm for the models used.
        text_embeds = self.fc1(outputs[1])
        if self.last_hidden_state:
            last_hidden_state = self.fc2(outputs[0])
        else:
            last_hidden_state = outputs[0]
        return TextEncoderOutput(text_embeds=text_embeds, last_hidden_state=last_hidden_state)
148
+
149
class CLIPVisionEncoderOnlyConfig(PretrainedConfig):
    """
    Configuration for `CLIPVisionEncoderOnly`.

    :param model_name: Checkpoint name or path handed to `from_pretrained` by the model class.
    :param pretrained: Whether the model class loads pretrained weights or only the architecture.
    :param frozen: Stored on the config; not consulted by `CLIPVisionEncoderOnly` in this file.
    :param lora: Optional LoRA settings (`lora_r`, `lora_alpha`, `lora_dropout`); falsy disables LoRA.
    :param kwargs: Forwarded untouched to `PretrainedConfig.__init__`.
    """

    model_type = "clip_custom_vision_model"

    def __init__(
        self,
        model_name: str = None,
        pretrained: bool = True,
        frozen: bool = False,
        lora: dict = None,
        **kwargs,
    ):
        # Wrapper-specific fields are recorded before the parent constructor runs,
        # in the same order as they appear in the signature.
        self.model_name, self.pretrained = model_name, pretrained
        self.frozen, self.lora = frozen, lora
        super().__init__(**kwargs)
158
+
159
class CLIPVisionEncoderOnly(PreTrainedModel):
    """
    `CLIPVisionModelWithProjection` wrapped as a `PreTrainedModel`, with optional LoRA adapters.
    """

    config_class = CLIPVisionEncoderOnlyConfig

    def __init__(self, config):
        """
        Build the wrapped CLIP vision model described by ``config``.

        :param config: A `CLIPVisionEncoderOnlyConfig`. ``config.model_name`` selects the checkpoint,
            ``config.pretrained`` chooses between loading weights and instantiating the bare
            architecture, and a truthy ``config.lora`` enables LoRA.
        """
        super().__init__(config)

        if config.pretrained:
            self.model = CLIPVisionModelWithProjection.from_pretrained(config.model_name)
        else:
            # Architecture only: fetch the base config, weights stay randomly initialised.
            base_cfg = CLIPVisionConfig.from_pretrained(config.model_name)
            self.model = CLIPVisionModelWithProjection(base_cfg)

        # NOTE(review): config.frozen is never read in this class.
        if config.lora:
            l_config = LoraConfig(
                r=self._lora_option(config.lora, "lora_r"),
                lora_alpha=self._lora_option(config.lora, "lora_alpha"),
                target_modules=[
                    "k_proj",
                    "v_proj",
                    "q_proj",
                    "out_proj",
                    "fc1",
                    "fc2",
                    "visual_projection",
                    "text_projection"
                ],
                lora_dropout=self._lora_option(config.lora, "lora_dropout"),
                bias="lora_only",
            )
            self.model = get_peft_model(self.model, l_config)

    @staticmethod
    def _lora_option(lora, key):
        """Read one LoRA setting from a plain dict or from an attribute-style object."""
        if isinstance(lora, dict):
            return lora[key]
        return getattr(lora, key)

    def forward(self, data):
        """
        Forward pass of the model.

        :param data: Mapping of keyword arguments unpacked into the wrapped model.
        :return: The ``image_embeds`` attribute of the wrapped model's output.
        """
        return self.model(**data).image_embeds

    def parameters(self, recurse: bool = True):
        """
        Return the parameters of the wrapped model.

        :param recurse: Same meaning as in `torch.nn.Module.parameters`; accepted so that
            callers using the standard signature keep working.
        """
        return self.model.parameters(recurse=recurse)
204
+
205
+
206
class OpenCLIPVisionEncoderOnly(torch.nn.Module):
    """
    Vision tower (``.visual``) of an OpenCLIP model loaded from the Hugging Face Hub,
    with optional LoRA adapters.
    """

    def __init__(self, model_name: str, pretrained: bool = True, frozen: bool = False, lora: dict = None):
        """
        Load the OpenCLIP model and keep only its visual encoder.

        :param model_name: Hub repository id; loaded as ``hf-hub:{model_name}``.
        :param pretrained: Must be true; building without pretrained weights is not implemented.
        :param frozen: Accepted for interface parity; not used by this class.
        :param lora: Optional LoRA settings (``lora_r``, ``lora_alpha``, ``lora_dropout``).
        :raises NotImplementedError: If ``pretrained`` is false.
        """
        super().__init__()
        if not pretrained:
            # `raise NotImplemented` would fail with a TypeError; raise the real exception class.
            raise NotImplementedError("OpenCLIPVisionEncoderOnly only supports pretrained=True")

        model, _ = open_clip.create_model_from_pretrained(f"hf-hub:{model_name}")
        self.model = model.visual

        if lora:
            l_config = LoraConfig(
                r=self._lora_option(lora, "lora_r"),
                lora_alpha=self._lora_option(lora, "lora_alpha"),
                target_modules=[
                    "k_proj",
                    "v_proj",
                    "q_proj",
                    "out_proj",
                    "fc1",
                    "fc2",
                    "visual_projection",
                    "text_projection"
                ],
                lora_dropout=self._lora_option(lora, "lora_dropout"),
                bias="lora_only",
            )
            self.model = get_peft_model(self.model, l_config)

    @staticmethod
    def _lora_option(lora, key):
        """Read one LoRA setting from a plain dict or from an attribute-style object."""
        if isinstance(lora, dict):
            return lora[key]
        return getattr(lora, key)

    def forward(self, image):
        """
        Forward pass of the model.

        :param image: Input passed unchanged to the wrapped visual encoder.
        :return: Whatever the wrapped visual encoder returns.
        """
        return self.model(image)

    def save_pretrained(self, save_dir):
        """
        Write the wrapped model's state dict to ``save_dir`` as a safetensors file.

        :param save_dir: Target directory, as a ``str`` or a path-like object.
        """
        tensors = self.model.state_dict()
        # os.path.join accepts both str and os.PathLike, unlike the `/` operator.
        safetensors.torch.save_file(tensors, os.path.join(save_dir, HF_SAFE_WEIGHTS_NAME))
250
+
251
class CustomPriorModel(torch.nn.Module):
    """
    Two-layer MLP prior: ``Linear -> ReLU -> Linear``.
    """

    def __init__(self, in_hidden_state, out_hidden_state):
        """
        Create the two linear layers.

        :param in_hidden_state: Base input width; the first layer consumes ``2 * in_hidden_state`` features.
        :param out_hidden_state: Width of the predicted embedding.
        """
        super().__init__()
        # The hidden layer is as wide as the larger of the two sizes.
        hidden_width = max(in_hidden_state, out_hidden_state)
        doubled_input_width = in_hidden_state * 2

        self.fc1 = torch.nn.Linear(doubled_input_width, hidden_width)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(hidden_width, out_hidden_state)

    def reinitialize_model(self):
        """
        Re-initialise every trainable parameter in place.

        Parameters with more than one dimension get Xavier-uniform values; one-dimensional
        parameters whose name contains ``'weight'`` get standard-normal values; all other
        one-dimensional parameters are zeroed. Parameters with ``requires_grad=False`` are skipped.
        """
        for param_name, tensor in self.named_parameters():
            if not tensor.requires_grad:
                continue
            if tensor.dim() > 1:
                torch.nn.init.xavier_uniform_(tensor)
            elif 'weight' in param_name:
                torch.nn.init.normal_(tensor)
            else:
                torch.nn.init.zeros_(tensor)

    def forward(self, feats):
        """
        Run ``feats`` through ``fc1``, ReLU and ``fc2``.

        :param feats: Input features for ``fc1``.
        :return: A `PriorTransformerOutput` wrapping the prediction.
        """
        hidden = self.fc1(feats)
        activated = self.relu(hidden)
        prediction = self.fc2(activated)
        return PriorTransformerOutput(predicted_image_embedding=prediction)

    def save_pretrained(self, save_dir):
        """
        Intentionally a no-op: nothing is written to ``save_dir``.
        """
        # NOTE: persistence is disabled here; the previous implementation (kept for reference)
        # saved the state dict with safetensors under HF_SAFE_WEIGHTS_NAME_PRIOR:
        # tensors = self.state_dict()
        # safetensors.torch.save_file(tensors, os.path.join(save_dir, HF_SAFE_WEIGHTS_NAME_PRIOR))
        return None
287
+
288
+
289
def test_text_model(register=False, upload=False):
    """
    Manual round-trip check for `CLIPTextEncoderOnly` against the Hugging Face Hub.

    :param register: When true, register the config/model with `AutoConfig` / `AutoModel`
        and mark both classes for auto-class export.
    :param upload: When true, build the model from "openai/clip-vit-base-patch32" and push it
        to the "test-text-hf-upload" repository.
    """
    # register the classes
    if register:
        AutoConfig.register("clip_custom_text_model", CLIPTextEncoderOnlyConfig)
        AutoModel.register(CLIPTextEncoderOnlyConfig, CLIPTextEncoderOnly)
        CLIPTextEncoderOnlyConfig.register_for_auto_class()
        CLIPTextEncoderOnly.register_for_auto_class("AutoModel")

    if upload:
        # Initialize the model
        model_name = "openai/clip-vit-base-patch32"
        pretrained=True
        lora=None

        cfg = CLIPTextEncoderOnlyConfig(model_name=model_name, pretrained=pretrained, lora=lora)
        model = CLIPTextEncoderOnly(cfg)
        model.push_to_hub("test-text-hf-upload")

    # NOTE(review): the scraped source lost its indentation; this download step is assumed to
    # run regardless of `upload` -- confirm against the original file.
    model = CLIPTextEncoderOnly.from_pretrained("mpatel57/test-text-hf-upload", force_download=True)
308
+
309
def test_vision_model(register=False, upload=False):
    """
    Manual round-trip check for `CLIPVisionEncoderOnly` against the Hugging Face Hub.

    :param register: When true, register the config/model with `AutoConfig` / `AutoModel`
        and mark both classes for auto-class export.
    :param upload: When true, build the model from "openai/clip-vit-base-patch32" and push it
        to the "test-vision-hf-upload" repository.
    """
    # register the classes
    if register:
        AutoConfig.register("clip_custom_vision_model", CLIPVisionEncoderOnlyConfig)
        AutoModel.register(CLIPVisionEncoderOnlyConfig, CLIPVisionEncoderOnly)
        CLIPVisionEncoderOnlyConfig.register_for_auto_class()
        CLIPVisionEncoderOnly.register_for_auto_class("AutoModel")

    if upload:
        # Initialize the model
        model_name = "openai/clip-vit-base-patch32"
        pretrained=True
        lora=None

        cfg = CLIPVisionEncoderOnlyConfig(model_name=model_name, pretrained=pretrained, lora=lora)
        model = CLIPVisionEncoderOnly(cfg)
        model.push_to_hub("test-vision-hf-upload")

    # NOTE(review): the scraped source lost its indentation; this download step is assumed to
    # run regardless of `upload` -- confirm against the original file.
    model = CLIPVisionEncoderOnly.from_pretrained("mpatel57/test-vision-hf-upload", force_download=True)
328
+
329
+
330
# Script entry point: runs both Hub round-trip checks with upload enabled, so each one
# builds its model, calls push_to_hub, and then downloads the result again.
if __name__ == "__main__":
    test_text_model(register=False, upload=True)
    test_vision_model(register=False, upload=True)
utils.py ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoConfig, AutoModel, PretrainedConfig, CLIPTextConfig, CLIPVisionConfig, PreTrainedModel, CLIPTextModelWithProjection, CLIPVisionModelWithProjection
2
+ from transformers.utils import ModelOutput
3
+ import torch
4
+ import open_clip
5
+ from dataclasses import dataclass
6
+ import safetensors.torch
7
+ from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
8
+ import os
9
+
10
+ HF_SAFE_WEIGHTS_NAME = "open_clip_model.safetensors"
11
+ HF_SAFE_WEIGHTS_NAME_PRIOR = "prior_model.safetensors"
12
+
13
+ @dataclass
14
+ class PriorTransformerOutput(ModelOutput):
15
+ """
16
+ The output of [`PriorTransformer`].
17
+
18
+ Args:
19
+ predicted_image_embedding (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`):
20
+ The predicted CLIP image embedding conditioned on the CLIP text embedding input.
21
+ """
22
+
23
+ predicted_image_embedding: torch.FloatTensor
24
+
25
+ @dataclass
26
+ class TextEncoderOutput(ModelOutput):
27
+ """
28
+ Output class for CLIPTextEncoderOnly model to store the outputs in a Hugging Face transformer style.
29
+
30
+ Attributes:
31
+ prompt_embeds (torch.Tensor): The embeddings of the input prompts.
32
+ last_hidden_states (torch.Tensor): The last hidden states from the model.
33
+ """
34
+ text_embeds: torch.FloatTensor = None
35
+ last_hidden_state: torch.FloatTensor = None
36
+
37
+ class CLIPTextEncoderOnlyConfig(CLIPTextConfig):
38
+ model_type = "clip_custom_text_model"
39
+
40
+ def __init__(self, model_name: str = None, pretrained: bool = True, frozen: bool = False, lora: dict = None, **kwargs):
41
+ self.model_name = model_name
42
+ self.pretrained = pretrained
43
+ self.frozen = frozen
44
+ self.lora = lora
45
+ super().__init__(**kwargs)
46
+
47
class CLIPTextEncoderOnly(PreTrainedModel):
    """
    `CLIPTextModelWithProjection` wrapped as a `PreTrainedModel`, with optional LoRA adapters.
    """

    config_class = CLIPTextEncoderOnlyConfig

    def __init__(self, config):
        """
        Build the wrapped CLIP text model described by ``config``.

        :param config: A `CLIPTextEncoderOnlyConfig`. ``config.model_name`` selects the checkpoint,
            ``config.pretrained`` chooses between loading weights and instantiating the bare
            architecture, and a truthy ``config.lora`` enables LoRA.
        """
        super().__init__(config)

        if config.pretrained:
            self.model = CLIPTextModelWithProjection.from_pretrained(config.model_name)
        else:
            # Architecture only: fetch the base config, weights stay randomly initialised.
            base_cfg = CLIPTextConfig.from_pretrained(config.model_name)
            self.model = CLIPTextModelWithProjection(base_cfg)

        # NOTE(review): config.frozen is never read in this class.
        if config.lora:
            l_config = LoraConfig(
                r=self._lora_option(config.lora, "lora_r"),
                lora_alpha=self._lora_option(config.lora, "lora_alpha"),
                target_modules=[
                    "k_proj",
                    "v_proj",
                    "q_proj",
                    "out_proj",
                    "fc1",
                    "fc2",
                    "visual_projection",
                    "text_projection"
                ],
                lora_dropout=self._lora_option(config.lora, "lora_dropout"),
                bias="lora_only",
            )
            self.model = get_peft_model(self.model, l_config)

    @staticmethod
    def _lora_option(lora, key):
        """Read one LoRA setting from a plain dict or from an attribute-style object."""
        # The config stores `lora` as a dict (and JSON round-trips yield a dict), while
        # callers may also pass an attribute-style config object; support both.
        if isinstance(lora, dict):
            return lora[key]
        return getattr(lora, key)

    def forward(self, input_ids, attention_mask=None, position_ids=None):
        """
        Forward pass of the model.

        :param input_ids: Indices of input sequence tokens in the vocabulary.
        :param attention_mask: Mask to avoid performing attention on padding token indices.
        :param position_ids: Optional position indices, passed through to the wrapped model.
        :return: A `TextEncoderOutput` holding ``text_embeds`` and ``last_hidden_state``.
        """
        # Hidden states are requested from the wrapped model but are not part of the returned output.
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, output_hidden_states=True)
        return TextEncoderOutput(text_embeds=outputs.text_embeds, last_hidden_state=outputs.last_hidden_state)
96
+
97
class CustomTextEncoderOnly(PreTrainedModel):
    """
    Text encoder built on any `AutoModel` checkpoint, with linear heads that project the
    encoder outputs to ``output_hidden_size``.
    """

    def __init__(self, model_name: str, output_hidden_size: int, pretrained: bool = True, frozen: bool = True, last_hidden_state: bool = False, lora: dict = None):
        """
        Build the wrapped encoder and its projection layers.

        :param model_name: The name or path of the pretrained model.
        :param output_hidden_size: Output width of the ``fc1`` / ``fc2`` projections.
        :param pretrained: Whether to load the pretrained weights.
        :param frozen: When loading pretrained weights, disable gradients on the encoder.
        :param last_hidden_state: Whether to also project the last hidden state through ``fc2``.
        :param lora: Optional LoRA settings (``lora_r``, ``lora_alpha``, ``lora_dropout``).
        """
        # Only the configuration is needed here; avoid loading the full weights a second time.
        config = AutoConfig.from_pretrained(model_name)
        super().__init__(config)
        self.last_hidden_state = last_hidden_state

        if pretrained:
            self.model = AutoModel.from_pretrained(model_name)
            if frozen:
                for param in self.model.parameters():
                    param.requires_grad = False
        else:
            # AutoModel cannot be instantiated directly; from_config builds the bare architecture.
            self.model = AutoModel.from_config(config)

        self.fc1 = torch.nn.Linear(self.model.config.hidden_size, output_hidden_size)
        if last_hidden_state:
            self.fc2 = torch.nn.Linear(self.model.config.hidden_size, output_hidden_size)

        if lora:
            l_config = LoraConfig(
                task_type=TaskType.FEATURE_EXTRACTION,
                r=self._lora_option(lora, "lora_r"),
                lora_alpha=self._lora_option(lora, "lora_alpha"),
                lora_dropout=self._lora_option(lora, "lora_dropout"),
                bias="lora_only",
            )
            self.model = get_peft_model(self.model, l_config)

    @staticmethod
    def _lora_option(lora, key):
        """Read one LoRA setting from a plain dict or from an attribute-style object."""
        if isinstance(lora, dict):
            return lora[key]
        return getattr(lora, key)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """
        Forward pass of the model.

        :param input_ids: Indices of input sequence tokens in the vocabulary.
        :param attention_mask: Mask to avoid performing attention on padding token indices.
        :param token_type_ids: Segment token indices to indicate first and second portions of the inputs.
        :return: A `TextEncoderOutput` holding ``text_embeds`` and ``last_hidden_state``.
        """
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, output_hidden_states=True)
        # NOTE(review): assumes outputs[0] is the last hidden state and outputs[1] a pooled
        # representation, which depends on the wrapped architecture; confirm for the models used.
        text_embeds = self.fc1(outputs[1])
        if self.last_hidden_state:
            last_hidden_state = self.fc2(outputs[0])
        else:
            last_hidden_state = outputs[0]
        return TextEncoderOutput(text_embeds=text_embeds, last_hidden_state=last_hidden_state)
148
+
149
+ class CLIPVisionEncoderOnlyConfig(PretrainedConfig):
150
+ model_type = "clip_custom_vision_model"
151
+
152
+ def __init__(self, model_name: str = None, pretrained: bool = True, frozen: bool = False, lora: dict = None, **kwargs):
153
+ self.model_name = model_name
154
+ self.pretrained = pretrained
155
+ self.frozen = frozen
156
+ self.lora = lora
157
+ super().__init__(**kwargs)
158
+
159
class CLIPVisionEncoderOnly(PreTrainedModel):
    """
    `CLIPVisionModelWithProjection` wrapped as a `PreTrainedModel`, with optional LoRA adapters.
    """

    config_class = CLIPVisionEncoderOnlyConfig

    def __init__(self, config):
        """
        Build the wrapped CLIP vision model described by ``config``.

        :param config: A `CLIPVisionEncoderOnlyConfig`. ``config.model_name`` selects the checkpoint,
            ``config.pretrained`` chooses between loading weights and instantiating the bare
            architecture, and a truthy ``config.lora`` enables LoRA.
        """
        super().__init__(config)

        if config.pretrained:
            self.model = CLIPVisionModelWithProjection.from_pretrained(config.model_name)
        else:
            # Architecture only: fetch the base config, weights stay randomly initialised.
            base_cfg = CLIPVisionConfig.from_pretrained(config.model_name)
            self.model = CLIPVisionModelWithProjection(base_cfg)

        # NOTE(review): config.frozen is never read in this class.
        if config.lora:
            l_config = LoraConfig(
                r=self._lora_option(config.lora, "lora_r"),
                lora_alpha=self._lora_option(config.lora, "lora_alpha"),
                target_modules=[
                    "k_proj",
                    "v_proj",
                    "q_proj",
                    "out_proj",
                    "fc1",
                    "fc2",
                    "visual_projection",
                    "text_projection"
                ],
                lora_dropout=self._lora_option(config.lora, "lora_dropout"),
                bias="lora_only",
            )
            self.model = get_peft_model(self.model, l_config)

    @staticmethod
    def _lora_option(lora, key):
        """Read one LoRA setting from a plain dict or from an attribute-style object."""
        if isinstance(lora, dict):
            return lora[key]
        return getattr(lora, key)

    def forward(self, data):
        """
        Forward pass of the model.

        :param data: Mapping of keyword arguments unpacked into the wrapped model.
        :return: The ``image_embeds`` attribute of the wrapped model's output.
        """
        return self.model(**data).image_embeds

    def parameters(self, recurse: bool = True):
        """
        Return the parameters of the wrapped model.

        :param recurse: Same meaning as in `torch.nn.Module.parameters`; accepted so that
            callers using the standard signature keep working.
        """
        return self.model.parameters(recurse=recurse)
204
+
205
+
206
class OpenCLIPVisionEncoderOnly(torch.nn.Module):
    """
    Vision tower (``.visual``) of an OpenCLIP model loaded from the Hugging Face Hub,
    with optional LoRA adapters.
    """

    def __init__(self, model_name: str, pretrained: bool = True, frozen: bool = False, lora: dict = None):
        """
        Load the OpenCLIP model and keep only its visual encoder.

        :param model_name: Hub repository id; loaded as ``hf-hub:{model_name}``.
        :param pretrained: Must be true; building without pretrained weights is not implemented.
        :param frozen: Accepted for interface parity; not used by this class.
        :param lora: Optional LoRA settings (``lora_r``, ``lora_alpha``, ``lora_dropout``).
        :raises NotImplementedError: If ``pretrained`` is false.
        """
        super().__init__()
        if not pretrained:
            # `raise NotImplemented` would fail with a TypeError; raise the real exception class.
            raise NotImplementedError("OpenCLIPVisionEncoderOnly only supports pretrained=True")

        model, _ = open_clip.create_model_from_pretrained(f"hf-hub:{model_name}")
        self.model = model.visual

        if lora:
            l_config = LoraConfig(
                r=self._lora_option(lora, "lora_r"),
                lora_alpha=self._lora_option(lora, "lora_alpha"),
                target_modules=[
                    "k_proj",
                    "v_proj",
                    "q_proj",
                    "out_proj",
                    "fc1",
                    "fc2",
                    "visual_projection",
                    "text_projection"
                ],
                lora_dropout=self._lora_option(lora, "lora_dropout"),
                bias="lora_only",
            )
            self.model = get_peft_model(self.model, l_config)

    @staticmethod
    def _lora_option(lora, key):
        """Read one LoRA setting from a plain dict or from an attribute-style object."""
        if isinstance(lora, dict):
            return lora[key]
        return getattr(lora, key)

    def forward(self, image):
        """
        Forward pass of the model.

        :param image: Input passed unchanged to the wrapped visual encoder.
        :return: Whatever the wrapped visual encoder returns.
        """
        return self.model(image)

    def save_pretrained(self, save_dir):
        """
        Write the wrapped model's state dict to ``save_dir`` as a safetensors file.

        :param save_dir: Target directory, as a ``str`` or a path-like object.
        """
        tensors = self.model.state_dict()
        # os.path.join accepts both str and os.PathLike, unlike the `/` operator.
        safetensors.torch.save_file(tensors, os.path.join(save_dir, HF_SAFE_WEIGHTS_NAME))
250
+
251
+ class CustomPriorModel(torch.nn.Module):
252
+ def __init__(self, in_hidden_state, out_hidden_state):
253
+ """
254
+ Initializes the Hugging Face text encoder for CLIP model, inheriting from PreTrainedModel.
255
+
256
+ :param model_name: The name or path of the pretrained model.
257
+ :param pretrained: Whether to load the pretrained weights.
258
+ """
259
+ super().__init__()
260
+ mid_hidden_state = max(in_hidden_state, out_hidden_state)
261
+
262
+ self.fc1 = torch.nn.Linear(in_hidden_state*2, mid_hidden_state)
263
+ self.relu = torch.nn.ReLU()
264
+ self.fc2 = torch.nn.Linear(mid_hidden_state, out_hidden_state)
265
+
266
+ def reinitialize_model(self):
267
+ for name, param in self.named_parameters():
268
+ if param.requires_grad:
269
+ if len(param.shape) > 1:
270
+ torch.nn.init.xavier_uniform_(param)
271
+ else:
272
+ if 'weight' in name:
273
+ torch.nn.init.normal_(param)
274
+ else:
275
+ torch.nn.init.zeros_(param)
276
+
277
+ def forward(self, feats):
278
+ """
279
+ Forward pass of the model.
280
+ """
281
+ return PriorTransformerOutput(predicted_image_embedding=self.fc2(self.relu(self.fc1(feats))))
282
+
283
+ def save_pretrained(self, save_dir):
284
+ pass
285
+ # tensors = self.state_dict()
286
+ # safetensors.torch.save_file(tensors, os.path.join(save_dir, HF_SAFE_WEIGHTS_NAME_PRIOR))
287
+
288
+
289
+ def test_text_model(register=False, upload=False):
290
+ # register the classes
291
+ if register:
292
+ AutoConfig.register("clip_custom_text_model", CLIPTextEncoderOnlyConfig)
293
+ AutoModel.register(CLIPTextEncoderOnlyConfig, CLIPTextEncoderOnly)
294
+ CLIPTextEncoderOnlyConfig.register_for_auto_class()
295
+ CLIPTextEncoderOnly.register_for_auto_class("AutoModel")
296
+
297
+ if upload:
298
+ # Initialize the model
299
+ model_name = "openai/clip-vit-base-patch32"
300
+ pretrained=True
301
+ lora=None
302
+
303
+ cfg = CLIPTextEncoderOnlyConfig(model_name=model_name, pretrained=pretrained, lora=lora)
304
+ model = CLIPTextEncoderOnly(cfg)
305
+ model.push_to_hub("test-text-hf-upload")
306
+
307
+ model = CLIPTextEncoderOnly.from_pretrained("mpatel57/test-text-hf-upload", force_download=True)
308
+
309
+ def test_vision_model(register=False, upload=False):
310
+ # register the classes
311
+ if register:
312
+ AutoConfig.register("clip_custom_vision_model", CLIPVisionEncoderOnlyConfig)
313
+ AutoModel.register(CLIPVisionEncoderOnlyConfig, CLIPVisionEncoderOnly)
314
+ CLIPVisionEncoderOnlyConfig.register_for_auto_class()
315
+ CLIPVisionEncoderOnly.register_for_auto_class("AutoModel")
316
+
317
+ if upload:
318
+ # Initialize the model
319
+ model_name = "openai/clip-vit-base-patch32"
320
+ pretrained=True
321
+ lora=None
322
+
323
+ cfg = CLIPVisionEncoderOnlyConfig(model_name=model_name, pretrained=pretrained, lora=lora)
324
+ model = CLIPVisionEncoderOnly(cfg)
325
+ model.push_to_hub("test-vision-hf-upload")
326
+
327
+ model = CLIPVisionEncoderOnly.from_pretrained("mpatel57/test-vision-hf-upload", force_download=True)
328
+
329
+
330
+ if __name__ == "__main__":
331
+ test_text_model(register=False, upload=True)
332
+ test_vision_model(register=False, upload=True)
vision-encoder/config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "CLIPVisionEncoderOnly"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "utils.CLIPVisionEncoderOnlyConfig",
7
+ "AutoModel": "utils.CLIPVisionEncoderOnly"
8
+ },
9
+ "frozen": false,
10
+ "lora": null,
11
+ "model_name": "openai/clip-vit-base-patch32",
12
+ "model_type": "clip_custom_vision_model",
13
+ "pretrained": false,
14
+ "torch_dtype": "float32",
15
+ "transformers_version": "4.40.1"
16
+ }
vision-encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dbb2d7358cbbb43f89e5b70ebc2ec03aa8b696251608995d4ea63536dd01d54f
3
+ size 351421984
vision-encoder/utils.py ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoConfig, AutoModel, PretrainedConfig, CLIPTextConfig, CLIPVisionConfig, PreTrainedModel, CLIPTextModelWithProjection, CLIPVisionModelWithProjection
2
+ from transformers.utils import ModelOutput
3
+ import torch
4
+ import open_clip
5
+ from dataclasses import dataclass
6
+ import safetensors.torch
7
+ from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
8
+ import os
9
+
10
+ HF_SAFE_WEIGHTS_NAME = "open_clip_model.safetensors"
11
+ HF_SAFE_WEIGHTS_NAME_PRIOR = "prior_model.safetensors"
12
+
13
+ @dataclass
14
+ class PriorTransformerOutput(ModelOutput):
15
+ """
16
+ The output of [`PriorTransformer`].
17
+
18
+ Args:
19
+ predicted_image_embedding (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`):
20
+ The predicted CLIP image embedding conditioned on the CLIP text embedding input.
21
+ """
22
+
23
+ predicted_image_embedding: torch.FloatTensor
24
+
25
+ @dataclass
26
+ class TextEncoderOutput(ModelOutput):
27
+ """
28
+ Output class for CLIPTextEncoderOnly model to store the outputs in a Hugging Face transformer style.
29
+
30
+ Attributes:
31
+ prompt_embeds (torch.Tensor): The embeddings of the input prompts.
32
+ last_hidden_states (torch.Tensor): The last hidden states from the model.
33
+ """
34
+ text_embeds: torch.FloatTensor = None
35
+ last_hidden_state: torch.FloatTensor = None
36
+
37
+ class CLIPTextEncoderOnlyConfig(CLIPTextConfig):
38
+ model_type = "clip_custom_text_model"
39
+
40
+ def __init__(self, model_name: str = None, pretrained: bool = True, frozen: bool = False, lora: dict = None, **kwargs):
41
+ self.model_name = model_name
42
+ self.pretrained = pretrained
43
+ self.frozen = frozen
44
+ self.lora = lora
45
+ super().__init__(**kwargs)
46
+
47
class CLIPTextEncoderOnly(PreTrainedModel):
    """
    `CLIPTextModelWithProjection` wrapped as a `PreTrainedModel`, with optional LoRA adapters.
    """

    config_class = CLIPTextEncoderOnlyConfig

    def __init__(self, config):
        """
        Build the wrapped CLIP text model described by ``config``.

        :param config: A `CLIPTextEncoderOnlyConfig`. ``config.model_name`` selects the checkpoint,
            ``config.pretrained`` chooses between loading weights and instantiating the bare
            architecture, and a truthy ``config.lora`` enables LoRA.
        """
        super().__init__(config)

        if config.pretrained:
            self.model = CLIPTextModelWithProjection.from_pretrained(config.model_name)
        else:
            # Architecture only: fetch the base config, weights stay randomly initialised.
            base_cfg = CLIPTextConfig.from_pretrained(config.model_name)
            self.model = CLIPTextModelWithProjection(base_cfg)

        # NOTE(review): config.frozen is never read in this class.
        if config.lora:
            l_config = LoraConfig(
                r=self._lora_option(config.lora, "lora_r"),
                lora_alpha=self._lora_option(config.lora, "lora_alpha"),
                target_modules=[
                    "k_proj",
                    "v_proj",
                    "q_proj",
                    "out_proj",
                    "fc1",
                    "fc2",
                    "visual_projection",
                    "text_projection"
                ],
                lora_dropout=self._lora_option(config.lora, "lora_dropout"),
                bias="lora_only",
            )
            self.model = get_peft_model(self.model, l_config)

    @staticmethod
    def _lora_option(lora, key):
        """Read one LoRA setting from a plain dict or from an attribute-style object."""
        # The config stores `lora` as a dict (and JSON round-trips yield a dict), while
        # callers may also pass an attribute-style config object; support both.
        if isinstance(lora, dict):
            return lora[key]
        return getattr(lora, key)

    def forward(self, input_ids, attention_mask=None, position_ids=None):
        """
        Forward pass of the model.

        :param input_ids: Indices of input sequence tokens in the vocabulary.
        :param attention_mask: Mask to avoid performing attention on padding token indices.
        :param position_ids: Optional position indices, passed through to the wrapped model.
        :return: A `TextEncoderOutput` holding ``text_embeds`` and ``last_hidden_state``.
        """
        # Hidden states are requested from the wrapped model but are not part of the returned output.
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, output_hidden_states=True)
        return TextEncoderOutput(text_embeds=outputs.text_embeds, last_hidden_state=outputs.last_hidden_state)
96
+
97
class CustomTextEncoderOnly(PreTrainedModel):
    """
    Text encoder built on any `AutoModel` checkpoint, with linear heads that project the
    encoder outputs to ``output_hidden_size``.
    """

    def __init__(self, model_name: str, output_hidden_size: int, pretrained: bool = True, frozen: bool = True, last_hidden_state: bool = False, lora: dict = None):
        """
        Build the wrapped encoder and its projection layers.

        :param model_name: The name or path of the pretrained model.
        :param output_hidden_size: Output width of the ``fc1`` / ``fc2`` projections.
        :param pretrained: Whether to load the pretrained weights.
        :param frozen: When loading pretrained weights, disable gradients on the encoder.
        :param last_hidden_state: Whether to also project the last hidden state through ``fc2``.
        :param lora: Optional LoRA settings (``lora_r``, ``lora_alpha``, ``lora_dropout``).
        """
        # Only the configuration is needed here; avoid loading the full weights a second time.
        config = AutoConfig.from_pretrained(model_name)
        super().__init__(config)
        self.last_hidden_state = last_hidden_state

        if pretrained:
            self.model = AutoModel.from_pretrained(model_name)
            if frozen:
                for param in self.model.parameters():
                    param.requires_grad = False
        else:
            # AutoModel cannot be instantiated directly; from_config builds the bare architecture.
            self.model = AutoModel.from_config(config)

        self.fc1 = torch.nn.Linear(self.model.config.hidden_size, output_hidden_size)
        if last_hidden_state:
            self.fc2 = torch.nn.Linear(self.model.config.hidden_size, output_hidden_size)

        if lora:
            l_config = LoraConfig(
                task_type=TaskType.FEATURE_EXTRACTION,
                r=self._lora_option(lora, "lora_r"),
                lora_alpha=self._lora_option(lora, "lora_alpha"),
                lora_dropout=self._lora_option(lora, "lora_dropout"),
                bias="lora_only",
            )
            self.model = get_peft_model(self.model, l_config)

    @staticmethod
    def _lora_option(lora, key):
        """Read one LoRA setting from a plain dict or from an attribute-style object."""
        if isinstance(lora, dict):
            return lora[key]
        return getattr(lora, key)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """
        Forward pass of the model.

        :param input_ids: Indices of input sequence tokens in the vocabulary.
        :param attention_mask: Mask to avoid performing attention on padding token indices.
        :param token_type_ids: Segment token indices to indicate first and second portions of the inputs.
        :return: A `TextEncoderOutput` holding ``text_embeds`` and ``last_hidden_state``.
        """
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, output_hidden_states=True)
        # NOTE(review): assumes outputs[0] is the last hidden state and outputs[1] a pooled
        # representation, which depends on the wrapped architecture; confirm for the models used.
        text_embeds = self.fc1(outputs[1])
        if self.last_hidden_state:
            last_hidden_state = self.fc2(outputs[0])
        else:
            last_hidden_state = outputs[0]
        return TextEncoderOutput(text_embeds=text_embeds, last_hidden_state=last_hidden_state)
148
+
149
class CLIPVisionEncoderOnlyConfig(PretrainedConfig):
    """Configuration holding the ``model_name``, ``pretrained``, ``frozen`` and ``lora`` settings of the vision encoder."""
    model_type = "clip_custom_vision_model"

    def __init__(self, model_name: str = None, pretrained: bool = True, frozen: bool = False, lora: dict = None, **kwargs):
        """
        Stores the custom fields, then hands the remaining keyword arguments to ``PretrainedConfig``.

        :param model_name: The name or path of the underlying model.
        :param pretrained: Whether to load the pretrained weights.
        :param frozen: Stored on the config as given.
        :param lora: Optional LoRA settings, stored on the config as given.
        """
        custom_fields = (
            ("model_name", model_name),
            ("pretrained", pretrained),
            ("frozen", frozen),
            ("lora", lora),
        )
        # The custom fields are assigned before the base initializer runs, in the same order as before.
        for field_name, field_value in custom_fields:
            setattr(self, field_name, field_value)
        super().__init__(**kwargs)
158
+
159
class CLIPVisionEncoderOnly(PreTrainedModel):
    """Wraps ``CLIPVisionModelWithProjection`` as a standalone ``PreTrainedModel`` that returns image embeddings."""
    config_class = CLIPVisionEncoderOnlyConfig

    def __init__(self, config):
        """
        Initializes the Hugging Face vision encoder for CLIP model, inheriting from PreTrainedModel.

        :param config: Configuration object; ``model_name``, ``pretrained`` and ``lora`` are read here.
            ``lora`` values are read by attribute (``lora_r``, ``lora_alpha``, ``lora_dropout``).
        """
        super().__init__(config)

        if config.pretrained:
            self.model = CLIPVisionModelWithProjection.from_pretrained(config.model_name)
        else:
            # Build the architecture from the named model's configuration without loading its weights.
            base_cfg = CLIPVisionConfig.from_pretrained(config.model_name)
            self.model = CLIPVisionModelWithProjection(base_cfg)

        if config.lora:
            l_config = LoraConfig(
                r=config.lora.lora_r,
                lora_alpha=config.lora.lora_alpha,
                target_modules=[
                    "k_proj",
                    "v_proj",
                    "q_proj",
                    "out_proj",
                    "fc1",
                    "fc2",
                    "visual_projection",
                    "text_projection"
                ],
                lora_dropout=config.lora.lora_dropout,
                bias="lora_only",
            )
            self.model = get_peft_model(self.model, l_config)

    def forward(self, data):
        """
        Forward pass of the model.

        :param data: Mapping of keyword arguments, unpacked into the wrapped model's call.
        :return: The ``image_embeds`` attribute of the wrapped model's output.
        """
        return self.model(**data).image_embeds

    def parameters(self, recurse: bool = True):
        """
        Returns the parameters of the wrapped model.

        :param recurse: Forwarded to the wrapped model's ``parameters``. Accepting it keeps this override
            call-compatible with ``torch.nn.Module.parameters``, so ``parameters(recurse=False)``
            no longer raises ``TypeError``.
        :return: Iterator over the wrapped model's parameters.
        """
        return self.model.parameters(recurse=recurse)
204
+
205
+
206
class OpenCLIPVisionEncoderOnly(torch.nn.Module):
    def __init__(self, model_name: str, pretrained: bool = True, frozen: bool = False, lora: dict = None):
        """
        Initializes a vision encoder from the ``visual`` sub-module of an open_clip model.

        :param model_name: Hub identifier, loaded through open_clip as ``hf-hub:<model_name>``.
        :param pretrained: Whether to load the pretrained weights. Only True is supported.
        :param frozen: Accepted but not used by this constructor.
        :param lora: Optional LoRA settings, read by attribute (``lora_r``, ``lora_alpha``, ``lora_dropout``).
        :raises NotImplementedError: If ``pretrained`` is False.
        """
        super().__init__()
        if pretrained:
            model, _ = open_clip.create_model_from_pretrained(f"hf-hub:{model_name}")
            model = model.visual
        else:
            # ``NotImplemented`` is a comparison sentinel, not an exception class;
            # raising it fails with a confusing TypeError instead of the intended error.
            raise NotImplementedError("OpenCLIPVisionEncoderOnly only supports pretrained=True")
        self.model = model

        if lora:
            l_config = LoraConfig(
                r=lora.lora_r,
                lora_alpha=lora.lora_alpha,
                target_modules=[
                    "k_proj",
                    "v_proj",
                    "q_proj",
                    "out_proj",
                    "fc1",
                    "fc2",
                    "visual_projection",
                    "text_projection"
                ],
                lora_dropout=lora.lora_dropout,
                bias="lora_only",
            )
            self.model = get_peft_model(self.model, l_config)

    def forward(self, image):
        """
        Forward pass of the model.

        :param image: Input passed unchanged to the wrapped vision model.
        :return: Whatever the wrapped vision model returns for ``image``.
        """
        return self.model(image)

    def save_pretrained(self, save_dir):
        """
        Writes the wrapped model's state dict to ``save_dir`` as a safetensors file.

        :param save_dir: Target directory, given as a ``str`` or a path object.
        """
        # Local import keeps this block self-contained; wrapping in Path lets plain
        # string directories work, where ``str / str`` would raise TypeError.
        from pathlib import Path

        tensors = self.model.state_dict()
        safetensors.torch.save_file(tensors, Path(save_dir) / HF_SAFE_WEIGHTS_NAME)
250
+
251
class CustomPriorModel(torch.nn.Module):
    def __init__(self, in_hidden_state, out_hidden_state):
        """
        Builds a two-layer perceptron: Linear -> ReLU -> Linear.

        :param in_hidden_state: Base input width; the first layer takes ``2 * in_hidden_state`` features.
        :param out_hidden_state: Width of the output.
        """
        super().__init__()
        # The hidden width is the larger of the two given sizes.
        width = max(in_hidden_state, out_hidden_state)

        self.fc1 = torch.nn.Linear(2 * in_hidden_state, width)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(width, out_hidden_state)

    def reinitialize_model(self):
        """
        Re-draws every trainable parameter in place; parameters with ``requires_grad=False`` are left alone.

        Parameters with more than one dimension get Xavier-uniform values, one-dimensional parameters
        whose name contains 'weight' get normal values, and all remaining ones are zeroed.
        """
        for name, param in self.named_parameters():
            if not param.requires_grad:
                continue
            if param.dim() > 1:
                torch.nn.init.xavier_uniform_(param)
            elif 'weight' in name:
                torch.nn.init.normal_(param)
            else:
                torch.nn.init.zeros_(param)

    def forward(self, feats):
        """
        Forward pass of the model.

        :param feats: Input fed to the first linear layer.
        :return: PriorTransformerOutput whose ``predicted_image_embedding`` is the MLP output.
        """
        hidden = self.relu(self.fc1(feats))
        return PriorTransformerOutput(predicted_image_embedding=self.fc2(hidden))

    def save_pretrained(self, save_dir):
        """
        Currently does nothing and returns None.

        :param save_dir: Ignored.
        """
        # NOTE: the safetensors export of ``self.state_dict()`` to
        # ``save_dir/HF_SAFE_WEIGHTS_NAME_PRIOR`` is disabled in the source.
        return None
287
+
288
+
289
def test_text_model(register=False, upload=False):
    """
    Manual check for the custom CLIP text encoder: optional Auto-class registration and an
    optional push to the Hugging Face Hub followed by a reload.

    :param register: When True, registers the config/model pair with ``AutoConfig``/``AutoModel``
        and marks both classes for auto-class export.
    :param upload: When True, builds the encoder from "openai/clip-vit-base-patch32", pushes it to
        "test-text-hf-upload" and reloads it from "mpatel57/test-text-hf-upload".
    """
    # register the classes
    if register:
        AutoConfig.register("clip_custom_text_model", CLIPTextEncoderOnlyConfig)
        AutoModel.register(CLIPTextEncoderOnlyConfig, CLIPTextEncoderOnly)
        CLIPTextEncoderOnlyConfig.register_for_auto_class()
        CLIPTextEncoderOnly.register_for_auto_class("AutoModel")

    if upload:
        # Initialize the model
        model_name = "openai/clip-vit-base-patch32"
        pretrained=True
        lora=None

        cfg = CLIPTextEncoderOnlyConfig(model_name=model_name, pretrained=pretrained, lora=lora)
        model = CLIPTextEncoderOnly(cfg)
        model.push_to_hub("test-text-hf-upload")

        # Reload with force_download=True so the files are fetched again rather than served from cache.
        # NOTE(review): the original indentation was not recoverable in this view; this call is
        # assumed to belong to the ``upload`` branch - confirm.
        model = CLIPTextEncoderOnly.from_pretrained("mpatel57/test-text-hf-upload", force_download=True)
308
+
309
def test_vision_model(register=False, upload=False):
    """
    Manual check for the custom CLIP vision encoder: optional Auto-class registration and an
    optional push to the Hugging Face Hub followed by a reload.

    :param register: When True, registers the config/model pair with ``AutoConfig``/``AutoModel``
        and marks both classes for auto-class export.
    :param upload: When True, builds the encoder from "openai/clip-vit-base-patch32", pushes it to
        "test-vision-hf-upload" and reloads it from "mpatel57/test-vision-hf-upload".
    """
    # register the classes
    if register:
        AutoConfig.register("clip_custom_vision_model", CLIPVisionEncoderOnlyConfig)
        AutoModel.register(CLIPVisionEncoderOnlyConfig, CLIPVisionEncoderOnly)
        CLIPVisionEncoderOnlyConfig.register_for_auto_class()
        CLIPVisionEncoderOnly.register_for_auto_class("AutoModel")

    if upload:
        # Initialize the model
        model_name = "openai/clip-vit-base-patch32"
        pretrained=True
        lora=None

        cfg = CLIPVisionEncoderOnlyConfig(model_name=model_name, pretrained=pretrained, lora=lora)
        model = CLIPVisionEncoderOnly(cfg)
        model.push_to_hub("test-vision-hf-upload")

        # Reload with force_download=True so the files are fetched again rather than served from cache.
        # NOTE(review): the original indentation was not recoverable in this view; this call is
        # assumed to belong to the ``upload`` branch - confirm.
        model = CLIPVisionEncoderOnly.from_pretrained("mpatel57/test-vision-hf-upload", force_download=True)
328
+
329
+
330
if __name__ == "__main__":
    # Script entry point: run both checks with uploading enabled and Auto-class registration skipped.
    test_text_model(register=False, upload=True)
    test_vision_model(register=False, upload=True)