kkr5155 committed on
Commit e968589 · verified · 1 Parent(s): bc501b8

Upload 22 files
app.py ADDED
@@ -0,0 +1,227 @@
import gradio as gr
import spaces
import time
import os
import torch
from PIL import Image
from threading import Thread
from transformers import TextIteratorStreamer, AutoConfig, AutoModelForCausalLM
from constants import (
    IMAGE_TOKEN_INDEX,
    DEFAULT_IMAGE_TOKEN,
    DEFAULT_IM_START_TOKEN,
    DEFAULT_IM_END_TOKEN,
)
from conversation import conv_templates
from eval_utils import load_maya_model
from utils import disable_torch_init
from mm_utils import tokenizer_image_token, process_images
from huggingface_hub._login import _login

# Import LLaVA modules to register model types
from model import *
from model.language_model.llava_cohere import LlavaCohereForCausalLM, LlavaCohereConfig

# Register model type and config
AutoConfig.register("llava_cohere", LlavaCohereConfig)
AutoModelForCausalLM.register(LlavaCohereConfig, LlavaCohereForCausalLM)

hf_token = os.getenv("hf_token")
_login(token=hf_token, add_to_git_credential=False)

# Global Variables
MODEL_BASE = "CohereForAI/aya-23-8B"
MODEL_PATH = "maya-multimodal/maya"
MODE = "finetuned"

def load_model():
    """Load the Maya model and required components"""
    model, tokenizer, image_processor, _ = load_maya_model(
        MODEL_BASE, MODEL_PATH, None, MODE
    )
    model = model.cuda()
    model.eval()
    return model, tokenizer, image_processor

# Load model globally
print("Loading model...")
model, tokenizer, image_processor = load_model()
print("Model loaded successfully!")

def validate_image_file(image_path):
    """Validate that the image file exists and is in a supported format."""
    if not os.path.isfile(image_path):
        raise gr.Error(f"Error: File {image_path} does not exist.")

    try:
        with Image.open(image_path) as img:
            img.verify()
        return True
    except (IOError, SyntaxError) as e:
        raise gr.Error(f"Error: {image_path} is not a valid image file. {e}")

@spaces.GPU
def process_chat_stream(message, history):
    print(message)
    print("History:", history)
    image = None  # Initialize image variable first

    # First try to get image from current message
    if message.get("files", []):
        current_files = message["files"]
        if current_files:
            last_file = current_files[-1]
            image = last_file["path"] if isinstance(last_file, dict) else last_file

    # If no image in current message, try to get from history
    if image is None and history:
        for hist in reversed(history):
            print("Processing history item:", hist)
            if isinstance(hist["content"], tuple):
                image = hist["content"][0]
                break
            elif isinstance(hist["content"], dict) and hist["content"].get("files"):
                hist_files = hist["content"]["files"]
                if hist_files:
                    first_file = hist_files[0]
                    image = first_file["path"] if isinstance(first_file, dict) else first_file
                    break

    # Check if we found an image
    if image is None:
        raise gr.Error("Please upload an image to start the conversation.")

    # Validate and process image
    validate_image_file(image)
    image = Image.open(image).convert("RGB")

    # Process image for the model
    image_tensor = process_images([image], image_processor, model.config)
    if image_tensor is None:
        raise gr.Error("Failed to process image")

    image_tensor = image_tensor.cuda()

    # Prepare conversation
    conv = conv_templates["aya"].copy()

    # Add conversation history
    for hist in history:
        # Handle user messages
        if hist["role"] == "user":
            # Extract text content based on format
            if isinstance(hist["content"], str):
                human_text = hist["content"]
            elif isinstance(hist["content"], tuple):
                human_text = hist["content"][1] if len(hist["content"]) > 1 else ""
            else:
                human_text = hist["content"]
            conv.append_message(conv.roles[0], human_text)

        # Handle assistant messages
        elif hist["role"] == "assistant":
            conv.append_message(conv.roles[1], hist["content"])

    # Format current message with proper image token placement
    current_message = message["text"]
    if not history:
        if model.config.mm_use_im_start_end:
            current_message = f"{DEFAULT_IM_START_TOKEN}{DEFAULT_IMAGE_TOKEN}{DEFAULT_IM_END_TOKEN}\n{current_message}"
        else:
            current_message = f"{DEFAULT_IMAGE_TOKEN}\n{current_message}"

    # Add current message to conversation
    conv.append_message(conv.roles[0], current_message)
    conv.append_message(conv.roles[1], None)

    # Get prompt and ensure input_ids are properly created
    prompt = conv.get_prompt()
    # print("PROMPT: ", prompt)

    try:
        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
        if input_ids is None:
            raise ValueError("Tokenization returned None")

        # Ensure input_ids is 2D tensor
        if len(input_ids.shape) == 1:
            input_ids = input_ids.unsqueeze(0)
        input_ids = input_ids.cuda()

        # Validate vision tower and image tensor before starting generation
        if not hasattr(model, 'get_vision_tower') or model.get_vision_tower() is None:
            raise ValueError("Model's vision tower is not properly initialized")

        if image_tensor is None:
            raise ValueError("Image tensor is None")

        # Setup streamer and generation
        streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True)

        generation_kwargs = {
            "inputs": input_ids,
            "images": image_tensor,
            "image_sizes": [image.size],
            "streamer": streamer,
            "temperature": 0.3,
            "do_sample": True,
            "top_p": 0.9,
            "num_beams": 1,
            "max_new_tokens": 4096,
            "use_cache": True
        }

        def generate_with_error_handling():
            try:
                model.generate(**generation_kwargs)
            except Exception as e:
                import traceback
                error_msg = f"Generation error: {str(e)}\nTraceback:\n{''.join(traceback.format_exc())}"
                raise gr.Error(error_msg)

        thread = Thread(target=generate_with_error_handling)
        thread.start()

    except Exception as e:
        error_msg = f"Setup error: {str(e)}"
        import traceback
        error_msg += f"\nTraceback:\n{''.join(traceback.format_exc())}"
        raise gr.Error(error_msg)

    partial_message = ""
    for new_token in streamer:
        partial_message += new_token
        time.sleep(0.1)
        yield {"role": "assistant", "content": partial_message}


# Create Gradio interface
chatbot = gr.Chatbot(
    show_label=False,
    height=450,
    show_share_button=False,
    show_copy_button=False,
    avatar_images=None,
    container=True,
    render_markdown=True,
    scale=1,
    type="messages"
)
chat_input = gr.MultimodalTextbox(interactive=True, file_types=["image"], placeholder="Enter message or upload file...", show_label=False)
with gr.Blocks(fill_height=True, ) as demo:
    gr.ChatInterface(
        fn=process_chat_stream,
        title="Maya: Multilingual Multimodal Model",
        examples=[{"text": "Describe this photo in detail.", "files": ["./asian_food.jpg"]},
                  {"text": "What is the name of this famous sight in the photo?", "files": ["./hawaii.jpg"]}],
        description="Upload an image and start chatting about it, or simply try one of the examples below. If you don't upload an image, you will receive an error. [Read the research paper](https://huggingface.co/papers/2412.07112)\n\nTeam 💚 Maya",
        stop_btn="Stop Generation",
        multimodal=True,
        textbox=chat_input,
        chatbot=chatbot,
    )

if __name__ == "__main__":
    demo.queue(api_open=False)
    demo.launch(show_api=False, share=False)
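A note on local testing (not part of the uploaded files): the streaming handler above can be exercised without the Gradio UI. The snippet below is a rough sketch under stated assumptions — that app.py imports cleanly on a GPU machine, that the hf_token environment variable is set, and that the bundled hawaii.jpg sits next to the script.

# Hypothetical smoke test; every name here comes from app.py above.
from app import process_chat_stream

message = {"text": "What is the name of this famous sight in the photo?",
           "files": ["./hawaii.jpg"]}
last = None
for chunk in process_chat_stream(message, history=[]):
    last = chunk  # each yield carries the accumulated partial answer so far
print(last["content"] if last else "(no output)")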
asian_food.jpg ADDED
constants.py ADDED
@@ -0,0 +1,13 @@
CONTROLLER_HEART_BEAT_EXPIRATION = 30
WORKER_HEART_BEAT_INTERVAL = 15

LOGDIR = "."

# Model Constants
IGNORE_INDEX = -100
IMAGE_TOKEN_INDEX = -200
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
IMAGE_PLACEHOLDER = "<image-placeholder>"
conversation.py ADDED
@@ -0,0 +1,409 @@
import dataclasses
from enum import auto, Enum
from typing import List, Tuple
import base64
from io import BytesIO
from PIL import Image


class SeparatorStyle(Enum):
    """Different separator style."""
    SINGLE = auto()
    TWO = auto()
    MPT = auto()
    PLAIN = auto()
    LLAMA_2 = auto()


@dataclasses.dataclass
class Conversation:
    """A class that keeps all conversation history."""
    system: str
    roles: List[str]
    messages: List[List[str]]
    offset: int
    sep_style: SeparatorStyle = SeparatorStyle.SINGLE
    sep: str = "###"
    sep2: str = None
    version: str = "Unknown"

    skip_next: bool = False

    def get_prompt(self):
        messages = self.messages
        if len(messages) > 0 and type(messages[0][1]) is tuple:
            messages = self.messages.copy()
            init_role, init_msg = messages[0].copy()
            init_msg = init_msg[0].replace("<image>", "").strip()
            if 'mmtag' in self.version:
                messages[0] = (init_role, init_msg)
                messages.insert(0, (self.roles[0], "<Image><image></Image>"))
                messages.insert(1, (self.roles[1], "Received."))
            else:
                messages[0] = (init_role, "<image>\n" + init_msg)

        if self.sep_style == SeparatorStyle.SINGLE:
            ret = self.system + self.sep
            for role, message in messages:
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += role + ": " + message + self.sep
                else:
                    ret += role + ":"
        elif self.sep_style == SeparatorStyle.TWO:
            seps = [self.sep, self.sep2]
            ret = self.system + seps[0]
            for i, (role, message) in enumerate(messages):
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += role + ": " + message + seps[i % 2]
                else:
                    ret += role + ":"
        elif self.sep_style == SeparatorStyle.MPT:
            ret = self.system + self.sep
            for role, message in messages:
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += role + message + self.sep
                else:
                    ret += role
        elif self.sep_style == SeparatorStyle.LLAMA_2:
            wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n" if len(msg) > 0 else msg
            wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
            ret = ""

            for i, (role, message) in enumerate(messages):
                if i == 0:
                    assert message, "first message should not be none"
                    assert role == self.roles[0], "first message should come from user"
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    if i == 0: message = wrap_sys(self.system) + message
                    if i % 2 == 0:
                        message = wrap_inst(message)
                        ret += self.sep + message
                    else:
                        ret += " " + message + " " + self.sep2
                else:
                    ret += ""
            ret = ret.lstrip(self.sep)
        elif self.sep_style == SeparatorStyle.PLAIN:
            seps = [self.sep, self.sep2]
            ret = self.system
            for i, (role, message) in enumerate(messages):
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += message + seps[i % 2]
                else:
                    ret += ""
        else:
            raise ValueError(f"Invalid style: {self.sep_style}")

        return ret

    def append_message(self, role, message):
        self.messages.append([role, message])

    def process_image(self, image, image_process_mode, return_pil=False, image_format='PNG', max_len=1344, min_len=672):
        if image_process_mode == "Pad":
            def expand2square(pil_img, background_color=(122, 116, 104)):
                width, height = pil_img.size
                if width == height:
                    return pil_img
                elif width > height:
                    result = Image.new(pil_img.mode, (width, width), background_color)
                    result.paste(pil_img, (0, (width - height) // 2))
                    return result
                else:
                    result = Image.new(pil_img.mode, (height, height), background_color)
                    result.paste(pil_img, ((height - width) // 2, 0))
                    return result
            image = expand2square(image)
        elif image_process_mode in ["Default", "Crop"]:
            pass
        elif image_process_mode == "Resize":
            image = image.resize((336, 336))
        else:
            raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
        if max(image.size) > max_len:
            max_hw, min_hw = max(image.size), min(image.size)
            aspect_ratio = max_hw / min_hw
            shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
            longest_edge = int(shortest_edge * aspect_ratio)
            W, H = image.size
            if H > W:
                H, W = longest_edge, shortest_edge
            else:
                H, W = shortest_edge, longest_edge
            image = image.resize((W, H))
        if return_pil:
            return image
        else:
            buffered = BytesIO()
            image.save(buffered, format=image_format)
            img_b64_str = base64.b64encode(buffered.getvalue()).decode()
            return img_b64_str

    def get_images(self, return_pil=False):
        images = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                if type(msg) is tuple:
                    msg, image, image_process_mode = msg
                    image = self.process_image(image, image_process_mode, return_pil=return_pil)
                    images.append(image)
        return images

    def to_gradio_chatbot(self):
        ret = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                if type(msg) is tuple:
                    msg, image, image_process_mode = msg
                    img_b64_str = self.process_image(
                        image, "Default", return_pil=False,
                        image_format='JPEG')
                    img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
                    msg = img_str + msg.replace('<image>', '').strip()
                    ret.append([msg, None])
                else:
                    ret.append([msg, None])
            else:
                ret[-1][-1] = msg
        return ret

    def copy(self):
        return Conversation(
            system=self.system,
            roles=self.roles,
            messages=[[x, y] for x, y in self.messages],
            offset=self.offset,
            sep_style=self.sep_style,
            sep=self.sep,
            sep2=self.sep2,
            version=self.version)

    def dict(self):
        if len(self.get_images()) > 0:
            return {
                "system": self.system,
                "roles": self.roles,
                "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages],
                "offset": self.offset,
                "sep": self.sep,
                "sep2": self.sep2,
            }
        return {
            "system": self.system,
            "roles": self.roles,
            "messages": self.messages,
            "offset": self.offset,
            "sep": self.sep,
            "sep2": self.sep2,
        }


conv_vicuna_v0 = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("Human", "Assistant"),
    messages=(
        ("Human", "What are the key differences between renewable and non-renewable energy sources?"),
        ("Assistant",
         "Renewable energy sources are those that can be replenished naturally in a relatively "
         "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
         "Non-renewable energy sources, on the other hand, are finite and will eventually be "
         "depleted, such as coal, oil, and natural gas. Here are some key differences between "
         "renewable and non-renewable energy sources:\n"
         "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
         "energy sources are finite and will eventually run out.\n"
         "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
         "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
         "and other negative effects.\n"
         "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
         "have lower operational costs than non-renewable sources.\n"
         "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
         "locations than non-renewable sources.\n"
         "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
         "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
         "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
         "non-renewable sources are not, and their depletion can lead to economic and social instability.\n")
    ),
    offset=2,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
)

conv_vicuna_v1 = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the user's questions.",
    roles=("USER", "ASSISTANT"),
    version="v1",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
)

conv_llama_2 = Conversation(
    system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""",
    roles=("USER", "ASSISTANT"),
    version="llama_v2",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.LLAMA_2,
    sep="<s>",
    sep2="</s>",
)

conv_llava_llama_2 = Conversation(
    system="You are a helpful language and vision assistant. "
           "You are able to understand the visual content that the user provides, "
           "and assist the user with a variety of tasks using natural language.",
    roles=("USER", "ASSISTANT"),
    version="llama_v2",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.LLAMA_2,
    sep="<s>",
    sep2="</s>",
)

conv_mpt = Conversation(
    system="""<|im_start|>system
A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
    version="mpt",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.MPT,
    sep="<|im_end|>",
)

conv_llava_plain = Conversation(
    system="",
    roles=("", ""),
    messages=(
    ),
    offset=0,
    sep_style=SeparatorStyle.PLAIN,
    sep="\n",
)

conv_llava_v0 = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("Human", "Assistant"),
    messages=(
    ),
    offset=0,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
)

conv_llava_v0_mmtag = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
           "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
           "The visual content will be provided with the following format: <Image>visual content</Image>.",
    roles=("Human", "Assistant"),
    messages=(
    ),
    offset=0,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
    version="v0_mmtag",
)

conv_llava_v1 = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("USER", "ASSISTANT"),
    version="v1",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
)

conv_llava_v1_mmtag = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
           "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
           "The visual content will be provided with the following format: <Image>visual content</Image>.",
    roles=("USER", "ASSISTANT"),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
    version="v1_mmtag",
)

conv_mistral_instruct = Conversation(
    system="",
    roles=("USER", "ASSISTANT"),
    version="llama_v2",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.LLAMA_2,
    sep="",
    sep2="</s>",
)

conv_chatml_direct = Conversation(
    system="""<|im_start|>system
Answer the questions.""",
    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
    version="mpt",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.MPT,
    sep="<|im_end|>",
)

conv_aya = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the user's questions.",
    roles=("USER", "ASSISTANT"),
    version="v1",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="<|END_OF_TURN_TOKEN|>",
)

default_conversation = conv_vicuna_v1
conv_templates = {
    "default": conv_vicuna_v0,
    "v0": conv_vicuna_v0,
    "v1": conv_vicuna_v1,
    "vicuna_v1": conv_vicuna_v1,
    "llama_2": conv_llama_2,
    "mistral_instruct": conv_mistral_instruct,
    "chatml_direct": conv_chatml_direct,
    "mistral_direct": conv_chatml_direct,

    "plain": conv_llava_plain,
    "v0_plain": conv_llava_plain,
    "llava_v0": conv_llava_v0,
    "v0_mmtag": conv_llava_v0_mmtag,
    "llava_v1": conv_llava_v1,
    "v1_mmtag": conv_llava_v1_mmtag,
    "llava_llama_2": conv_llava_llama_2,

    "mpt": conv_mpt,
    "aya": conv_aya
}


if __name__ == "__main__":
    print(default_conversation.get_prompt())
eval_utils.py ADDED
@@ -0,0 +1,227 @@
'''
Cherry picked from Roshan's PR https://github.com/nahidalam/LLaVA/blob/1ecc141d7f20f16518f38a0d99320268305c17c3/llava/eval/maya/eval_utils.py
'''

import os
import sys
import torch
import requests
from io import BytesIO
from PIL import Image


from transformers import AutoTokenizer, AutoConfig, TextStreamer
from transformers.models.cohere.tokenization_cohere_fast import CohereTokenizerFast
from model.language_model.llava_cohere import LlavaCohereForCausalLM, LlavaCohereConfig
from constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_IMAGE_PATCH_TOKEN
from conversation import conv_templates, SeparatorStyle
from mm_utils import process_images, tokenizer_image_token, get_model_name_from_path

from typing import Optional, Literal


def load_maya_model(model_base: str, model_path : str, projector_path : Optional[str] = None, mode = Literal['pretrained','finetuned']):

    """ Function that helps load a trained Maya model

    Trained Maya model can be of two flavors :
    1. Pretrained : The model has only gone through pretraining and the changes are restricted to the projector layer
    2. Finetuned : Model has gone through instruction finetuning post pretraining stage. This affects the whole model

    This is a replication of the load_pretrained_model function from llava.model.builder thats specific to Cohere/Maya

    Args:
        model_base : Path of the base LLM model in HF. Eg: 'CohereForAI/aya-23-8B', 'meta-llama/Meta-Llama-3-8B-Instruct'.
                     This is used to instantiate the tokenizer and the model (in case of loading the pretrained model)
        model_path : Path of the trained model repo in HF. Eg : 'nahidalam/Maya'
                     This is used to load the config file. So this path/directory should have the config.json file
                     For the finetuned model, this is used to load the final model weights as well
        projector_path : For the pretrained model, this represents the path to the local directory which holds the mm_projector.bin file
        model : Helps specify if this is loading a pretrained only model or a finetuned model

    Returns:
        model: LlavaCohereForCausalLM object
        tokenizer: CohereTokenizerFast object
        image_processor:
        content_len:
    """

    device_map = 'auto'
    kwargs = {"device_map": device_map}
    kwargs['torch_dtype'] = torch.float32
    # kwargs['attn_implementation'] = 'flash_attention_2'

    ## Instantiating tokenizer and model base
    tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=True)
    cfg_pretrained = LlavaCohereConfig.from_pretrained(model_path)

    if mode == 'pretrained':
        model = LlavaCohereForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)

        ## Loading Projector layer weights
        mm_projector_weights = torch.load(projector_path, map_location='cpu')
        mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()}
        model.load_state_dict(mm_projector_weights, strict=False)
    else:
        # Load model with ignore_mismatched_sizes to handle vision tower weights
        model = LlavaCohereForCausalLM.from_pretrained(
            model_path,
            config=cfg_pretrained,
            ignore_mismatched_sizes=True,  # Add this to handle vision tower weights
            **kwargs
        )

    ## Loading image processor
    image_processor = None

    mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
    mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
    if mm_use_im_patch_token:
        tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
    if mm_use_im_start_end:
        tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
    model.resize_token_embeddings(len(tokenizer))

    # Get and load vision tower
    vision_tower = model.get_vision_tower()
    if vision_tower is None:
        raise ValueError("Vision tower not found in model config")

    print(f"Loading vision tower... Is loaded: {vision_tower.is_loaded}")
    if not vision_tower.is_loaded:
        try:
            vision_tower.load_model()
            print("Vision tower loaded successfully")
        except Exception as e:
            print(f"Error loading vision tower: {str(e)}")
            raise

    if device_map != 'auto':
        vision_tower.to(device=device_map, dtype=torch.float16)
    image_processor = vision_tower.image_processor

    if hasattr(model.config, "max_sequence_length"):
        context_len = model.config.max_sequence_length
    else:
        context_len = 2048

    #maya = MayaModel(model, tokenizer, image_processor, context_len)

    return model, tokenizer, image_processor, context_len


class MayaModel(object):

    def __init__(self, model : LlavaCohereForCausalLM, tokenizer : CohereTokenizerFast, image_processor, context_length):
        self.model = model
        self.tokenizer = tokenizer
        self.image_processor = image_processor
        self.context_length = context_length

    def validate_inputs(self):
        """
        Method to validate the inputs
        """
        pass




def load_image(image_input):
    """
    Convert various image inputs to a PIL Image object.

    :param image_input: Can be a URL string, a file path string, or image bytes
    :return: PIL Image object
    """
    try:
        if isinstance(image_input, str):
            if image_input.startswith(('http://', 'https://')):
                # Input is a URL
                response = requests.get(image_input)
                response.raise_for_status()  # Raise an exception for bad responses
                return Image.open(BytesIO(response.content))
            elif os.path.isfile(image_input):
                # Input is a file path
                return Image.open(image_input)
            else:
                raise ValueError("Invalid input: string is neither a valid URL nor a file path")
        elif isinstance(image_input, bytes):
            # Input is bytes
            return Image.open(BytesIO(image_input))
        else:
            raise ValueError("Invalid input type. Expected URL string, file path string, or bytes.")
    except requests.RequestException as e:
        raise ValueError(f"Error fetching image from URL: {e}")
    except IOError as e:
        raise ValueError(f"Error opening image file: {e}")
    except Exception as e:
        raise ValueError(f"An unexpected error occurred: {e}")




def get_single_sample_prediction(maya_model, image_file, user_question, temperature = 0.0, max_new_tokens = 100, conv_mode = 'aya'):
    """Generates the prediction for a single image-user question pair.

    Args:
        model (MayaModel): Trained Maya model
        image_file : One of the following: Online image url, local image path, or image bytes
        user_question (str): Question to be shared with LLM
        temperature (float, optional): Temperature param for LLMs. Defaults to 0.0.
        max_new_tokens (int, optional): Max new number of tokens generated. Defaults to 100
        conv_model (str, optional): Conversation model to be used. Defaults to 'aya'.

    Returns:
        output (str): Model's response to user question
    """


    conv = conv_templates[conv_mode].copy()
    roles = conv.roles
    model = maya_model.model
    tokenizer = maya_model.tokenizer
    image_processor = maya_model.image_processor

    image = load_image(image_file)
    image_size = image.size

    image_tensor = process_images([image], image_processor, model.config)
    if type(image_tensor) is list:
        image_tensor = [image.to(model.device, dtype=torch.float16) for image in image_tensor]
    else:
        image_tensor = image_tensor.to(model.device, dtype=torch.float16)

    inp = user_question

    if image is not None:
        # first message
        if model.config.mm_use_im_start_end:
            inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + inp
        else:
            inp = DEFAULT_IMAGE_TOKEN + '\n' + inp
        # image = None

    conv.append_message(conv.roles[0], inp)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(model.device)
    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
    keywords = [stop_str]
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=image_tensor,
            image_sizes=[image_size],
            do_sample=True if temperature > 0 else False,
            temperature=temperature,
            max_new_tokens=max_new_tokens,
            streamer=streamer,
            use_cache=True)

    outputs = tokenizer.decode(output_ids[0]).strip()

    return outputs
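Since the docstrings above spell out the intended call pattern, here is a minimal usage sketch rather than a definitive recipe: it assumes the Hugging Face repos referenced in app.py are reachable, that a GPU with enough memory is available, and the image path is purely illustrative.

# Minimal sketch of the eval_utils API; values mirror the defaults used in app.py.
model, tokenizer, image_processor, context_len = load_maya_model(
    "CohereForAI/aya-23-8B",   # model_base
    "maya-multimodal/maya",    # model_path
    None,                      # projector_path (not needed in finetuned mode)
    "finetuned",
)
maya = MayaModel(model, tokenizer, image_processor, context_len)
answer = get_single_sample_prediction(
    maya, "./hawaii.jpg", "What is shown in this photo?",
    temperature=0.0, max_new_tokens=100, conv_mode="aya",
)
print(answer)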
hawaii.jpg ADDED
mm_utils.py ADDED
@@ -0,0 +1,247 @@
from PIL import Image
from io import BytesIO
import base64
import torch
import math
import ast

from transformers import StoppingCriteria
from constants import IMAGE_TOKEN_INDEX


def select_best_resolution(original_size, possible_resolutions):
    """
    Selects the best resolution from a list of possible resolutions based on the original size.

    Args:
        original_size (tuple): The original size of the image in the format (width, height).
        possible_resolutions (list): A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].

    Returns:
        tuple: The best fit resolution in the format (width, height).
    """
    original_width, original_height = original_size
    best_fit = None
    max_effective_resolution = 0
    min_wasted_resolution = float('inf')

    for width, height in possible_resolutions:
        scale = min(width / original_width, height / original_height)
        downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
        effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
        wasted_resolution = (width * height) - effective_resolution

        if effective_resolution > max_effective_resolution or (effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution):
            max_effective_resolution = effective_resolution
            min_wasted_resolution = wasted_resolution
            best_fit = (width, height)

    return best_fit


def resize_and_pad_image(image, target_resolution):
    """
    Resize and pad an image to a target resolution while maintaining aspect ratio.

    Args:
        image (PIL.Image.Image): The input image.
        target_resolution (tuple): The target resolution (width, height) of the image.

    Returns:
        PIL.Image.Image: The resized and padded image.
    """
    original_width, original_height = image.size
    target_width, target_height = target_resolution

    scale_w = target_width / original_width
    scale_h = target_height / original_height

    if scale_w < scale_h:
        new_width = target_width
        new_height = min(math.ceil(original_height * scale_w), target_height)
    else:
        new_height = target_height
        new_width = min(math.ceil(original_width * scale_h), target_width)

    # Resize the image
    resized_image = image.resize((new_width, new_height))

    new_image = Image.new('RGB', (target_width, target_height), (0, 0, 0))
    paste_x = (target_width - new_width) // 2
    paste_y = (target_height - new_height) // 2
    new_image.paste(resized_image, (paste_x, paste_y))

    return new_image


def divide_to_patches(image, patch_size):
    """
    Divides an image into patches of a specified size.

    Args:
        image (PIL.Image.Image): The input image.
        patch_size (int): The size of each patch.

    Returns:
        list: A list of PIL.Image.Image objects representing the patches.
    """
    patches = []
    width, height = image.size
    for i in range(0, height, patch_size):
        for j in range(0, width, patch_size):
            box = (j, i, j + patch_size, i + patch_size)
            patch = image.crop(box)
            patches.append(patch)

    return patches


def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
    """
    Calculate the shape of the image patch grid after the preprocessing for images of any resolution.

    Args:
        image_size (tuple): The size of the input image in the format (width, height).
        grid_pinpoints (str): A string representation of a list of possible resolutions.
        patch_size (int): The size of each image patch.

    Returns:
        tuple: The shape of the image patch grid in the format (width, height).
    """
    if type(grid_pinpoints) is list:
        possible_resolutions = grid_pinpoints
    else:
        possible_resolutions = ast.literal_eval(grid_pinpoints)
    width, height = select_best_resolution(image_size, possible_resolutions)
    return width // patch_size, height // patch_size


def process_anyres_image(image, processor, grid_pinpoints):
    """
    Process an image with variable resolutions.

    Args:
        image (PIL.Image.Image): The input image to be processed.
        processor: The image processor object.
        grid_pinpoints (str): A string representation of a list of possible resolutions.

    Returns:
        torch.Tensor: A tensor containing the processed image patches.
    """
    if type(grid_pinpoints) is list:
        possible_resolutions = grid_pinpoints
    else:
        possible_resolutions = ast.literal_eval(grid_pinpoints)
    best_resolution = select_best_resolution(image.size, possible_resolutions)
    image_padded = resize_and_pad_image(image, best_resolution)

    patches = divide_to_patches(image_padded, processor.crop_size['height'])

    image_original_resize = image.resize((processor.size['shortest_edge'], processor.size['shortest_edge']))

    image_patches = [image_original_resize] + patches
    image_patches = [processor.preprocess(image_patch, return_tensors='pt')['pixel_values'][0]
                     for image_patch in image_patches]
    return torch.stack(image_patches, dim=0)


def load_image_from_base64(image):
    return Image.open(BytesIO(base64.b64decode(image)))


def expand2square(pil_img, background_color):
    width, height = pil_img.size
    if width == height:
        return pil_img
    elif width > height:
        result = Image.new(pil_img.mode, (width, width), background_color)
        result.paste(pil_img, (0, (width - height) // 2))
        return result
    else:
        result = Image.new(pil_img.mode, (height, height), background_color)
        result.paste(pil_img, ((height - width) // 2, 0))
        return result


def process_images(images, image_processor, model_cfg):
    image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None)
    new_images = []
    if image_aspect_ratio == 'pad':
        for image in images:
            image = expand2square(image, tuple(int(x*255) for x in image_processor.image_mean))
            image = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
            new_images.append(image)
    elif image_aspect_ratio == "anyres":
        for image in images:
            image = process_anyres_image(image, image_processor, model_cfg.image_grid_pinpoints)
            new_images.append(image)
    else:
        return image_processor(images, return_tensors='pt')['pixel_values']
    if all(x.shape == new_images[0].shape for x in new_images):
        new_images = torch.stack(new_images, dim=0)
    return new_images


def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
    prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]

    def insert_separator(X, sep):
        return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]

    input_ids = []
    offset = 0
    if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
        offset = 1
        input_ids.append(prompt_chunks[0][0])

    for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
        input_ids.extend(x[offset:])

    if return_tensors is not None:
        if return_tensors == 'pt':
            return torch.tensor(input_ids, dtype=torch.long)
        raise ValueError(f'Unsupported tensor type: {return_tensors}')
    return input_ids


def get_model_name_from_path(model_path):
    model_path = model_path.strip("/")
    model_paths = model_path.split("/")
    if model_paths[-1].startswith('checkpoint-'):
        return model_paths[-2] + "_" + model_paths[-1]
    else:
        return model_paths[-1]

class KeywordsStoppingCriteria(StoppingCriteria):
    def __init__(self, keywords, tokenizer, input_ids):
        self.keywords = keywords
        self.keyword_ids = []
        self.max_keyword_len = 0
        for keyword in keywords:
            cur_keyword_ids = tokenizer(keyword).input_ids
            if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id:
                cur_keyword_ids = cur_keyword_ids[1:]
            if len(cur_keyword_ids) > self.max_keyword_len:
                self.max_keyword_len = len(cur_keyword_ids)
            self.keyword_ids.append(torch.tensor(cur_keyword_ids))
        self.tokenizer = tokenizer
        self.start_len = input_ids.shape[1]

    def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len)
        self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
        for keyword_id in self.keyword_ids:
            truncated_output_ids = output_ids[0, -keyword_id.shape[0]:]
            if torch.equal(truncated_output_ids, keyword_id):
                return True
        outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
        for keyword in self.keywords:
            if keyword in outputs:
                return True
        return False

    def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        outputs = []
        for i in range(output_ids.shape[0]):
            outputs.append(self.call_for_batch(output_ids[i].unsqueeze(0), scores))
        return all(outputs)
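To make the image-token plumbing concrete: tokenizer_image_token splits the prompt on the "<image>" placeholder and splices the IMAGE_TOKEN_INDEX sentinel (-200, from constants.py) between the tokenized chunks, which the model's multimodal preparation step later replaces with vision features. A small illustrative sketch follows, assuming the aya-23-8B tokenizer used elsewhere in this repo can be downloaded.

# Illustration only; IMAGE_TOKEN_INDEX comes from constants.py (-200).
from transformers import AutoTokenizer
from constants import IMAGE_TOKEN_INDEX
from mm_utils import tokenizer_image_token

tok = AutoTokenizer.from_pretrained("CohereForAI/aya-23-8B", use_fast=True)
ids = tokenizer_image_token("<image>\nDescribe this photo.", tok,
                            IMAGE_TOKEN_INDEX, return_tensors="pt")
print(ids.shape)                                # 1-D tensor of token ids
print((ids == IMAGE_TOKEN_INDEX).sum().item())  # exactly one image sentinel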
model/___init__.py ADDED
@@ -0,0 +1,7 @@
try:
    from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig
    from .language_model.llava_mpt import LlavaMptForCausalLM, LlavaMptConfig
    from .language_model.llava_mistral import LlavaMistralForCausalLM, LlavaMistralConfig
    from .language_model.llava_cohere import LlavaCohereForCausalLM, LlavaCohereConfig
except:
    pass
model/apply_delta.py ADDED
@@ -0,0 +1,48 @@
"""
Usage:
python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta
"""
import argparse

import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from llava import LlavaLlamaForCausalLM


def apply_delta(base_model_path, target_model_path, delta_path):
    print("Loading base model")
    base = AutoModelForCausalLM.from_pretrained(
        base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)

    print("Loading delta")
    delta = LlavaLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
    delta_tokenizer = AutoTokenizer.from_pretrained(delta_path)

    print("Applying delta")
    for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"):
        if name not in base.state_dict():
            assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model'
            continue
        if param.data.shape == base.state_dict()[name].shape:
            param.data += base.state_dict()[name]
        else:
            assert name in ['model.embed_tokens.weight', 'lm_head.weight'], \
                f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}'
            bparam = base.state_dict()[name]
            param.data[:bparam.shape[0], :bparam.shape[1]] += bparam

    print("Saving target model")
    delta.save_pretrained(target_model_path)
    delta_tokenizer.save_pretrained(target_model_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-model-path", type=str, required=True)
    parser.add_argument("--target-model-path", type=str, required=True)
    parser.add_argument("--delta-path", type=str, required=True)

    args = parser.parse_args()

    apply_delta(args.base_model_path, args.target_model_path, args.delta_path)
model/builder.py ADDED
@@ -0,0 +1,192 @@
#    Copyright 2023 Haotian Liu
#
#    Licensed under the Apache License, Version 2.0 (the "License");
#    you may not use this file except in compliance with the License.
#    You may obtain a copy of the License at
#
#        http://www.apache.org/licenses/LICENSE-2.0
#
#    Unless required by applicable law or agreed to in writing, software
#    distributed under the License is distributed on an "AS IS" BASIS,
#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#    See the License for the specific language governing permissions and
#    limitations under the License.


import os
import warnings
import shutil

from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
import torch
from llava.model import *
from llava.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN

# from transformers.models.cohere.tokenization_cohere_fast import CohereTokenizerFast
# from llava.model.language_model.llava_cohere import LlavaCohereForCausalLM, LlavaCohereConfig



def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", device="cuda", use_flash_attn=False, **kwargs):
    kwargs = {"device_map": device_map, **kwargs}

    if device != "cuda":
        kwargs['device_map'] = {"": device}

    if load_8bit:
        kwargs['load_in_8bit'] = True
    elif load_4bit:
        kwargs['load_in_4bit'] = True
        kwargs['quantization_config'] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type='nf4'
        )
    else:
        kwargs['torch_dtype'] = torch.float16

    if use_flash_attn:
        kwargs['attn_implementation'] = 'flash_attention_2'

    if 'llava' in model_name.lower():
        # Load LLaVA model
        if 'lora' in model_name.lower() and model_base is None:
            warnings.warn('There is `lora` in model name but no `model_base` is provided. If you are loading a LoRA model, please provide the `model_base` argument. Detailed instruction: https://github.com/haotian-liu/LLaVA#launch-a-model-worker-lora-weights-unmerged.')
        if 'lora' in model_name.lower() and model_base is not None:
            from llava.model.language_model.llava_llama import LlavaConfig
            lora_cfg_pretrained = LlavaConfig.from_pretrained(model_path)
            tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
            print('Loading LLaVA from base model...')
            model = LlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs)
            token_num, tokem_dim = model.lm_head.out_features, model.lm_head.in_features
            if model.lm_head.weight.shape[0] != token_num:
                model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype))
                model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype))

            print('Loading additional LLaVA weights...')
            if os.path.exists(os.path.join(model_path, 'non_lora_trainables.bin')):
                non_lora_trainables = torch.load(os.path.join(model_path, 'non_lora_trainables.bin'), map_location='cpu')
            else:
                # this is probably from HF Hub
                from huggingface_hub import hf_hub_download
                def load_from_hf(repo_id, filename, subfolder=None):
                    cache_file = hf_hub_download(
                        repo_id=repo_id,
                        filename=filename,
                        subfolder=subfolder)
                    return torch.load(cache_file, map_location='cpu')
                non_lora_trainables = load_from_hf(model_path, 'non_lora_trainables.bin')
            non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()}
            if any(k.startswith('model.model.') for k in non_lora_trainables):
                non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()}
            model.load_state_dict(non_lora_trainables, strict=False)

            from peft import PeftModel
            print('Loading LoRA weights...')
            model = PeftModel.from_pretrained(model, model_path)
            print('Merging LoRA weights...')
            model = model.merge_and_unload()
            print('Model is loaded...')
        elif model_base is not None:
            # this may be mm projector only
            print('Loading LLaVA from base model...')
            if 'mpt' in model_name.lower():
                if not os.path.isfile(os.path.join(model_path, 'configuration_mpt.py')):
                    shutil.copyfile(os.path.join(model_base, 'configuration_mpt.py'), os.path.join(model_path, 'configuration_mpt.py'))
                tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=True)
                cfg_pretrained = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
                model = LlavaMptForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
            else:
                tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
                cfg_pretrained = AutoConfig.from_pretrained(model_path)
                model = LlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)

            mm_projector_weights = torch.load(os.path.join(model_path, 'mm_projector.bin'), map_location='cpu')
            mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()}
            model.load_state_dict(mm_projector_weights, strict=False)
        else:
            if 'mpt' in model_name.lower():
                tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
                model = LlavaMptForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
            elif 'mistral' in model_name.lower():
                tokenizer = AutoTokenizer.from_pretrained(model_path)
                model = LlavaMistralForCausalLM.from_pretrained(
                    model_path,
                    low_cpu_mem_usage=True,
                    **kwargs
                )
            else:
                tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
                model = LlavaLlamaForCausalLM.from_pretrained(
                    model_path,
                    low_cpu_mem_usage=True,
                    **kwargs
                )
    elif 'aya' in model_name.lower():

        ## TO DO : Currently only works for projector pretrained models. Doesnt support PEFT models or models with base LLMs trained
        tokenizer = AutoTokenizer.from_pretrained(model_base, padding_side="right", use_fast=True)
        cfg_pretrained = LlavaCohereConfig.from_pretrained(model_path)
        model = LlavaCohereForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)

        ## TO DO : Improve the processing/loading/saving of the projector file
        projector_file_path = os.path.join(os.getcwd(), 'mm_projector.bin')
        if not os.path.exists(projector_file_path):

            projector_file_link = os.path.join('https://huggingface.co/',model_path,'resolve/main/mm_projector.bin')
            print(f"Downloading {projector_file_link} ...")
            os.system(f"wget {projector_file_link}")

        mm_projector_weights = torch.load(projector_file_path, map_location='cpu')
        mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()}
        model.load_state_dict(mm_projector_weights, strict=False)



    else:
        # Load language model
        if model_base is not None:
            # PEFT model
            from peft import PeftModel
            tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
            model = AutoModelForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, **kwargs)
            print(f"Loading LoRA weights from {model_path}")
            model = PeftModel.from_pretrained(model, model_path)
            print(f"Merging weights")
            model = model.merge_and_unload()
            print('Convert to FP16...')
            model.to(torch.float16)
        else:
            use_fast = False
            if 'mpt' in model_name.lower():
                tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
                model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, trust_remote_code=True, **kwargs)
            else:
                tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
                model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)

    image_processor = None

    if 'llava' in model_name.lower() or 'aya' in model_name.lower():
        mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
        mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
        if mm_use_im_patch_token:
            tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
        if mm_use_im_start_end:
            tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
        model.resize_token_embeddings(len(tokenizer))

        vision_tower = model.get_vision_tower()
        if not vision_tower.is_loaded:
            vision_tower.load_model(device_map=device_map)
        if device_map != 'auto':
            vision_tower.to(device=device_map, dtype=torch.float16)
        image_processor = vision_tower.image_processor

    if hasattr(model.config, "max_sequence_length"):
        context_len = model.config.max_sequence_length
    else:
        context_len = 2048

    return tokenizer, model, image_processor, context_len
model/consolidate.py ADDED
@@ -0,0 +1,29 @@
+ """
+ Usage:
+ python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate
+ """
+ import argparse
+ 
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ from llava.model import *
+ from llava.model.utils import auto_upgrade
+ 
+ 
+ def consolidate_ckpt(src_path, dst_path):
+     print("Loading model")
+     auto_upgrade(src_path)
+     src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
+     src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False)
+     src_model.save_pretrained(dst_path)
+     src_tokenizer.save_pretrained(dst_path)
+ 
+ 
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--src", type=str, required=True)
+     parser.add_argument("--dst", type=str, required=True)
+ 
+     args = parser.parse_args()
+ 
+     consolidate_ckpt(args.src, args.dst)
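As a quick sanity check, the consolidated folder should load back as a plain transformers checkpoint with no extra handling; a minimal sketch, with the destination path taken from the usage string above:

import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

dst = os.path.expanduser("~/model_weights/llava-7b_consolidate")  # folder written by consolidate_ckpt
model = AutoModelForCausalLM.from_pretrained(dst, torch_dtype=torch.float16, low_cpu_mem_usage=True)
tokenizer = AutoTokenizer.from_pretrained(dst, use_fast=False)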
model/language_model/llava_cohere.py ADDED
@@ -0,0 +1,144 @@
+ '''
+ reference: https://github.com/Satyajitv/LLaVA/blob/maya_exp/llava/model/language_model/llava_cohere.py
+ '''
+ 
+ from typing import List, Optional, Tuple, Union
+ 
+ import torch
+ import torch.nn as nn
+ 
+ from transformers import AutoConfig, AutoModelForCausalLM, \
+     CohereConfig, CohereModel, CohereForCausalLM
+ 
+ from transformers.modeling_outputs import CausalLMOutputWithPast
+ from transformers.generation.utils import GenerateOutput
+ 
+ from model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM
+ 
+ 
+ class LlavaCohereConfig(CohereConfig):
+     model_type = "llava_cohere"
+ 
+ 
+ class LlavaCohereModel(LlavaMetaModel, CohereModel):
+     config_class = LlavaCohereConfig
+ 
+     def __init__(self, config: CohereConfig):
+         super(LlavaCohereModel, self).__init__(config)
+ 
+ 
+ class LlavaCohereForCausalLM(CohereForCausalLM, LlavaMetaForCausalLM):
+     config_class = LlavaCohereConfig
+ 
+     def __init__(self, config):
+         super().__init__(config)
+         self.model = LlavaCohereModel(config)
+ 
+         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+ 
+         # Initialize weights and apply final processing
+         self.post_init()
+ 
+     def get_model(self):
+         return self.model
+ 
+     def forward(
+         self,
+         input_ids: torch.LongTensor = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[List[torch.FloatTensor]] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         images: Optional[torch.FloatTensor] = None,
+         image_sizes: Optional[List[List[int]]] = None,
+         return_dict: Optional[bool] = None,
+         cache_position=None,
+     ) -> Union[Tuple, CausalLMOutputWithPast]:
+ 
+         if inputs_embeds is None:
+             (
+                 input_ids,
+                 position_ids,
+                 attention_mask,
+                 past_key_values,
+                 inputs_embeds,
+                 labels
+             ) = self.prepare_inputs_labels_for_multimodal(
+                 input_ids,
+                 position_ids,
+                 attention_mask,
+                 past_key_values,
+                 labels,
+                 images,
+                 image_sizes
+             )
+ 
+         return super().forward(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             position_ids=position_ids,
+             past_key_values=past_key_values,
+             inputs_embeds=inputs_embeds,
+             labels=labels,
+             use_cache=use_cache,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict
+         )
+ 
+     @torch.no_grad()
+     def generate(
+         self,
+         inputs: Optional[torch.Tensor] = None,
+         images: Optional[torch.Tensor] = None,
+         image_sizes: Optional[torch.Tensor] = None,
+         **kwargs,
+     ) -> Union[GenerateOutput, torch.LongTensor]:
+         position_ids = kwargs.pop("position_ids", None)
+         attention_mask = kwargs.pop("attention_mask", None)
+         if "inputs_embeds" in kwargs:
+             raise NotImplementedError("`inputs_embeds` is not supported")
+ 
+         if images is not None:
+             (
+                 inputs,
+                 position_ids,
+                 attention_mask,
+                 _,
+                 inputs_embeds,
+                 _
+             ) = self.prepare_inputs_labels_for_multimodal(
+                 inputs,
+                 position_ids,
+                 attention_mask,
+                 None,
+                 None,
+                 images,
+                 image_sizes=image_sizes
+             )
+         else:
+             inputs_embeds = self.get_model().embed_tokens(inputs)
+ 
+         return super().generate(
+             position_ids=position_ids,
+             attention_mask=attention_mask,
+             inputs_embeds=inputs_embeds,
+             **kwargs
+         )
+ 
+     def prepare_inputs_for_generation(self, input_ids, past_key_values=None,
+                                       inputs_embeds=None, **kwargs):
+         images = kwargs.pop("images", None)
+         image_sizes = kwargs.pop("image_sizes", None)
+         inputs = super().prepare_inputs_for_generation(
+             input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
+         )
+         if images is not None:
+             inputs['images'] = images
+         if image_sizes is not None:
+             inputs['image_sizes'] = image_sizes
+         return inputs
+ 
+ 
+ AutoConfig.register("llava_cohere", LlavaCohereConfig)
+ AutoModelForCausalLM.register(LlavaCohereConfig, LlavaCohereForCausalLM)
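Because the module registers LlavaCohereConfig and LlavaCohereForCausalLM with the Auto classes on import, a checkpoint whose config.json declares model_type "llava_cohere" resolves through the standard transformers entry points. A minimal sketch (the repo id is an assumption; any such checkpoint works):

from transformers import AutoConfig

import model.language_model.llava_cohere  # noqa: F401  (import side effect: registers the llava_cohere classes)

# Illustrative repo id; anything with model_type "llava_cohere" in config.json resolves the same way.
cfg = AutoConfig.from_pretrained("maya-multimodal/maya")
assert cfg.model_type == "llava_cohere"
print(type(cfg).__name__)  # LlavaCohereConfig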
model/llava_arch.py ADDED
@@ -0,0 +1,371 @@
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from abc import ABC, abstractmethod
17
+
18
+ import torch
19
+ import torch.nn as nn
20
+
21
+ from model.multimodal_encoder.builder import build_vision_tower
22
+ from model.multimodal_projector.builder import build_vision_projector
23
+
24
+ from constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
25
+
26
+ from mm_utils import get_anyres_image_grid_shape
27
+
28
+
29
+ class LlavaMetaModel:
30
+
31
+ def __init__(self, config):
32
+ super(LlavaMetaModel, self).__init__(config)
33
+
34
+ if hasattr(config, "mm_vision_tower"):
35
+ self.vision_tower = build_vision_tower(config, delay_load=True)
36
+ self.mm_projector = build_vision_projector(config)
37
+
38
+ if 'unpad' in getattr(config, 'mm_patch_merge_type', ''):
39
+ self.image_newline = nn.Parameter(
40
+ torch.empty(config.hidden_size, dtype=self.dtype)
41
+ )
42
+
43
+ def get_vision_tower(self):
44
+ vision_tower = getattr(self, 'vision_tower', None)
45
+ if type(vision_tower) is list:
46
+ vision_tower = vision_tower[0]
47
+ return vision_tower
48
+
49
+ def initialize_vision_modules(self, model_args, fsdp=None):
50
+ vision_tower = model_args.vision_tower
51
+ mm_vision_select_layer = model_args.mm_vision_select_layer
52
+ mm_vision_select_feature = model_args.mm_vision_select_feature
53
+ pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter
54
+ mm_patch_merge_type = model_args.mm_patch_merge_type
55
+
56
+ self.config.mm_vision_tower = vision_tower
57
+
58
+ if self.get_vision_tower() is None:
59
+ vision_tower = build_vision_tower(model_args)
60
+
61
+ if fsdp is not None and len(fsdp) > 0:
62
+ self.vision_tower = [vision_tower]
63
+ else:
64
+ self.vision_tower = vision_tower
65
+ else:
66
+ if fsdp is not None and len(fsdp) > 0:
67
+ vision_tower = self.vision_tower[0]
68
+ else:
69
+ vision_tower = self.vision_tower
70
+ vision_tower.load_model()
71
+
72
+ self.config.use_mm_proj = True
73
+ self.config.mm_projector_type = getattr(model_args, 'mm_projector_type', 'linear')
74
+ self.config.mm_hidden_size = vision_tower.hidden_size
75
+ self.config.mm_vision_select_layer = mm_vision_select_layer
76
+ self.config.mm_vision_select_feature = mm_vision_select_feature
77
+ self.config.mm_patch_merge_type = mm_patch_merge_type
78
+
79
+ if getattr(self, 'mm_projector', None) is None:
80
+ self.mm_projector = build_vision_projector(self.config)
81
+
82
+ if 'unpad' in mm_patch_merge_type:
83
+ embed_std = 1 / torch.sqrt(torch.tensor(self.config.hidden_size, dtype=self.dtype))
84
+ self.image_newline = nn.Parameter(
85
+ torch.randn(self.config.hidden_size, dtype=self.dtype) * embed_std
86
+ )
87
+ else:
88
+ # In case it is frozen by LoRA
89
+ for p in self.mm_projector.parameters():
90
+ p.requires_grad = True
91
+
92
+ if pretrain_mm_mlp_adapter is not None:
93
+ mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
94
+ def get_w(weights, keyword):
95
+ return {k.split(keyword + '.')[1]: v for k, v in weights.items() if keyword in k}
96
+
97
+ self.mm_projector.load_state_dict(get_w(mm_projector_weights, 'mm_projector'))
98
+
99
+
100
+ def unpad_image(tensor, original_size):
101
+ """
102
+ Unpads a PyTorch tensor of a padded and resized image.
103
+
104
+ Args:
105
+ tensor (torch.Tensor): The image tensor, assumed to be in CxHxW format.
106
+ original_size (tuple): The original size of PIL image (width, height).
107
+
108
+ Returns:
109
+ torch.Tensor: The unpadded image tensor.
110
+ """
111
+ original_width, original_height = original_size
112
+ current_height, current_width = tensor.shape[1:]
113
+
114
+ original_aspect_ratio = original_width / original_height
115
+ current_aspect_ratio = current_width / current_height
116
+
117
+ if original_aspect_ratio > current_aspect_ratio:
118
+ scale_factor = current_width / original_width
119
+ new_height = int(original_height * scale_factor)
120
+ padding = (current_height - new_height) // 2
121
+ unpadded_tensor = tensor[:, padding:current_height - padding, :]
122
+ else:
123
+ scale_factor = current_height / original_height
124
+ new_width = int(original_width * scale_factor)
125
+ padding = (current_width - new_width) // 2
126
+ unpadded_tensor = tensor[:, :, padding:current_width - padding]
127
+
128
+ return unpadded_tensor
129
+
130
+
131
+ class LlavaMetaForCausalLM(ABC):
132
+
133
+ @abstractmethod
134
+ def get_model(self):
135
+ pass
136
+
137
+ def get_vision_tower(self):
138
+ return self.get_model().get_vision_tower()
139
+
140
+ def encode_images(self, images):
141
+ image_features = self.get_model().get_vision_tower()(images)
142
+ image_features = self.get_model().mm_projector(image_features)
143
+ return image_features
144
+
145
+ def prepare_inputs_labels_for_multimodal(
146
+ self, input_ids, position_ids, attention_mask, past_key_values, labels,
147
+ images, image_sizes=None
148
+ ):
149
+ vision_tower = self.get_vision_tower()
150
+ if vision_tower is None or images is None or input_ids.shape[1] == 1:
151
+ return input_ids, position_ids, attention_mask, past_key_values, None, labels
152
+
153
+ if type(images) is list or images.ndim == 5:
154
+ if type(images) is list:
155
+ images = [x.unsqueeze(0) if x.ndim == 3 else x for x in images]
156
+ concat_images = torch.cat([image for image in images], dim=0)
157
+ image_features = self.encode_images(concat_images)
158
+ split_sizes = [image.shape[0] for image in images]
159
+ image_features = torch.split(image_features, split_sizes, dim=0)
160
+ mm_patch_merge_type = getattr(self.config, 'mm_patch_merge_type', 'flat')
161
+ image_aspect_ratio = getattr(self.config, 'image_aspect_ratio', 'square')
162
+ if mm_patch_merge_type == 'flat':
163
+ image_features = [x.flatten(0, 1) for x in image_features]
164
+ elif mm_patch_merge_type.startswith('spatial'):
165
+ new_image_features = []
166
+ for image_idx, image_feature in enumerate(image_features):
167
+ if image_feature.shape[0] > 1:
168
+ base_image_feature = image_feature[0]
169
+ image_feature = image_feature[1:]
170
+ height = width = self.get_vision_tower().num_patches_per_side
171
+ assert height * width == base_image_feature.shape[0]
172
+ if image_aspect_ratio == 'anyres':
173
+ num_patch_width, num_patch_height = get_anyres_image_grid_shape(image_sizes[image_idx], self.config.image_grid_pinpoints, self.get_vision_tower().config.image_size)
174
+ image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1)
175
+ else:
176
+ raise NotImplementedError
177
+ if 'unpad' in mm_patch_merge_type:
178
+ image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
179
+ image_feature = image_feature.flatten(1, 2).flatten(2, 3)
180
+ image_feature = unpad_image(image_feature, image_sizes[image_idx])
181
+ image_feature = torch.cat((
182
+ image_feature,
183
+ self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device)
184
+ ), dim=-1)
185
+ image_feature = image_feature.flatten(1, 2).transpose(0, 1)
186
+ else:
187
+ image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous()
188
+ image_feature = image_feature.flatten(0, 3)
189
+ image_feature = torch.cat((base_image_feature, image_feature), dim=0)
190
+ else:
191
+ image_feature = image_feature[0]
192
+ if 'unpad' in mm_patch_merge_type:
193
+ image_feature = torch.cat((
194
+ image_feature,
195
+ self.model.image_newline[None].to(image_feature.device)
196
+ ), dim=0)
197
+ new_image_features.append(image_feature)
198
+ image_features = new_image_features
199
+ else:
200
+ raise ValueError(f"Unexpected mm_patch_merge_type: {self.config.mm_patch_merge_type}")
201
+ else:
202
+ image_features = self.encode_images(images)
203
+
204
+ # TODO: image start / end is not implemented here to support pretraining.
205
+ if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False):
206
+ raise NotImplementedError
207
+
208
+ # Let's just add dummy tensors if they do not exist,
209
+ # it is a headache to deal with None all the time.
210
+ # But it is not ideal, and if you have a better idea,
211
+ # please open an issue / submit a PR, thanks.
212
+ _labels = labels
213
+ _position_ids = position_ids
214
+ _attention_mask = attention_mask
215
+ if attention_mask is None:
216
+ attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
217
+ else:
218
+ attention_mask = attention_mask.bool()
219
+ if position_ids is None:
220
+ position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
221
+ if labels is None:
222
+ labels = torch.full_like(input_ids, IGNORE_INDEX)
223
+
224
+ # remove the padding using attention_mask -- FIXME
225
+ _input_ids = input_ids
226
+ input_ids = [cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)]
227
+ labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)]
228
+
229
+ new_input_embeds = []
230
+ new_labels = []
231
+ cur_image_idx = 0
232
+ for batch_idx, cur_input_ids in enumerate(input_ids):
233
+ num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
234
+ if num_images == 0:
235
+ cur_image_features = image_features[cur_image_idx]
236
+ cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids)
237
+ cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0]], dim=0)
238
+ new_input_embeds.append(cur_input_embeds)
239
+ new_labels.append(labels[batch_idx])
240
+ cur_image_idx += 1
241
+ continue
242
+
243
+ image_token_indices = [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
244
+ cur_input_ids_noim = []
245
+ cur_labels = labels[batch_idx]
246
+ cur_labels_noim = []
247
+ for i in range(len(image_token_indices) - 1):
248
+ cur_input_ids_noim.append(cur_input_ids[image_token_indices[i]+1:image_token_indices[i+1]])
249
+ cur_labels_noim.append(cur_labels[image_token_indices[i]+1:image_token_indices[i+1]])
250
+ split_sizes = [x.shape[0] for x in cur_labels_noim]
251
+ cur_input_embeds = self.get_model().embed_tokens(torch.cat(cur_input_ids_noim))
252
+ cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0)
253
+ cur_new_input_embeds = []
254
+ cur_new_labels = []
255
+
256
+ for i in range(num_images + 1):
257
+ cur_new_input_embeds.append(cur_input_embeds_no_im[i])
258
+ cur_new_labels.append(cur_labels_noim[i])
259
+ if i < num_images:
260
+ try:
261
+ cur_image_features = image_features[cur_image_idx]
262
+ except Exception as e:
263
+ print(f'IndexError while indexing image_features (possible data/image mismatch or missing image): {e}')
264
+ cur_image_idx += 1
265
+ cur_new_input_embeds.append(cur_image_features)
266
+ cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=cur_labels.device, dtype=cur_labels.dtype))
267
+
268
+ cur_new_input_embeds = [x.to(self.device) for x in cur_new_input_embeds]
269
+
270
+ cur_new_input_embeds = torch.cat(cur_new_input_embeds)
271
+ cur_new_labels = torch.cat(cur_new_labels)
272
+
273
+ new_input_embeds.append(cur_new_input_embeds)
274
+ new_labels.append(cur_new_labels)
275
+
276
+ # Truncate sequences to max length as image embeddings can make the sequence longer
277
+ tokenizer_model_max_length = getattr(self.config, 'tokenizer_model_max_length', None)
278
+ if tokenizer_model_max_length is not None:
279
+ new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds]
280
+ new_labels = [x[:tokenizer_model_max_length] for x in new_labels]
281
+
282
+ # Combine them
283
+ max_len = max(x.shape[0] for x in new_input_embeds)
284
+ batch_size = len(new_input_embeds)
285
+
286
+ new_input_embeds_padded = []
287
+ new_labels_padded = torch.full((batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device)
288
+ attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device)
289
+ position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)
290
+
291
+ for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
292
+ cur_len = cur_new_embed.shape[0]
293
+ if getattr(self.config, 'tokenizer_padding_side', 'right') == "left":
294
+ new_input_embeds_padded.append(torch.cat((
295
+ torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device),
296
+ cur_new_embed
297
+ ), dim=0))
298
+ if cur_len > 0:
299
+ new_labels_padded[i, -cur_len:] = cur_new_labels
300
+ attention_mask[i, -cur_len:] = True
301
+ position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
302
+ else:
303
+ new_input_embeds_padded.append(torch.cat((
304
+ cur_new_embed,
305
+ torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)
306
+ ), dim=0))
307
+ if cur_len > 0:
308
+ new_labels_padded[i, :cur_len] = cur_new_labels
309
+ attention_mask[i, :cur_len] = True
310
+ position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
311
+
312
+ new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
313
+
314
+ if _labels is None:
315
+ new_labels = None
316
+ else:
317
+ new_labels = new_labels_padded
318
+
319
+ if _attention_mask is None:
320
+ attention_mask = None
321
+ else:
322
+ attention_mask = attention_mask.to(dtype=_attention_mask.dtype)
323
+
324
+ if _position_ids is None:
325
+ position_ids = None
326
+
327
+ return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels
328
+
329
+ def initialize_vision_tokenizer(self, model_args, tokenizer):
330
+ if model_args.mm_use_im_patch_token:
331
+ tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
332
+ self.resize_token_embeddings(len(tokenizer))
333
+
334
+ if model_args.mm_use_im_start_end:
335
+ num_new_tokens = tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
336
+ self.resize_token_embeddings(len(tokenizer))
337
+
338
+ if num_new_tokens > 0:
339
+ input_embeddings = self.get_input_embeddings().weight.data
340
+ output_embeddings = self.get_output_embeddings().weight.data
341
+
342
+ input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
343
+ dim=0, keepdim=True)
344
+ output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
345
+ dim=0, keepdim=True)
346
+
347
+ input_embeddings[-num_new_tokens:] = input_embeddings_avg
348
+ output_embeddings[-num_new_tokens:] = output_embeddings_avg
349
+
350
+ if model_args.tune_mm_mlp_adapter:
351
+ for p in self.get_input_embeddings().parameters():
352
+ p.requires_grad = True
353
+ for p in self.get_output_embeddings().parameters():
354
+ p.requires_grad = False
355
+
356
+ if model_args.pretrain_mm_mlp_adapter:
357
+ mm_projector_weights = torch.load(model_args.pretrain_mm_mlp_adapter, map_location='cpu')
358
+ embed_tokens_weight = mm_projector_weights['model.embed_tokens.weight']
359
+ assert num_new_tokens == 2
360
+ if input_embeddings.shape == embed_tokens_weight.shape:
361
+ input_embeddings[-num_new_tokens:] = embed_tokens_weight[-num_new_tokens:]
362
+ elif embed_tokens_weight.shape[0] == num_new_tokens:
363
+ input_embeddings[-num_new_tokens:] = embed_tokens_weight
364
+ else:
365
+ raise ValueError(f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. Number of new tokens: {num_new_tokens}.")
366
+ elif model_args.mm_use_im_patch_token:
367
+ if model_args.tune_mm_mlp_adapter:
368
+ for p in self.get_input_embeddings().parameters():
369
+ p.requires_grad = False
370
+ for p in self.get_output_embeddings().parameters():
371
+ p.requires_grad = False
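The core bookkeeping in prepare_inputs_labels_for_multimodal is splitting each sequence at the image placeholder tokens and splicing the projected image features into the gaps. A standalone sketch of just that splitting step, assuming the conventional LLaVA value IMAGE_TOKEN_INDEX = -200 from constants.py:

import torch

IMAGE_TOKEN_INDEX = -200  # assumed to match constants.py (standard LLaVA convention)

cur_input_ids = torch.tensor([5, 6, IMAGE_TOKEN_INDEX, 7, 8, 9, IMAGE_TOKEN_INDEX, 10])

# Same index bookkeeping as above: sentinel boundaries at -1 and len(seq),
# then the text chunks that sit between consecutive image tokens.
image_token_indices = [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
chunks = [cur_input_ids[image_token_indices[i] + 1:image_token_indices[i + 1]]
          for i in range(len(image_token_indices) - 1)]
print([c.tolist() for c in chunks])  # [[5, 6], [7, 8, 9], [10]]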
model/make_delta.py ADDED
@@ -0,0 +1,52 @@
+ """
+ Usage:
+ python3 -m llava.model.make_delta --base ~/model_weights/llama-7b --target ~/model_weights/llava-7b --delta ~/model_weights/llava-7b-delta --hub-repo-id liuhaotian/llava-7b-delta
+ """
+ import argparse
+ 
+ import torch
+ from tqdm import tqdm
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ from llava.model.utils import auto_upgrade
+ 
+ 
+ def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id):
+     print("Loading base model")
+     base = AutoModelForCausalLM.from_pretrained(
+         base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
+ 
+     print("Loading target model")
+     auto_upgrade(target_model_path)
+     target = AutoModelForCausalLM.from_pretrained(target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
+ 
+     print("Calculating delta")
+     for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"):
+         if name not in base.state_dict():
+             assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model'
+             continue
+         if param.data.shape == base.state_dict()[name].shape:
+             param.data -= base.state_dict()[name]
+         else:
+             assert name in ['model.embed_tokens.weight', 'lm_head.weight'], f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}'
+             bparam = base.state_dict()[name]
+             param.data[:bparam.shape[0], :bparam.shape[1]] -= bparam
+ 
+     print("Saving delta")
+     if hub_repo_id:
+         kwargs = {"push_to_hub": True, "repo_id": hub_repo_id}
+     else:
+         kwargs = {}
+     target.save_pretrained(delta_path, **kwargs)
+     target_tokenizer = AutoTokenizer.from_pretrained(target_model_path)
+     target_tokenizer.save_pretrained(delta_path, **kwargs)
+ 
+ 
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--base-model-path", type=str, required=True)
+     parser.add_argument("--target-model-path", type=str, required=True)
+     parser.add_argument("--delta-path", type=str, required=True)
+     parser.add_argument("--hub-repo-id", type=str, default=None)
+     args = parser.parse_args()
+ 
+     make_delta(args.base_model_path, args.target_model_path, args.delta_path, args.hub_repo_id)
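The inverse operation, adding the delta back onto the base weights to recover the target model, mirrors the loop above; a hedged sketch (the function name apply_delta is used here only for illustration):

import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM

def apply_delta(base_model_path, delta_path):
    base = AutoModelForCausalLM.from_pretrained(base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
    delta = AutoModelForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
    for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"):
        if name not in base.state_dict():
            continue  # e.g. mm_projector weights, which have no base counterpart
        bparam = base.state_dict()[name]
        if param.data.shape == bparam.shape:
            param.data += bparam
        else:
            # embed_tokens / lm_head were only partially subtracted, so only add back that slice.
            param.data[:bparam.shape[0], :bparam.shape[1]] += bparam
    return delta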
model/multimodal_encoder/builder.py ADDED
@@ -0,0 +1,16 @@
+ import os
+ from .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2
+ from .siglip_encoder import SiglipVisionTower
+ 
+ def build_vision_tower(vision_tower_cfg, **kwargs):
+     vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None))
+     is_absolute_path_exists = os.path.exists(vision_tower)
+     use_s2 = getattr(vision_tower_cfg, 's2', False)
+     if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower:
+         if use_s2:
+             return CLIPVisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs)
+         else:
+             return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
+     elif 'siglip' in vision_tower:
+         return SiglipVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
+     raise ValueError(f'Unknown vision tower: {vision_tower}')
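build_vision_tower only needs an object exposing the right attributes, so a bare SimpleNamespace is enough to exercise the dispatch; a minimal sketch, with the checkpoint names as assumptions:

from types import SimpleNamespace
from model.multimodal_encoder.builder import build_vision_tower

cfg = SimpleNamespace(
    mm_vision_tower="openai/clip-vit-large-patch14-336",  # or e.g. "google/siglip-base-patch16-224"
    mm_vision_select_layer=-2,
    mm_vision_select_feature="patch",
)
tower = build_vision_tower(cfg, delay_load=True)  # delay_load only fetches the config, not the weights
print(type(tower).__name__)  # CLIPVisionTower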
model/multimodal_encoder/clip_encoder.py ADDED
@@ -0,0 +1,147 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig
5
+
6
+
7
+ class CLIPVisionTower(nn.Module):
8
+ def __init__(self, vision_tower, args, delay_load=False):
9
+ super().__init__()
10
+
11
+ self.is_loaded = False
12
+
13
+ self.vision_tower_name = vision_tower
14
+ self.select_layer = args.mm_vision_select_layer
15
+ self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
16
+
17
+ if not delay_load:
18
+ self.load_model()
19
+ elif getattr(args, 'unfreeze_mm_vision_tower', False):
20
+ self.load_model()
21
+ else:
22
+ self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)
23
+
24
+ def load_model(self, device_map=None):
25
+ if self.is_loaded:
26
+ print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
27
+ return
28
+
29
+ self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
30
+ self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
31
+ self.vision_tower.requires_grad_(False)
32
+
33
+ self.is_loaded = True
34
+
35
+ def feature_select(self, image_forward_outs):
36
+ image_features = image_forward_outs.hidden_states[self.select_layer]
37
+ if self.select_feature == 'patch':
38
+ image_features = image_features[:, 1:]
39
+ elif self.select_feature == 'cls_patch':
40
+ image_features = image_features
41
+ else:
42
+ raise ValueError(f'Unexpected select feature: {self.select_feature}')
43
+ return image_features
44
+
45
+ @torch.no_grad()
46
+ def forward(self, images):
47
+ if type(images) is list:
48
+ image_features = []
49
+ for image in images:
50
+ image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
51
+ image_feature = self.feature_select(image_forward_out).to(image.dtype)
52
+ image_features.append(image_feature)
53
+ else:
54
+ image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
55
+ image_features = self.feature_select(image_forward_outs).to(images.dtype)
56
+
57
+ return image_features
58
+
59
+ @property
60
+ def dummy_feature(self):
61
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
62
+
63
+ @property
64
+ def dtype(self):
65
+ return self.vision_tower.dtype
66
+
67
+ @property
68
+ def device(self):
69
+ return self.vision_tower.device
70
+
71
+ @property
72
+ def config(self):
73
+ if self.is_loaded:
74
+ return self.vision_tower.config
75
+ else:
76
+ return self.cfg_only
77
+
78
+ @property
79
+ def hidden_size(self):
80
+ return self.config.hidden_size
81
+
82
+ @property
83
+ def num_patches_per_side(self):
84
+ return self.config.image_size // self.config.patch_size
85
+
86
+ @property
87
+ def num_patches(self):
88
+ return (self.config.image_size // self.config.patch_size) ** 2
89
+
90
+
91
+
92
+ class CLIPVisionTowerS2(CLIPVisionTower):
93
+ def __init__(self, vision_tower, args, delay_load=False):
94
+ super().__init__(vision_tower, args, delay_load)
95
+
96
+ self.s2_scales = getattr(args, 's2_scales', '336,672,1008')
97
+ self.s2_scales = list(map(int, self.s2_scales.split(',')))
98
+ self.s2_scales.sort()
99
+ self.s2_split_size = self.s2_scales[0]
100
+ self.s2_image_size = self.s2_scales[-1]
101
+
102
+ try:
103
+ from s2wrapper import forward as multiscale_forward
104
+ except ImportError:
105
+ raise ImportError('Package s2wrapper not found! Please install by running: \npip install git+https://github.com/bfshi/scaling_on_scales.git')
106
+ self.multiscale_forward = multiscale_forward
107
+
108
+ # change resize/crop size in preprocessing to the largest image size in s2_scale
109
+ if not delay_load or getattr(args, 'unfreeze_mm_vision_tower', False):
110
+ self.image_processor.size['shortest_edge'] = self.s2_image_size
111
+ self.image_processor.crop_size['height'] = self.image_processor.crop_size['width'] = self.s2_image_size
112
+
113
+ def load_model(self, device_map=None):
114
+ if self.is_loaded:
115
+ print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
116
+ return
117
+
118
+ self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
119
+ self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
120
+ self.vision_tower.requires_grad_(False)
121
+
122
+ self.image_processor.size['shortest_edge'] = self.s2_image_size
123
+ self.image_processor.crop_size['height'] = self.image_processor.crop_size['width'] = self.s2_image_size
124
+
125
+ self.is_loaded = True
126
+
127
+ @torch.no_grad()
128
+ def forward_feature(self, images):
129
+ image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
130
+ image_features = self.feature_select(image_forward_outs).to(images.dtype)
131
+ return image_features
132
+
133
+ @torch.no_grad()
134
+ def forward(self, images):
135
+ if type(images) is list:
136
+ image_features = []
137
+ for image in images:
138
+ image_feature = self.multiscale_forward(self.forward_feature, image.unsqueeze(0), img_sizes=self.s2_scales, max_split_size=self.s2_split_size)
139
+ image_features.append(image_feature)
140
+ else:
141
+ image_features = self.multiscale_forward(self.forward_feature, images, img_sizes=self.s2_scales, max_split_size=self.s2_split_size)
142
+
143
+ return image_features
144
+
145
+ @property
146
+ def hidden_size(self):
147
+ return self.config.hidden_size * len(self.s2_scales)
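For reference, a small sketch of running the plain (non-S2) tower on a single image; with the common openai/clip-vit-large-patch14-336 checkpoint (an assumption here, and the image path is a placeholder) the selected 'patch' features come out as 24 x 24 = 576 tokens of width 1024:

import torch
from PIL import Image
from types import SimpleNamespace
from model.multimodal_encoder.clip_encoder import CLIPVisionTower

args = SimpleNamespace(mm_vision_select_layer=-2, mm_vision_select_feature="patch")
tower = CLIPVisionTower("openai/clip-vit-large-patch14-336", args=args)  # loads weights immediately

image = Image.open("example.jpg").convert("RGB")
pixels = tower.image_processor(images=image, return_tensors="pt")["pixel_values"]
feats = tower(pixels.to(dtype=tower.dtype, device=tower.device))
print(feats.shape)  # torch.Size([1, 576, 1024])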
model/multimodal_encoder/siglip_encoder.py ADDED
@@ -0,0 +1,147 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from transformers import SiglipVisionModel, SiglipImageProcessor, SiglipVisionConfig
5
+
6
+ class SiglipVisionTower(nn.Module):
7
+ def __init__(self, vision_tower, args, delay_load=False):
8
+ super().__init__()
9
+
10
+ self.is_loaded = False
11
+
12
+ self.vision_tower_name = vision_tower
13
+ self.select_layer = args.mm_vision_select_layer
14
+ self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
15
+
16
+ if not delay_load:
17
+ self.load_model()
18
+ elif getattr(args, 'unfreeze_mm_vision_tower', False):
19
+ self.load_model()
20
+ else:
21
+ self.cfg_only = SiglipVisionConfig.from_pretrained(self.vision_tower_name)
22
+
23
+ def load_model(self, device_map=None):
24
+ if self.is_loaded:
25
+ print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
26
+ return
27
+
28
+ self.image_processor = SiglipImageProcessor.from_pretrained(self.vision_tower_name)
29
+ self.vision_tower = SiglipVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
30
+ self.vision_tower.requires_grad_(False)
31
+
32
+ self.is_loaded = True
33
+
34
+ def feature_select(self, image_forward_outs):
35
+ image_features = image_forward_outs.hidden_states[self.select_layer]
36
+ if self.select_feature == 'patch':
37
+ image_features = image_features[:, 1:]
38
+ elif self.select_feature == 'cls_patch':
39
+ image_features = image_features
40
+ else:
41
+ raise ValueError(f'Unexpected select feature: {self.select_feature}')
42
+ return image_features
43
+
44
+ @torch.no_grad()
45
+ def forward(self, images):
46
+ if type(images) is list:
47
+ image_features = []
48
+ for image in images:
49
+ image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
50
+ image_feature = self.feature_select(image_forward_out).to(image.dtype)
51
+ image_features.append(image_feature)
52
+ else:
53
+ image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
54
+ image_features = self.feature_select(image_forward_outs).to(images.dtype)
55
+
56
+ return image_features
57
+
58
+ @property
59
+ def dummy_feature(self):
60
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
61
+
62
+ @property
63
+ def dtype(self):
64
+ return self.vision_tower.dtype
65
+
66
+ @property
67
+ def device(self):
68
+ return self.vision_tower.device
69
+
70
+ @property
71
+ def config(self):
72
+ if self.is_loaded:
73
+ return self.vision_tower.config
74
+ else:
75
+ return self.cfg_only
76
+
77
+ @property
78
+ def hidden_size(self):
79
+ return self.config.hidden_size
80
+
81
+ @property
82
+ def num_patches_per_side(self):
83
+ return self.config.image_size // self.config.patch_size
84
+
85
+ @property
86
+ def num_patches(self):
87
+ return (self.config.image_size // self.config.patch_size) ** 2
88
+
89
+
90
+
91
+ class SiglipVisionTowerS2(SiglipVisionTower):
92
+ def __init__(self, vision_tower, args, delay_load=False):
93
+ super().__init__(vision_tower, args, delay_load)
94
+
95
+ self.s2_scales = getattr(args, 's2_scales', '336,672,1008')
96
+ self.s2_scales = list(map(int, self.s2_scales.split(',')))
97
+ self.s2_scales.sort()
98
+ self.s2_split_size = self.s2_scales[0]
99
+ self.s2_image_size = self.s2_scales[-1]
100
+
101
+ try:
102
+ from s2wrapper import forward as multiscale_forward
103
+ except ImportError:
104
+ raise ImportError('Package s2wrapper not found! Please install by running: \npip install git+https://github.com/bfshi/scaling_on_scales.git')
105
+ self.multiscale_forward = multiscale_forward
106
+
107
+ # change resize/crop size in preprocessing to the largest image size in s2_scale
108
+ if not delay_load or getattr(args, 'unfreeze_mm_vision_tower', False):
109
+ self.image_processor.size['shortest_edge'] = self.s2_image_size
110
+ self.image_processor.crop_size['height'] = self.image_processor.crop_size['width'] = self.s2_image_size
111
+
112
+ def load_model(self, device_map=None):
113
+ if self.is_loaded:
114
+ print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
115
+ return
116
+
117
+ self.image_processor = SiglipImageProcessor.from_pretrained(self.vision_tower_name)
118
+ self.vision_tower = SiglipVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
119
+ self.vision_tower.requires_grad_(False)
120
+
121
+ self.image_processor.size['shortest_edge'] = self.s2_image_size
122
+ self.image_processor.crop_size['height'] = self.image_processor.crop_size['width'] = self.s2_image_size
123
+
124
+ self.is_loaded = True
125
+
126
+ @torch.no_grad()
127
+ def forward_feature(self, images):
128
+ image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
129
+ image_features = self.feature_select(image_forward_outs).to(images.dtype)
130
+ return image_features
131
+
132
+ @torch.no_grad()
133
+ def forward(self, images):
134
+ if type(images) is list:
135
+ image_features = []
136
+ for image in images:
137
+ image_feature = self.multiscale_forward(self.forward_feature, image.unsqueeze(0), img_sizes=self.s2_scales, max_split_size=self.s2_split_size)
138
+ image_features.append(image_feature)
139
+ else:
140
+ image_features = self.multiscale_forward(self.forward_feature, images, img_sizes=self.s2_scales, max_split_size=self.s2_split_size)
141
+
142
+ return image_features
143
+
144
+ @property
145
+ def hidden_size(self):
146
+ return self.config.hidden_size * len(self.s2_scales)
147
+
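The SigLIP tower mirrors the CLIP one, with only the processor and model classes swapped. For projector sizing it helps to read the geometry off the config; a minimal sketch, assuming the google/siglip-so400m-patch14-384 checkpoint, whose config is expected to report hidden_size 1152, patch size 14 and image size 384, giving 384 // 14 = 27 patches per side and 729 patches in total:

from types import SimpleNamespace
from model.multimodal_encoder.siglip_encoder import SiglipVisionTower

args = SimpleNamespace(mm_vision_select_layer=-2, mm_vision_select_feature="patch")
# delay_load=True only fetches the config, so this stays cheap.
tower = SiglipVisionTower("google/siglip-so400m-patch14-384", args=args, delay_load=True)
print(tower.hidden_size, tower.num_patches_per_side, tower.num_patches)  # 1152 27 729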
model/multimodal_projector/builder.py ADDED
@@ -0,0 +1,51 @@
+ import torch
+ import torch.nn as nn
+ import re
+ 
+ 
+ class IdentityMap(nn.Module):
+     def __init__(self):
+         super().__init__()
+ 
+     def forward(self, x, *args, **kwargs):
+         return x
+ 
+     @property
+     def config(self):
+         return {"mm_projector_type": 'identity'}
+ 
+ 
+ class SimpleResBlock(nn.Module):
+     def __init__(self, channels):
+         super().__init__()
+         self.pre_norm = nn.LayerNorm(channels)
+ 
+         self.proj = nn.Sequential(
+             nn.Linear(channels, channels),
+             nn.GELU(),
+             nn.Linear(channels, channels)
+         )
+ 
+     def forward(self, x):
+         x = self.pre_norm(x)
+         return x + self.proj(x)
+ 
+ 
+ def build_vision_projector(config, delay_load=False, **kwargs):
+     projector_type = getattr(config, 'mm_projector_type', 'linear')
+ 
+     if projector_type == 'linear':
+         return nn.Linear(config.mm_hidden_size, config.hidden_size)
+ 
+     mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
+     if mlp_gelu_match:
+         mlp_depth = int(mlp_gelu_match.group(1))
+         modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
+         for _ in range(1, mlp_depth):
+             modules.append(nn.GELU())
+             modules.append(nn.Linear(config.hidden_size, config.hidden_size))
+         return nn.Sequential(*modules)
+ 
+     if projector_type == 'identity':
+         return IdentityMap()
+ 
+     raise ValueError(f'Unknown projector type: {projector_type}')
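A minimal sketch of building the mlp2x_gelu projector (the hidden sizes below are illustrative) and checking that it maps vision features into the LLM embedding width:

import torch
from types import SimpleNamespace
from model.multimodal_projector.builder import build_vision_projector

cfg = SimpleNamespace(mm_projector_type="mlp2x_gelu", mm_hidden_size=1152, hidden_size=4096)
projector = build_vision_projector(cfg)          # Linear(1152->4096) + GELU + Linear(4096->4096)
dummy = torch.randn(1, 729, cfg.mm_hidden_size)  # one image worth of patch features
print(projector(dummy).shape)                    # torch.Size([1, 729, 4096])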
model/utils.py ADDED
@@ -0,0 +1,20 @@
+ from transformers import AutoConfig
+ 
+ 
+ def auto_upgrade(config):
+     cfg = AutoConfig.from_pretrained(config)
+     if 'llava' in config and 'llava' not in cfg.model_type:
+         assert cfg.model_type == 'llama'
+         print("You are using the newer LLaVA code base, while the v0 checkpoint is from the older code base.")
+         print("You must upgrade the checkpoint to the new code base (this can be done automatically).")
+         confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N] ")
+         if confirm.lower() in ["y", "yes"]:
+             print("Upgrading checkpoint...")
+             assert len(cfg.architectures) == 1
+             setattr(cfg.__class__, "model_type", "llava")
+             cfg.architectures[0] = 'LlavaLlamaForCausalLM'
+             cfg.save_pretrained(config)
+             print("Checkpoint upgraded.")
+         else:
+             print("Checkpoint upgrade aborted.")
+             exit(1)
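For completeness, a hedged sketch of how auto_upgrade is meant to be invoked before loading an old v0 checkpoint; the path is a placeholder, and the call rewrites config.json in place after interactive confirmation:

from model.utils import auto_upgrade

# Prompts on the console, then sets model_type to "llava" and saves the config back to the folder.
auto_upgrade("/path/to/old-llava-v0-checkpoint")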
requirements.txt ADDED
@@ -0,0 +1,31 @@
+ numpy==1.26.4
+ scikit-learn==1.2.2
+ 
+ # HuggingFace ecosystem
+ transformers==4.47.0
+ tokenizers==0.21
+ sentencepiece==0.1.99
+ accelerate==0.27.2
+ datasets==2.15.0
+ peft==0.12.0
+ huggingface_hub>=0.25.2
+ 
+ # Additional ML libraries
+ bitsandbytes==0.43.3
+ timm==0.6.13
+ einops==0.6.1
+ einops-exts==0.0.4
+ 
+ # Utilities
+ shortuuid==1.0.13
+ pydantic==2.8.2
+ markdown2[all]
+ 
+ # Web framework and API
+ gradio==5.1.0
+ gradio_client
+ fastapi
+ uvicorn
+ requests==2.32.3
+ httpx==0.27.2
torch_requirements.txt ADDED
@@ -0,0 +1,9 @@
+ # Core dependencies
+ setuptools>=45.0
+ wheel>=0.36.2
+ ninja>=1.10.0
+ packaging>=20.0
+ 
+ # PyTorch
+ torch==2.1.2
+ torchvision==0.16.2
utils.py ADDED
@@ -0,0 +1,125 @@
1
+ import logging
2
+ import logging.handlers
3
+ import json
+ import os
4
+ import sys
5
+
6
+ import requests
7
+
8
+ from constants import LOGDIR
9
+
10
+ server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
11
+ moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN."
12
+
13
+ handler = None
14
+
15
+
16
+ def build_logger(logger_name, logger_filename):
17
+ global handler
18
+
19
+ formatter = logging.Formatter(
20
+ fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
21
+ datefmt="%Y-%m-%d %H:%M:%S",
22
+ )
23
+
24
+ # Set the format of root handlers
25
+ if not logging.getLogger().handlers:
26
+ logging.basicConfig(level=logging.INFO)
27
+ logging.getLogger().handlers[0].setFormatter(formatter)
28
+
29
+ # Redirect stdout and stderr to loggers
30
+ stdout_logger = logging.getLogger("stdout")
31
+ stdout_logger.setLevel(logging.INFO)
32
+ sl = StreamToLogger(stdout_logger, logging.INFO)
33
+ sys.stdout = sl
34
+
35
+ stderr_logger = logging.getLogger("stderr")
36
+ stderr_logger.setLevel(logging.ERROR)
37
+ sl = StreamToLogger(stderr_logger, logging.ERROR)
38
+ sys.stderr = sl
39
+
40
+ # Get logger
41
+ logger = logging.getLogger(logger_name)
42
+ logger.setLevel(logging.INFO)
43
+
44
+ # Add a file handler for all loggers
45
+ if handler is None:
46
+ os.makedirs(LOGDIR, exist_ok=True)
47
+ filename = os.path.join(LOGDIR, logger_filename)
48
+ handler = logging.handlers.TimedRotatingFileHandler(
49
+ filename, when='D', utc=True, encoding='UTF-8')
50
+ handler.setFormatter(formatter)
51
+
52
+ for name, item in logging.root.manager.loggerDict.items():
53
+ if isinstance(item, logging.Logger):
54
+ item.addHandler(handler)
55
+
56
+ return logger
57
+
58
+
59
+ class StreamToLogger(object):
60
+ """
61
+ Fake file-like stream object that redirects writes to a logger instance.
62
+ """
63
+ def __init__(self, logger, log_level=logging.INFO):
64
+ self.terminal = sys.stdout
65
+ self.logger = logger
66
+ self.log_level = log_level
67
+ self.linebuf = ''
68
+
69
+ def __getattr__(self, attr):
70
+ return getattr(self.terminal, attr)
71
+
72
+ def write(self, buf):
73
+ temp_linebuf = self.linebuf + buf
74
+ self.linebuf = ''
75
+ for line in temp_linebuf.splitlines(True):
76
+ # From the io.TextIOWrapper docs:
77
+ # On output, if newline is None, any '\n' characters written
78
+ # are translated to the system default line separator.
79
+ # By default sys.stdout.write() expects '\n' newlines and then
80
+ # translates them so this is still cross platform.
81
+ if line[-1] == '\n':
82
+ self.logger.log(self.log_level, line.rstrip())
83
+ else:
84
+ self.linebuf += line
85
+
86
+ def flush(self):
87
+ if self.linebuf != '':
88
+ self.logger.log(self.log_level, self.linebuf.rstrip())
89
+ self.linebuf = ''
90
+
91
+
92
+ def disable_torch_init():
93
+ """
94
+ Disable the redundant torch default initialization to accelerate model creation.
95
+ """
96
+ import torch
97
+ setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
98
+ setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)
99
+
100
+
101
+ def violates_moderation(text):
102
+ """
103
+ Check whether the text violates OpenAI moderation API.
104
+ """
105
+ url = "https://api.openai.com/v1/moderations"
106
+ headers = {"Content-Type": "application/json",
107
+ "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]}
108
+ text = text.replace("\n", "")
109
+ data = "{" + '"input": ' + f'"{text}"' + "}"
110
+ data = data.encode("utf-8")
111
+ try:
112
+ ret = requests.post(url, headers=headers, data=data, timeout=5)
113
+ flagged = ret.json()["results"][0]["flagged"]
114
+ except requests.exceptions.RequestException as e:
115
+ flagged = False
116
+ except KeyError as e:
117
+ flagged = False
118
+
119
+ return flagged
120
+
121
+
122
+ def pretty_print_semaphore(semaphore):
123
+ if semaphore is None:
124
+ return "None"
125
+ return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})"
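A minimal sketch of wiring these helpers together in an entry point (logger and file names are illustrative). After build_logger runs, plain print output also lands in the log because stdout and stderr are replaced with StreamToLogger instances:

import asyncio
from utils import build_logger, pretty_print_semaphore

logger = build_logger("maya_demo", "maya_demo.log")
logger.info("demo starting")
print("this also lands in the log file via StreamToLogger")

sem = asyncio.Semaphore(2)
logger.info(pretty_print_semaphore(sem))  # Semaphore(value=2, locked=False)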