Upload 22 files
- app.py +227 -0
- asian_food.jpg +0 -0
- constants.py +13 -0
- conversation.py +409 -0
- eval_utils.py +227 -0
- hawaii.jpg +0 -0
- mm_utils.py +247 -0
- model/___init__.py +7 -0
- model/apply_delta.py +48 -0
- model/builder.py +192 -0
- model/consolidate.py +29 -0
- model/language_model/llava_cohere.py +144 -0
- model/llava_arch.py +371 -0
- model/make_delta.py +52 -0
- model/multimodal_encoder/builder.py +16 -0
- model/multimodal_encoder/clip_encoder.py +147 -0
- model/multimodal_encoder/siglip_encoder.py +147 -0
- model/multimodal_projector/builder.py +51 -0
- model/utils.py +20 -0
- requirements.txt +31 -0
- torch_requirements.txt +9 -0
- utils.py +125 -0
app.py
ADDED
@@ -0,0 +1,227 @@
import gradio as gr
import spaces
import time
import os
import torch
from PIL import Image
from threading import Thread
from transformers import TextIteratorStreamer, AutoConfig, AutoModelForCausalLM
from constants import (
    IMAGE_TOKEN_INDEX,
    DEFAULT_IMAGE_TOKEN,
    DEFAULT_IM_START_TOKEN,
    DEFAULT_IM_END_TOKEN,
)
from conversation import conv_templates
from eval_utils import load_maya_model
from utils import disable_torch_init
from mm_utils import tokenizer_image_token, process_images
from huggingface_hub._login import _login

# Import LLaVA modules to register model types
from model import *
from model.language_model.llava_cohere import LlavaCohereForCausalLM, LlavaCohereConfig

# Register model type and config
AutoConfig.register("llava_cohere", LlavaCohereConfig)
AutoModelForCausalLM.register(LlavaCohereConfig, LlavaCohereForCausalLM)

hf_token = os.getenv("hf_token")
_login(token=hf_token, add_to_git_credential=False)

# Global Variables
MODEL_BASE = "CohereForAI/aya-23-8B"
MODEL_PATH = "maya-multimodal/maya"
MODE = "finetuned"

def load_model():
    """Load the Maya model and required components"""
    model, tokenizer, image_processor, _ = load_maya_model(
        MODEL_BASE, MODEL_PATH, None, MODE
    )
    model = model.cuda()
    model.eval()
    return model, tokenizer, image_processor

# Load model globally
print("Loading model...")
model, tokenizer, image_processor = load_model()
print("Model loaded successfully!")

def validate_image_file(image_path):
    """Validate that the image file exists and is in a supported format."""
    if not os.path.isfile(image_path):
        raise gr.Error(f"Error: File {image_path} does not exist.")

    try:
        with Image.open(image_path) as img:
            img.verify()
        return True
    except (IOError, SyntaxError) as e:
        raise gr.Error(f"Error: {image_path} is not a valid image file. {e}")

@spaces.GPU
def process_chat_stream(message, history):
    print(message)
    print("History:", history)
    image = None  # Initialize image variable first

    # First try to get image from current message
    if message.get("files", []):
        current_files = message["files"]
        if current_files:
            last_file = current_files[-1]
            image = last_file["path"] if isinstance(last_file, dict) else last_file

    # If no image in current message, try to get from history
    if image is None and history:
        for hist in reversed(history):
            print("Processing history item:", hist)
            if isinstance(hist["content"], tuple):
                image = hist["content"][0]
                break
            elif isinstance(hist["content"], dict) and hist["content"].get("files"):
                hist_files = hist["content"]["files"]
                if hist_files:
                    first_file = hist_files[0]
                    image = first_file["path"] if isinstance(first_file, dict) else first_file
                    break

    # Check if we found an image
    if image is None:
        raise gr.Error("Please upload an image to start the conversation.")

    # Validate and process image
    validate_image_file(image)
    image = Image.open(image).convert("RGB")

    # Process image for the model
    image_tensor = process_images([image], image_processor, model.config)
    if image_tensor is None:
        raise gr.Error("Failed to process image")

    image_tensor = image_tensor.cuda()

    # Prepare conversation
    conv = conv_templates["aya"].copy()

    # Add conversation history
    for hist in history:
        # Handle user messages
        if hist["role"] == "user":
            # Extract text content based on format
            if isinstance(hist["content"], str):
                human_text = hist["content"]
            elif isinstance(hist["content"], tuple):
                human_text = hist["content"][1] if len(hist["content"]) > 1 else ""
            else:
                human_text = hist["content"]
            conv.append_message(conv.roles[0], human_text)

        # Handle assistant messages
        elif hist["role"] == "assistant":
            conv.append_message(conv.roles[1], hist["content"])

    # Format current message with proper image token placement
    current_message = message["text"]
    if not history:
        if model.config.mm_use_im_start_end:
            current_message = f"{DEFAULT_IM_START_TOKEN}{DEFAULT_IMAGE_TOKEN}{DEFAULT_IM_END_TOKEN}\n{current_message}"
        else:
            current_message = f"{DEFAULT_IMAGE_TOKEN}\n{current_message}"

    # Add current message to conversation
    conv.append_message(conv.roles[0], current_message)
    conv.append_message(conv.roles[1], None)

    # Get prompt and ensure input_ids are properly created
    prompt = conv.get_prompt()
    # print("PROMPT: ", prompt)

    try:
        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
        if input_ids is None:
            raise ValueError("Tokenization returned None")

        # Ensure input_ids is 2D tensor
        if len(input_ids.shape) == 1:
            input_ids = input_ids.unsqueeze(0)
        input_ids = input_ids.cuda()

        # Validate vision tower and image tensor before starting generation
        if not hasattr(model, 'get_vision_tower') or model.get_vision_tower() is None:
            raise ValueError("Model's vision tower is not properly initialized")

        if image_tensor is None:
            raise ValueError("Image tensor is None")

        # Setup streamer and generation
        streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True)

        generation_kwargs = {
            "inputs": input_ids,
            "images": image_tensor,
            "image_sizes": [image.size],
            "streamer": streamer,
            "temperature": 0.3,
            "do_sample": True,
            "top_p": 0.9,
            "num_beams": 1,
            "max_new_tokens": 4096,
            "use_cache": True
        }

        def generate_with_error_handling():
            try:
                model.generate(**generation_kwargs)
            except Exception as e:
                import traceback
                error_msg = f"Generation error: {str(e)}\nTraceback:\n{''.join(traceback.format_exc())}"
                raise gr.Error(error_msg)

        thread = Thread(target=generate_with_error_handling)
        thread.start()

    except Exception as e:
        error_msg = f"Setup error: {str(e)}"
        import traceback
        error_msg += f"\nTraceback:\n{''.join(traceback.format_exc())}"
        raise gr.Error(error_msg)

    partial_message = ""
    for new_token in streamer:
        partial_message += new_token
        time.sleep(0.1)
        yield {"role": "assistant", "content": partial_message}


# Create Gradio interface
chatbot = gr.Chatbot(
    show_label=False,
    height=450,
    show_share_button=False,
    show_copy_button=False,
    avatar_images=None,
    container=True,
    render_markdown=True,
    scale=1,
    type="messages"
)
chat_input = gr.MultimodalTextbox(interactive=True, file_types=["image"], placeholder="Enter message or upload file...", show_label=False)
with gr.Blocks(fill_height=True) as demo:
    gr.ChatInterface(
        fn=process_chat_stream,
        title="Maya: Multilingual Multimodal Model",
        examples=[{"text": "Describe this photo in detail.", "files": ["./asian_food.jpg"]},
                  {"text": "What is the name of this famous sight in the photo?", "files": ["./hawaii.jpg"]}],
        description="Upload an image and start chatting about it, or simply try one of the examples below. If you don't upload an image, you will receive an error. [Read the research paper](https://huggingface.co/papers/2412.07112)\n\nTeam 💚 Maya",
        stop_btn="Stop Generation",
        multimodal=True,
        textbox=chat_input,
        chatbot=chatbot,
    )

if __name__ == "__main__":
    demo.queue(api_open=False)
    demo.launch(show_api=False, share=False)
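The streaming behaviour in app.py comes from running model.generate in a worker thread while the main thread drains a TextIteratorStreamer. Below is a minimal, self-contained sketch of that same pattern using a tiny text-only placeholder model ("sshleifer/tiny-gpt2", chosen only so the loop can be exercised on CPU); it is not the Maya pipeline itself, just the threading/streaming idiom.

# Sketch of the TextIteratorStreamer + Thread pattern used by process_chat_stream.
# The model name is an illustrative placeholder, not the Maya checkpoint.
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tok = AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2")
lm = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")

inputs = tok("Maya is a multilingual multimodal model that", return_tensors="pt")
streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)

# generate() blocks, so it runs in a background thread while the caller
# consumes decoded text pieces from the streamer as they arrive.
thread = Thread(target=lm.generate, kwargs=dict(**inputs, streamer=streamer, max_new_tokens=20))
thread.start()

partial = ""
for piece in streamer:
    partial += piece
    print(partial)  # app.py yields this growing string to gr.ChatInterface instead
thread.join()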
asian_food.jpg
ADDED
constants.py
ADDED
@@ -0,0 +1,13 @@
CONTROLLER_HEART_BEAT_EXPIRATION = 30
WORKER_HEART_BEAT_INTERVAL = 15

LOGDIR = "."

# Model Constants
IGNORE_INDEX = -100
IMAGE_TOKEN_INDEX = -200
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
IMAGE_PLACEHOLDER = "<image-placeholder>"
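For orientation, this is how the image-token constants fit together in this repo: the textual "<image>" placeholder is spliced into the prompt (as app.py does for the first turn), and tokenizer_image_token in mm_utils.py later replaces it with the sentinel id IMAGE_TOKEN_INDEX. A small illustrative-only sketch, with use_im_start_end standing in for model.config.mm_use_im_start_end:

from constants import DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN

question = "Describe this photo in detail."
use_im_start_end = False  # mirrors model.config.mm_use_im_start_end in app.py

if use_im_start_end:
    prompt = f"{DEFAULT_IM_START_TOKEN}{DEFAULT_IMAGE_TOKEN}{DEFAULT_IM_END_TOKEN}\n{question}"
else:
    prompt = f"{DEFAULT_IMAGE_TOKEN}\n{question}"

print(prompt)  # "<image>\nDescribe this photo in detail."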
conversation.py
ADDED
@@ -0,0 +1,409 @@
import dataclasses
from enum import auto, Enum
from typing import List, Tuple
import base64
from io import BytesIO
from PIL import Image


class SeparatorStyle(Enum):
    """Different separator style."""
    SINGLE = auto()
    TWO = auto()
    MPT = auto()
    PLAIN = auto()
    LLAMA_2 = auto()


@dataclasses.dataclass
class Conversation:
    """A class that keeps all conversation history."""
    system: str
    roles: List[str]
    messages: List[List[str]]
    offset: int
    sep_style: SeparatorStyle = SeparatorStyle.SINGLE
    sep: str = "###"
    sep2: str = None
    version: str = "Unknown"

    skip_next: bool = False

    def get_prompt(self):
        messages = self.messages
        if len(messages) > 0 and type(messages[0][1]) is tuple:
            messages = self.messages.copy()
            init_role, init_msg = messages[0].copy()
            init_msg = init_msg[0].replace("<image>", "").strip()
            if 'mmtag' in self.version:
                messages[0] = (init_role, init_msg)
                messages.insert(0, (self.roles[0], "<Image><image></Image>"))
                messages.insert(1, (self.roles[1], "Received."))
            else:
                messages[0] = (init_role, "<image>\n" + init_msg)

        if self.sep_style == SeparatorStyle.SINGLE:
            ret = self.system + self.sep
            for role, message in messages:
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += role + ": " + message + self.sep
                else:
                    ret += role + ":"
        elif self.sep_style == SeparatorStyle.TWO:
            seps = [self.sep, self.sep2]
            ret = self.system + seps[0]
            for i, (role, message) in enumerate(messages):
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += role + ": " + message + seps[i % 2]
                else:
                    ret += role + ":"
        elif self.sep_style == SeparatorStyle.MPT:
            ret = self.system + self.sep
            for role, message in messages:
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += role + message + self.sep
                else:
                    ret += role
        elif self.sep_style == SeparatorStyle.LLAMA_2:
            wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n" if len(msg) > 0 else msg
            wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
            ret = ""

            for i, (role, message) in enumerate(messages):
                if i == 0:
                    assert message, "first message should not be none"
                    assert role == self.roles[0], "first message should come from user"
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    if i == 0: message = wrap_sys(self.system) + message
                    if i % 2 == 0:
                        message = wrap_inst(message)
                        ret += self.sep + message
                    else:
                        ret += " " + message + " " + self.sep2
                else:
                    ret += ""
            ret = ret.lstrip(self.sep)
        elif self.sep_style == SeparatorStyle.PLAIN:
            seps = [self.sep, self.sep2]
            ret = self.system
            for i, (role, message) in enumerate(messages):
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += message + seps[i % 2]
                else:
                    ret += ""
        else:
            raise ValueError(f"Invalid style: {self.sep_style}")

        return ret

    def append_message(self, role, message):
        self.messages.append([role, message])

    def process_image(self, image, image_process_mode, return_pil=False, image_format='PNG', max_len=1344, min_len=672):
        if image_process_mode == "Pad":
            def expand2square(pil_img, background_color=(122, 116, 104)):
                width, height = pil_img.size
                if width == height:
                    return pil_img
                elif width > height:
                    result = Image.new(pil_img.mode, (width, width), background_color)
                    result.paste(pil_img, (0, (width - height) // 2))
                    return result
                else:
                    result = Image.new(pil_img.mode, (height, height), background_color)
                    result.paste(pil_img, ((height - width) // 2, 0))
                    return result
            image = expand2square(image)
        elif image_process_mode in ["Default", "Crop"]:
            pass
        elif image_process_mode == "Resize":
            image = image.resize((336, 336))
        else:
            raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
        if max(image.size) > max_len:
            max_hw, min_hw = max(image.size), min(image.size)
            aspect_ratio = max_hw / min_hw
            shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
            longest_edge = int(shortest_edge * aspect_ratio)
            W, H = image.size
            if H > W:
                H, W = longest_edge, shortest_edge
            else:
                H, W = shortest_edge, longest_edge
            image = image.resize((W, H))
        if return_pil:
            return image
        else:
            buffered = BytesIO()
            image.save(buffered, format=image_format)
            img_b64_str = base64.b64encode(buffered.getvalue()).decode()
            return img_b64_str

    def get_images(self, return_pil=False):
        images = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                if type(msg) is tuple:
                    msg, image, image_process_mode = msg
                    image = self.process_image(image, image_process_mode, return_pil=return_pil)
                    images.append(image)
        return images

    def to_gradio_chatbot(self):
        ret = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                if type(msg) is tuple:
                    msg, image, image_process_mode = msg
                    img_b64_str = self.process_image(
                        image, "Default", return_pil=False,
                        image_format='JPEG')
                    img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
                    msg = img_str + msg.replace('<image>', '').strip()
                    ret.append([msg, None])
                else:
                    ret.append([msg, None])
            else:
                ret[-1][-1] = msg
        return ret

    def copy(self):
        return Conversation(
            system=self.system,
            roles=self.roles,
            messages=[[x, y] for x, y in self.messages],
            offset=self.offset,
            sep_style=self.sep_style,
            sep=self.sep,
            sep2=self.sep2,
            version=self.version)

    def dict(self):
        if len(self.get_images()) > 0:
            return {
                "system": self.system,
                "roles": self.roles,
                "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages],
                "offset": self.offset,
                "sep": self.sep,
                "sep2": self.sep2,
            }
        return {
            "system": self.system,
            "roles": self.roles,
            "messages": self.messages,
            "offset": self.offset,
            "sep": self.sep,
            "sep2": self.sep2,
        }


conv_vicuna_v0 = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("Human", "Assistant"),
    messages=(
        ("Human", "What are the key differences between renewable and non-renewable energy sources?"),
        ("Assistant",
         "Renewable energy sources are those that can be replenished naturally in a relatively "
         "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
         "Non-renewable energy sources, on the other hand, are finite and will eventually be "
         "depleted, such as coal, oil, and natural gas. Here are some key differences between "
         "renewable and non-renewable energy sources:\n"
         "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
         "energy sources are finite and will eventually run out.\n"
         "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
         "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
         "and other negative effects.\n"
         "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
         "have lower operational costs than non-renewable sources.\n"
         "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
         "locations than non-renewable sources.\n"
         "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
         "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
         "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
         "non-renewable sources are not, and their depletion can lead to economic and social instability.\n")
    ),
    offset=2,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
)

conv_vicuna_v1 = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the user's questions.",
    roles=("USER", "ASSISTANT"),
    version="v1",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
)

conv_llama_2 = Conversation(
    system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""",
    roles=("USER", "ASSISTANT"),
    version="llama_v2",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.LLAMA_2,
    sep="<s>",
    sep2="</s>",
)

conv_llava_llama_2 = Conversation(
    system="You are a helpful language and vision assistant. "
           "You are able to understand the visual content that the user provides, "
           "and assist the user with a variety of tasks using natural language.",
    roles=("USER", "ASSISTANT"),
    version="llama_v2",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.LLAMA_2,
    sep="<s>",
    sep2="</s>",
)

conv_mpt = Conversation(
    system="""<|im_start|>system
A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
    version="mpt",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.MPT,
    sep="<|im_end|>",
)

conv_llava_plain = Conversation(
    system="",
    roles=("", ""),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.PLAIN,
    sep="\n",
)

conv_llava_v0 = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("Human", "Assistant"),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
)

conv_llava_v0_mmtag = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
           "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
           "The visual content will be provided with the following format: <Image>visual content</Image>.",
    roles=("Human", "Assistant"),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
    version="v0_mmtag",
)

conv_llava_v1 = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("USER", "ASSISTANT"),
    version="v1",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
)

conv_llava_v1_mmtag = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
           "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
           "The visual content will be provided with the following format: <Image>visual content</Image>.",
    roles=("USER", "ASSISTANT"),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
    version="v1_mmtag",
)

conv_mistral_instruct = Conversation(
    system="",
    roles=("USER", "ASSISTANT"),
    version="llama_v2",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.LLAMA_2,
    sep="",
    sep2="</s>",
)

conv_chatml_direct = Conversation(
    system="""<|im_start|>system
Answer the questions.""",
    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
    version="mpt",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.MPT,
    sep="<|im_end|>",
)

conv_aya = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the user's questions.",
    roles=("USER", "ASSISTANT"),
    version="v1",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="<|END_OF_TURN_TOKEN|>",
)

default_conversation = conv_vicuna_v1
conv_templates = {
    "default": conv_vicuna_v0,
    "v0": conv_vicuna_v0,
    "v1": conv_vicuna_v1,
    "vicuna_v1": conv_vicuna_v1,
    "llama_2": conv_llama_2,
    "mistral_instruct": conv_mistral_instruct,
    "chatml_direct": conv_chatml_direct,
    "mistral_direct": conv_chatml_direct,

    "plain": conv_llava_plain,
    "v0_plain": conv_llava_plain,
    "llava_v0": conv_llava_v0,
    "v0_mmtag": conv_llava_v0_mmtag,
    "llava_v1": conv_llava_v1,
    "v1_mmtag": conv_llava_v1_mmtag,
    "llava_llama_2": conv_llava_llama_2,

    "mpt": conv_mpt,
    "aya": conv_aya
}


if __name__ == "__main__":
    print(default_conversation.get_prompt())
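A short usage sketch of the "aya" template that app.py relies on: it uses SeparatorStyle.TWO, so user turns are closed with a single space and assistant turns with Cohere's "<|END_OF_TURN_TOKEN|>", and passing None as the assistant message leaves the prompt open for generation.

from conversation import conv_templates

conv = conv_templates["aya"].copy()
conv.append_message(conv.roles[0], "<image>\nWhat is shown in this picture?")
conv.append_message(conv.roles[1], None)  # open slot: prompt ends with "ASSISTANT:"
print(conv.get_prompt())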
eval_utils.py
ADDED
@@ -0,0 +1,227 @@
'''
Cherry picked from Roshan's PR https://github.com/nahidalam/LLaVA/blob/1ecc141d7f20f16518f38a0d99320268305c17c3/llava/eval/maya/eval_utils.py
'''

import os
import sys
import torch
import requests
from io import BytesIO
from PIL import Image


from transformers import AutoTokenizer, AutoConfig, TextStreamer
from transformers.models.cohere.tokenization_cohere_fast import CohereTokenizerFast
from model.language_model.llava_cohere import LlavaCohereForCausalLM, LlavaCohereConfig
from constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_IMAGE_PATCH_TOKEN
from conversation import conv_templates, SeparatorStyle
from mm_utils import process_images, tokenizer_image_token, get_model_name_from_path

from typing import Optional, Literal


def load_maya_model(model_base: str, model_path: str, projector_path: Optional[str] = None, mode: Literal['pretrained', 'finetuned'] = 'finetuned'):

    """Helper that loads a trained Maya model.

    A trained Maya model comes in two flavors:
        1. Pretrained: the model has only gone through pretraining, so the changes are restricted to the projector layer.
        2. Finetuned: the model has gone through instruction finetuning after the pretraining stage, which affects the whole model.

    This is a replication of the load_pretrained_model function from llava.model.builder that is specific to Cohere/Maya.

    Args:
        model_base: Path of the base LLM in HF. Eg: 'CohereForAI/aya-23-8B', 'meta-llama/Meta-Llama-3-8B-Instruct'.
                    Used to instantiate the tokenizer and, when loading the pretrained flavor, the model itself.
        model_path: Path of the trained model repo in HF. Eg: 'nahidalam/Maya'.
                    Used to load the config file, so this path/directory must contain config.json.
                    For the finetuned model, this is also where the final model weights are loaded from.
        projector_path: For the pretrained model, the local directory which holds the mm_projector.bin file.
        mode: Specifies whether this loads a pretrained-only model or a finetuned model.

    Returns:
        model: LlavaCohereForCausalLM object
        tokenizer: CohereTokenizerFast object
        image_processor: image processor taken from the model's vision tower
        context_len: maximum sequence length to use for generation
    """

    device_map = 'auto'
    kwargs = {"device_map": device_map}
    kwargs['torch_dtype'] = torch.float32
    # kwargs['attn_implementation'] = 'flash_attention_2'

    ## Instantiating tokenizer and model base
    tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=True)
    cfg_pretrained = LlavaCohereConfig.from_pretrained(model_path)

    if mode == 'pretrained':
        model = LlavaCohereForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)

        ## Loading projector layer weights
        mm_projector_weights = torch.load(projector_path, map_location='cpu')
        mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()}
        model.load_state_dict(mm_projector_weights, strict=False)
    else:
        # Load model with ignore_mismatched_sizes to handle vision tower weights
        model = LlavaCohereForCausalLM.from_pretrained(
            model_path,
            config=cfg_pretrained,
            ignore_mismatched_sizes=True,  # Add this to handle vision tower weights
            **kwargs
        )

    ## Loading image processor
    image_processor = None

    mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
    mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
    if mm_use_im_patch_token:
        tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
    if mm_use_im_start_end:
        tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
    model.resize_token_embeddings(len(tokenizer))

    # Get and load vision tower
    vision_tower = model.get_vision_tower()
    if vision_tower is None:
        raise ValueError("Vision tower not found in model config")

    print(f"Loading vision tower... Is loaded: {vision_tower.is_loaded}")
    if not vision_tower.is_loaded:
        try:
            vision_tower.load_model()
            print("Vision tower loaded successfully")
        except Exception as e:
            print(f"Error loading vision tower: {str(e)}")
            raise

    if device_map != 'auto':
        vision_tower.to(device=device_map, dtype=torch.float16)
    image_processor = vision_tower.image_processor

    if hasattr(model.config, "max_sequence_length"):
        context_len = model.config.max_sequence_length
    else:
        context_len = 2048

    # maya = MayaModel(model, tokenizer, image_processor, context_len)

    return model, tokenizer, image_processor, context_len


class MayaModel(object):

    def __init__(self, model: LlavaCohereForCausalLM, tokenizer: CohereTokenizerFast, image_processor, context_length):
        self.model = model
        self.tokenizer = tokenizer
        self.image_processor = image_processor
        self.context_length = context_length

    def validate_inputs(self):
        """
        Method to validate the inputs
        """
        pass


def load_image(image_input):
    """
    Convert various image inputs to a PIL Image object.

    :param image_input: Can be a URL string, a file path string, or image bytes
    :return: PIL Image object
    """
    try:
        if isinstance(image_input, str):
            if image_input.startswith(('http://', 'https://')):
                # Input is a URL
                response = requests.get(image_input)
                response.raise_for_status()  # Raise an exception for bad responses
                return Image.open(BytesIO(response.content))
            elif os.path.isfile(image_input):
                # Input is a file path
                return Image.open(image_input)
            else:
                raise ValueError("Invalid input: string is neither a valid URL nor a file path")
        elif isinstance(image_input, bytes):
            # Input is bytes
            return Image.open(BytesIO(image_input))
        else:
            raise ValueError("Invalid input type. Expected URL string, file path string, or bytes.")
    except requests.RequestException as e:
        raise ValueError(f"Error fetching image from URL: {e}")
    except IOError as e:
        raise ValueError(f"Error opening image file: {e}")
    except Exception as e:
        raise ValueError(f"An unexpected error occurred: {e}")


def get_single_sample_prediction(maya_model, image_file, user_question, temperature=0.0, max_new_tokens=100, conv_mode='aya'):
    """Generates the prediction for a single image-user question pair.

    Args:
        maya_model (MayaModel): Trained Maya model
        image_file: One of the following: online image URL, local image path, or image bytes
        user_question (str): Question to be shared with the LLM
        temperature (float, optional): Temperature param for the LLM. Defaults to 0.0.
        max_new_tokens (int, optional): Max number of new tokens generated. Defaults to 100.
        conv_mode (str, optional): Conversation template to be used. Defaults to 'aya'.

    Returns:
        output (str): Model's response to the user question
    """

    conv = conv_templates[conv_mode].copy()
    roles = conv.roles
    model = maya_model.model
    tokenizer = maya_model.tokenizer
    image_processor = maya_model.image_processor

    image = load_image(image_file)
    image_size = image.size

    image_tensor = process_images([image], image_processor, model.config)
    if type(image_tensor) is list:
        image_tensor = [image.to(model.device, dtype=torch.float16) for image in image_tensor]
    else:
        image_tensor = image_tensor.to(model.device, dtype=torch.float16)

    inp = user_question

    if image is not None:
        # first message
        if model.config.mm_use_im_start_end:
            inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + inp
        else:
            inp = DEFAULT_IMAGE_TOKEN + '\n' + inp
        # image = None

    conv.append_message(conv.roles[0], inp)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(model.device)
    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
    keywords = [stop_str]
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=image_tensor,
            image_sizes=[image_size],
            do_sample=True if temperature > 0 else False,
            temperature=temperature,
            max_new_tokens=max_new_tokens,
            streamer=streamer,
            use_cache=True)

    outputs = tokenizer.decode(output_ids[0]).strip()

    return outputs
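Putting the pieces together, a hedged end-to-end sketch of single-image inference with these helpers might look like the following. The repo ids mirror the ones hard-coded in app.py and the image/question come from the Space's examples; it assumes the checkpoints are accessible and enough GPU memory is available.

from eval_utils import load_maya_model, MayaModel, get_single_sample_prediction

model, tokenizer, image_processor, context_len = load_maya_model(
    model_base="CohereForAI/aya-23-8B",
    model_path="maya-multimodal/maya",
    projector_path=None,
    mode="finetuned",
)
maya = MayaModel(model, tokenizer, image_processor, context_len)

answer = get_single_sample_prediction(
    maya,
    image_file="./hawaii.jpg",
    user_question="What is the name of this famous sight in the photo?",
    temperature=0.0,
    max_new_tokens=100,
)
print(answer)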
hawaii.jpg
ADDED
mm_utils.py
ADDED
@@ -0,0 +1,247 @@
from PIL import Image
from io import BytesIO
import base64
import torch
import math
import ast

from transformers import StoppingCriteria
from constants import IMAGE_TOKEN_INDEX


def select_best_resolution(original_size, possible_resolutions):
    """
    Selects the best resolution from a list of possible resolutions based on the original size.

    Args:
        original_size (tuple): The original size of the image in the format (width, height).
        possible_resolutions (list): A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].

    Returns:
        tuple: The best fit resolution in the format (width, height).
    """
    original_width, original_height = original_size
    best_fit = None
    max_effective_resolution = 0
    min_wasted_resolution = float('inf')

    for width, height in possible_resolutions:
        scale = min(width / original_width, height / original_height)
        downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
        effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
        wasted_resolution = (width * height) - effective_resolution

        if effective_resolution > max_effective_resolution or (effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution):
            max_effective_resolution = effective_resolution
            min_wasted_resolution = wasted_resolution
            best_fit = (width, height)

    return best_fit


def resize_and_pad_image(image, target_resolution):
    """
    Resize and pad an image to a target resolution while maintaining aspect ratio.

    Args:
        image (PIL.Image.Image): The input image.
        target_resolution (tuple): The target resolution (width, height) of the image.

    Returns:
        PIL.Image.Image: The resized and padded image.
    """
    original_width, original_height = image.size
    target_width, target_height = target_resolution

    scale_w = target_width / original_width
    scale_h = target_height / original_height

    if scale_w < scale_h:
        new_width = target_width
        new_height = min(math.ceil(original_height * scale_w), target_height)
    else:
        new_height = target_height
        new_width = min(math.ceil(original_width * scale_h), target_width)

    # Resize the image
    resized_image = image.resize((new_width, new_height))

    new_image = Image.new('RGB', (target_width, target_height), (0, 0, 0))
    paste_x = (target_width - new_width) // 2
    paste_y = (target_height - new_height) // 2
    new_image.paste(resized_image, (paste_x, paste_y))

    return new_image


def divide_to_patches(image, patch_size):
    """
    Divides an image into patches of a specified size.

    Args:
        image (PIL.Image.Image): The input image.
        patch_size (int): The size of each patch.

    Returns:
        list: A list of PIL.Image.Image objects representing the patches.
    """
    patches = []
    width, height = image.size
    for i in range(0, height, patch_size):
        for j in range(0, width, patch_size):
            box = (j, i, j + patch_size, i + patch_size)
            patch = image.crop(box)
            patches.append(patch)

    return patches


def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
    """
    Calculate the shape of the image patch grid after the preprocessing for images of any resolution.

    Args:
        image_size (tuple): The size of the input image in the format (width, height).
        grid_pinpoints (str): A string representation of a list of possible resolutions.
        patch_size (int): The size of each image patch.

    Returns:
        tuple: The shape of the image patch grid in the format (width, height).
    """
    if type(grid_pinpoints) is list:
        possible_resolutions = grid_pinpoints
    else:
        possible_resolutions = ast.literal_eval(grid_pinpoints)
    width, height = select_best_resolution(image_size, possible_resolutions)
    return width // patch_size, height // patch_size


def process_anyres_image(image, processor, grid_pinpoints):
    """
    Process an image with variable resolutions.

    Args:
        image (PIL.Image.Image): The input image to be processed.
        processor: The image processor object.
        grid_pinpoints (str): A string representation of a list of possible resolutions.

    Returns:
        torch.Tensor: A tensor containing the processed image patches.
    """
    if type(grid_pinpoints) is list:
        possible_resolutions = grid_pinpoints
    else:
        possible_resolutions = ast.literal_eval(grid_pinpoints)
    best_resolution = select_best_resolution(image.size, possible_resolutions)
    image_padded = resize_and_pad_image(image, best_resolution)

    patches = divide_to_patches(image_padded, processor.crop_size['height'])

    image_original_resize = image.resize((processor.size['shortest_edge'], processor.size['shortest_edge']))

    image_patches = [image_original_resize] + patches
    image_patches = [processor.preprocess(image_patch, return_tensors='pt')['pixel_values'][0]
                     for image_patch in image_patches]
    return torch.stack(image_patches, dim=0)


def load_image_from_base64(image):
    return Image.open(BytesIO(base64.b64decode(image)))


def expand2square(pil_img, background_color):
    width, height = pil_img.size
    if width == height:
        return pil_img
    elif width > height:
        result = Image.new(pil_img.mode, (width, width), background_color)
        result.paste(pil_img, (0, (width - height) // 2))
        return result
    else:
        result = Image.new(pil_img.mode, (height, height), background_color)
        result.paste(pil_img, ((height - width) // 2, 0))
        return result


def process_images(images, image_processor, model_cfg):
    image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None)
    new_images = []
    if image_aspect_ratio == 'pad':
        for image in images:
            image = expand2square(image, tuple(int(x*255) for x in image_processor.image_mean))
            image = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
            new_images.append(image)
    elif image_aspect_ratio == "anyres":
        for image in images:
            image = process_anyres_image(image, image_processor, model_cfg.image_grid_pinpoints)
            new_images.append(image)
    else:
        return image_processor(images, return_tensors='pt')['pixel_values']
    if all(x.shape == new_images[0].shape for x in new_images):
        new_images = torch.stack(new_images, dim=0)
    return new_images


def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
    prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]

    def insert_separator(X, sep):
        return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]

    input_ids = []
    offset = 0
    if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
        offset = 1
        input_ids.append(prompt_chunks[0][0])

    for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
        input_ids.extend(x[offset:])

    if return_tensors is not None:
        if return_tensors == 'pt':
            return torch.tensor(input_ids, dtype=torch.long)
        raise ValueError(f'Unsupported tensor type: {return_tensors}')
    return input_ids


def get_model_name_from_path(model_path):
    model_path = model_path.strip("/")
    model_paths = model_path.split("/")
    if model_paths[-1].startswith('checkpoint-'):
        return model_paths[-2] + "_" + model_paths[-1]
    else:
        return model_paths[-1]


class KeywordsStoppingCriteria(StoppingCriteria):
    def __init__(self, keywords, tokenizer, input_ids):
        self.keywords = keywords
        self.keyword_ids = []
        self.max_keyword_len = 0
        for keyword in keywords:
            cur_keyword_ids = tokenizer(keyword).input_ids
            if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id:
                cur_keyword_ids = cur_keyword_ids[1:]
            if len(cur_keyword_ids) > self.max_keyword_len:
                self.max_keyword_len = len(cur_keyword_ids)
            self.keyword_ids.append(torch.tensor(cur_keyword_ids))
        self.tokenizer = tokenizer
        self.start_len = input_ids.shape[1]

    def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len)
        self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
        for keyword_id in self.keyword_ids:
            truncated_output_ids = output_ids[0, -keyword_id.shape[0]:]
            if torch.equal(truncated_output_ids, keyword_id):
                return True
        outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
        for keyword in self.keywords:
            if keyword in outputs:
                return True
        return False

    def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        outputs = []
        for i in range(output_ids.shape[0]):
            outputs.append(self.call_for_batch(output_ids[i].unsqueeze(0), scores))
        return all(outputs)
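A quick check of tokenizer_image_token: every "<image>" marker in the prompt becomes the sentinel id IMAGE_TOKEN_INDEX (-200), which the model later swaps for projected vision features. The tokenizer name below is illustrative only; any HF tokenizer demonstrates the splicing the same way.

from transformers import AutoTokenizer
from constants import IMAGE_TOKEN_INDEX
from mm_utils import tokenizer_image_token

tok = AutoTokenizer.from_pretrained("gpt2")  # illustrative stand-in, not Maya's tokenizer
ids = tokenizer_image_token("<image>\nDescribe this photo.", tok, IMAGE_TOKEN_INDEX)
print(ids)                        # plain Python list with a -200 where "<image>" was
print(IMAGE_TOKEN_INDEX in ids)   # True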
model/___init__.py
ADDED
@@ -0,0 +1,7 @@
try:
    from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig
    from .language_model.llava_mpt import LlavaMptForCausalLM, LlavaMptConfig
    from .language_model.llava_mistral import LlavaMistralForCausalLM, LlavaMistralConfig
    from .language_model.llava_cohere import LlavaCohereForCausalLM, LlavaCohereConfig
except:
    pass
model/apply_delta.py
ADDED
@@ -0,0 +1,48 @@
"""
Usage:
python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta
"""
import argparse

import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from llava import LlavaLlamaForCausalLM


def apply_delta(base_model_path, target_model_path, delta_path):
    print("Loading base model")
    base = AutoModelForCausalLM.from_pretrained(
        base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)

    print("Loading delta")
    delta = LlavaLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
    delta_tokenizer = AutoTokenizer.from_pretrained(delta_path)

    print("Applying delta")
    for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"):
        if name not in base.state_dict():
            assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model'
            continue
        if param.data.shape == base.state_dict()[name].shape:
            param.data += base.state_dict()[name]
        else:
            assert name in ['model.embed_tokens.weight', 'lm_head.weight'], \
                f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}'
            bparam = base.state_dict()[name]
            param.data[:bparam.shape[0], :bparam.shape[1]] += bparam

    print("Saving target model")
    delta.save_pretrained(target_model_path)
    delta_tokenizer.save_pretrained(target_model_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-model-path", type=str, required=True)
    parser.add_argument("--target-model-path", type=str, required=True)
    parser.add_argument("--delta-path", type=str, required=True)

    args = parser.parse_args()

    apply_delta(args.base_model_path, args.target_model_path, args.delta_path)
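The docstring's usage line still points at the original fastchat module path. Because the script defines its own argparse entry point, it can presumably also be run directly from this repo; the paths below are placeholders, not values from this commit.

python3 model/apply_delta.py --base-model-path <base-llm-dir> --target-model-path <output-dir> --delta-path <delta-weights-dir>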
model/builder.py
ADDED
@@ -0,0 +1,192 @@
# Copyright 2023 Haotian Liu
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import os
import warnings
import shutil

from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
import torch
from llava.model import *
from llava.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN

# from transformers.models.cohere.tokenization_cohere_fast import CohereTokenizerFast
# from llava.model.language_model.llava_cohere import LlavaCohereForCausalLM, LlavaCohereConfig


def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", device="cuda", use_flash_attn=False, **kwargs):
    kwargs = {"device_map": device_map, **kwargs}

    if device != "cuda":
        kwargs['device_map'] = {"": device}

    if load_8bit:
        kwargs['load_in_8bit'] = True
    elif load_4bit:
        kwargs['load_in_4bit'] = True
        kwargs['quantization_config'] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type='nf4'
        )
    else:
        kwargs['torch_dtype'] = torch.float16

    if use_flash_attn:
        kwargs['attn_implementation'] = 'flash_attention_2'

    if 'llava' in model_name.lower():
        # Load LLaVA model
        if 'lora' in model_name.lower() and model_base is None:
            warnings.warn('There is `lora` in model name but no `model_base` is provided. If you are loading a LoRA model, please provide the `model_base` argument. Detailed instruction: https://github.com/haotian-liu/LLaVA#launch-a-model-worker-lora-weights-unmerged.')
        if 'lora' in model_name.lower() and model_base is not None:
            from llava.model.language_model.llava_llama import LlavaConfig
            lora_cfg_pretrained = LlavaConfig.from_pretrained(model_path)
            tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
            print('Loading LLaVA from base model...')
            model = LlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs)
            token_num, tokem_dim = model.lm_head.out_features, model.lm_head.in_features
            if model.lm_head.weight.shape[0] != token_num:
                model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype))
                model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype))

            print('Loading additional LLaVA weights...')
            if os.path.exists(os.path.join(model_path, 'non_lora_trainables.bin')):
                non_lora_trainables = torch.load(os.path.join(model_path, 'non_lora_trainables.bin'), map_location='cpu')
            else:
                # this is probably from HF Hub
                from huggingface_hub import hf_hub_download
                def load_from_hf(repo_id, filename, subfolder=None):
                    cache_file = hf_hub_download(
                        repo_id=repo_id,
                        filename=filename,
                        subfolder=subfolder)
                    return torch.load(cache_file, map_location='cpu')
                non_lora_trainables = load_from_hf(model_path, 'non_lora_trainables.bin')
            non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()}
            if any(k.startswith('model.model.') for k in non_lora_trainables):
                non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()}
            model.load_state_dict(non_lora_trainables, strict=False)

            from peft import PeftModel
            print('Loading LoRA weights...')
            model = PeftModel.from_pretrained(model, model_path)
            print('Merging LoRA weights...')
            model = model.merge_and_unload()
            print('Model is loaded...')
        elif model_base is not None:
            # this may be mm projector only
            print('Loading LLaVA from base model...')
            if 'mpt' in model_name.lower():
                if not os.path.isfile(os.path.join(model_path, 'configuration_mpt.py')):
                    shutil.copyfile(os.path.join(model_base, 'configuration_mpt.py'), os.path.join(model_path, 'configuration_mpt.py'))
|
97 |
+
tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=True)
|
98 |
+
cfg_pretrained = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
|
99 |
+
model = LlavaMptForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
|
100 |
+
else:
|
101 |
+
tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
|
102 |
+
cfg_pretrained = AutoConfig.from_pretrained(model_path)
|
103 |
+
model = LlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
|
104 |
+
|
105 |
+
mm_projector_weights = torch.load(os.path.join(model_path, 'mm_projector.bin'), map_location='cpu')
|
106 |
+
mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()}
|
107 |
+
model.load_state_dict(mm_projector_weights, strict=False)
|
108 |
+
else:
|
109 |
+
if 'mpt' in model_name.lower():
|
110 |
+
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
|
111 |
+
model = LlavaMptForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
|
112 |
+
elif 'mistral' in model_name.lower():
|
113 |
+
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
114 |
+
model = LlavaMistralForCausalLM.from_pretrained(
|
115 |
+
model_path,
|
116 |
+
low_cpu_mem_usage=True,
|
117 |
+
**kwargs
|
118 |
+
)
|
119 |
+
else:
|
120 |
+
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
|
121 |
+
model = LlavaLlamaForCausalLM.from_pretrained(
|
122 |
+
model_path,
|
123 |
+
low_cpu_mem_usage=True,
|
124 |
+
**kwargs
|
125 |
+
)
|
126 |
+
elif 'aya' in model_name.lower():
|
127 |
+
|
128 |
+
## TO DO : Currently only works for projector pretrained models. Doesnt support PEFT models or models with base LLMs trained
|
129 |
+
tokenizer = AutoTokenizer.from_pretrained(model_base, padding_side="right", use_fast=True)
|
130 |
+
cfg_pretrained = LlavaCohereConfig.from_pretrained(model_path)
|
131 |
+
model = LlavaCohereForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
|
132 |
+
|
133 |
+
## TO DO : Improve the processing/loading/saving of the projector file
|
134 |
+
projector_file_path = os.path.join(os.getcwd(), 'mm_projector.bin')
|
135 |
+
if not os.path.exists(projector_file_path):
|
136 |
+
|
137 |
+
projector_file_link = os.path.join('https://huggingface.co/',model_path,'resolve/main/mm_projector.bin')
|
138 |
+
print(f"Downloading {projector_file_link} ...")
|
139 |
+
os.system(f"wget {projector_file_link}")
|
140 |
+
|
141 |
+
mm_projector_weights = torch.load(projector_file_path, map_location='cpu')
|
142 |
+
mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()}
|
143 |
+
model.load_state_dict(mm_projector_weights, strict=False)
|
144 |
+
|
145 |
+
|
146 |
+
|
147 |
+
else:
|
148 |
+
# Load language model
|
149 |
+
if model_base is not None:
|
150 |
+
# PEFT model
|
151 |
+
from peft import PeftModel
|
152 |
+
tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
|
153 |
+
model = AutoModelForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, **kwargs)
|
154 |
+
print(f"Loading LoRA weights from {model_path}")
|
155 |
+
model = PeftModel.from_pretrained(model, model_path)
|
156 |
+
print(f"Merging weights")
|
157 |
+
model = model.merge_and_unload()
|
158 |
+
print('Convert to FP16...')
|
159 |
+
model.to(torch.float16)
|
160 |
+
else:
|
161 |
+
use_fast = False
|
162 |
+
if 'mpt' in model_name.lower():
|
163 |
+
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
|
164 |
+
model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, trust_remote_code=True, **kwargs)
|
165 |
+
else:
|
166 |
+
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
|
167 |
+
model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
|
168 |
+
|
169 |
+
image_processor = None
|
170 |
+
|
171 |
+
if 'llava' in model_name.lower() or 'aya' in model_name.lower():
|
172 |
+
mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
|
173 |
+
mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
|
174 |
+
if mm_use_im_patch_token:
|
175 |
+
tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
|
176 |
+
if mm_use_im_start_end:
|
177 |
+
tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
|
178 |
+
model.resize_token_embeddings(len(tokenizer))
|
179 |
+
|
180 |
+
vision_tower = model.get_vision_tower()
|
181 |
+
if not vision_tower.is_loaded:
|
182 |
+
vision_tower.load_model(device_map=device_map)
|
183 |
+
if device_map != 'auto':
|
184 |
+
vision_tower.to(device=device_map, dtype=torch.float16)
|
185 |
+
image_processor = vision_tower.image_processor
|
186 |
+
|
187 |
+
if hasattr(model.config, "max_sequence_length"):
|
188 |
+
context_len = model.config.max_sequence_length
|
189 |
+
else:
|
190 |
+
context_len = 2048
|
191 |
+
|
192 |
+
return tokenizer, model, image_processor, context_len
|
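A minimal usage sketch for load_pretrained_model above. The checkpoint identifiers are placeholders, and the import path assumes this Space's flat package layout; a model_name containing 'aya' (but not 'llava') routes to the LlavaCohere branch, which loads the base LLM and then applies mm_projector.bin from model_path.

from model.builder import load_pretrained_model  # assumed import path for this Space

tokenizer, model, image_processor, context_len = load_pretrained_model(
    model_path="your-org/your-maya-checkpoint",  # placeholder repo holding config + mm_projector.bin
    model_base="your-org/your-aya-base",         # placeholder base Cohere/Aya checkpoint
    model_name="aya",                            # must contain 'aya' and not 'llava' to hit the Aya branch
    load_4bit=False,
)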
model/consolidate.py
ADDED
@@ -0,0 +1,29 @@
"""
Usage:
python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate
"""
import argparse

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from llava.model import *
from llava.model.utils import auto_upgrade


def consolidate_ckpt(src_path, dst_path):
    print("Loading model")
    auto_upgrade(src_path)
    src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
    src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False)
    src_model.save_pretrained(dst_path)
    src_tokenizer.save_pretrained(dst_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--src", type=str, required=True)
    parser.add_argument("--dst", type=str, required=True)

    args = parser.parse_args()

    consolidate_ckpt(args.src, args.dst)
model/language_model/llava_cohere.py
ADDED
@@ -0,0 +1,144 @@
'''
reference: https://github.com/Satyajitv/LLaVA/blob/maya_exp/llava/model/language_model/llava_cohere.py
'''

from typing import List, Optional, Tuple, Union

import torch
import torch.nn as nn

from transformers import AutoConfig, AutoModelForCausalLM, \
    CohereConfig, CohereModel, CohereForCausalLM

from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers.generation.utils import GenerateOutput

from model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM


class LlavaCohereConfig(CohereConfig):
    model_type = "llava_cohere"


class LlavaCohereModel(LlavaMetaModel, CohereModel):
    config_class = LlavaCohereConfig

    def __init__(self, config: CohereConfig):
        super(LlavaCohereModel, self).__init__(config)


class LlavaCohereForCausalLM(CohereForCausalLM, LlavaMetaForCausalLM):
    config_class = LlavaCohereConfig

    def __init__(self, config):
        super().__init__(config)
        self.model = LlavaCohereModel(config)

        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_model(self):
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        image_sizes: Optional[List[List[int]]] = None,
        return_dict: Optional[bool] = None,
        cache_position=None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:

        if inputs_embeds is None:
            (
                input_ids,
                position_ids,
                attention_mask,
                past_key_values,
                inputs_embeds,
                labels
            ) = self.prepare_inputs_labels_for_multimodal(
                input_ids,
                position_ids,
                attention_mask,
                past_key_values,
                labels,
                images,
                image_sizes
            )

        return super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict
        )

    @torch.no_grad()
    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        images: Optional[torch.Tensor] = None,
        image_sizes: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Union[GenerateOutput, torch.LongTensor]:
        position_ids = kwargs.pop("position_ids", None)
        attention_mask = kwargs.pop("attention_mask", None)
        if "inputs_embeds" in kwargs:
            raise NotImplementedError("`inputs_embeds` is not supported")

        if images is not None:
            (
                inputs,
                position_ids,
                attention_mask,
                _,
                inputs_embeds,
                _
            ) = self.prepare_inputs_labels_for_multimodal(
                inputs,
                position_ids,
                attention_mask,
                None,
                None,
                images,
                image_sizes=image_sizes
            )
        else:
            inputs_embeds = self.get_model().embed_tokens(inputs)

        return super().generate(
            position_ids=position_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            **kwargs
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None,
                                      inputs_embeds=None, **kwargs):
        images = kwargs.pop("images", None)
        image_sizes = kwargs.pop("image_sizes", None)
        inputs = super().prepare_inputs_for_generation(
            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
        )
        if images is not None:
            inputs['images'] = images
        if image_sizes is not None:
            inputs['image_sizes'] = image_sizes
        return inputs


AutoConfig.register("llava_cohere", LlavaCohereConfig)
AutoModelForCausalLM.register(LlavaCohereConfig, LlavaCohereForCausalLM)
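Because the config and model classes are registered with the Auto classes at the bottom of this file, a checkpoint whose config.json declares model_type "llava_cohere" can be resolved through the standard transformers Auto API. A small sketch; the checkpoint path is a placeholder and the import only exists to trigger the register() calls:

from transformers import AutoConfig, AutoModelForCausalLM
import model.language_model.llava_cohere  # assumed import path; running it performs the registrations

cfg = AutoConfig.from_pretrained("path/to/llava_cohere_checkpoint")  # placeholder path
print(cfg.model_type)  # 'llava_cohere'
model = AutoModelForCausalLM.from_pretrained("path/to/llava_cohere_checkpoint", config=cfg)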
model/llava_arch.py
ADDED
@@ -0,0 +1,371 @@
# Copyright 2023 Haotian Liu
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from abc import ABC, abstractmethod

import torch
import torch.nn as nn

from model.multimodal_encoder.builder import build_vision_tower
from model.multimodal_projector.builder import build_vision_projector

from constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN

from mm_utils import get_anyres_image_grid_shape


class LlavaMetaModel:

    def __init__(self, config):
        super(LlavaMetaModel, self).__init__(config)

        if hasattr(config, "mm_vision_tower"):
            self.vision_tower = build_vision_tower(config, delay_load=True)
            self.mm_projector = build_vision_projector(config)

            if 'unpad' in getattr(config, 'mm_patch_merge_type', ''):
                self.image_newline = nn.Parameter(
                    torch.empty(config.hidden_size, dtype=self.dtype)
                )

    def get_vision_tower(self):
        vision_tower = getattr(self, 'vision_tower', None)
        if type(vision_tower) is list:
            vision_tower = vision_tower[0]
        return vision_tower

    def initialize_vision_modules(self, model_args, fsdp=None):
        vision_tower = model_args.vision_tower
        mm_vision_select_layer = model_args.mm_vision_select_layer
        mm_vision_select_feature = model_args.mm_vision_select_feature
        pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter
        mm_patch_merge_type = model_args.mm_patch_merge_type

        self.config.mm_vision_tower = vision_tower

        if self.get_vision_tower() is None:
            vision_tower = build_vision_tower(model_args)

            if fsdp is not None and len(fsdp) > 0:
                self.vision_tower = [vision_tower]
            else:
                self.vision_tower = vision_tower
        else:
            if fsdp is not None and len(fsdp) > 0:
                vision_tower = self.vision_tower[0]
            else:
                vision_tower = self.vision_tower
            vision_tower.load_model()

        self.config.use_mm_proj = True
        self.config.mm_projector_type = getattr(model_args, 'mm_projector_type', 'linear')
        self.config.mm_hidden_size = vision_tower.hidden_size
        self.config.mm_vision_select_layer = mm_vision_select_layer
        self.config.mm_vision_select_feature = mm_vision_select_feature
        self.config.mm_patch_merge_type = mm_patch_merge_type

        if getattr(self, 'mm_projector', None) is None:
            self.mm_projector = build_vision_projector(self.config)

            if 'unpad' in mm_patch_merge_type:
                embed_std = 1 / torch.sqrt(torch.tensor(self.config.hidden_size, dtype=self.dtype))
                self.image_newline = nn.Parameter(
                    torch.randn(self.config.hidden_size, dtype=self.dtype) * embed_std
                )
        else:
            # In case it is frozen by LoRA
            for p in self.mm_projector.parameters():
                p.requires_grad = True

        if pretrain_mm_mlp_adapter is not None:
            mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
            def get_w(weights, keyword):
                return {k.split(keyword + '.')[1]: v for k, v in weights.items() if keyword in k}

            self.mm_projector.load_state_dict(get_w(mm_projector_weights, 'mm_projector'))


def unpad_image(tensor, original_size):
    """
    Unpads a PyTorch tensor of a padded and resized image.

    Args:
        tensor (torch.Tensor): The image tensor, assumed to be in CxHxW format.
        original_size (tuple): The original size of PIL image (width, height).

    Returns:
        torch.Tensor: The unpadded image tensor.
    """
    original_width, original_height = original_size
    current_height, current_width = tensor.shape[1:]

    original_aspect_ratio = original_width / original_height
    current_aspect_ratio = current_width / current_height

    if original_aspect_ratio > current_aspect_ratio:
        scale_factor = current_width / original_width
        new_height = int(original_height * scale_factor)
        padding = (current_height - new_height) // 2
        unpadded_tensor = tensor[:, padding:current_height - padding, :]
    else:
        scale_factor = current_height / original_height
        new_width = int(original_width * scale_factor)
        padding = (current_width - new_width) // 2
        unpadded_tensor = tensor[:, :, padding:current_width - padding]

    return unpadded_tensor


class LlavaMetaForCausalLM(ABC):

    @abstractmethod
    def get_model(self):
        pass

    def get_vision_tower(self):
        return self.get_model().get_vision_tower()

    def encode_images(self, images):
        image_features = self.get_model().get_vision_tower()(images)
        image_features = self.get_model().mm_projector(image_features)
        return image_features

    def prepare_inputs_labels_for_multimodal(
        self, input_ids, position_ids, attention_mask, past_key_values, labels,
        images, image_sizes=None
    ):
        vision_tower = self.get_vision_tower()
        if vision_tower is None or images is None or input_ids.shape[1] == 1:
            return input_ids, position_ids, attention_mask, past_key_values, None, labels

        if type(images) is list or images.ndim == 5:
            if type(images) is list:
                images = [x.unsqueeze(0) if x.ndim == 3 else x for x in images]
            concat_images = torch.cat([image for image in images], dim=0)
            image_features = self.encode_images(concat_images)
            split_sizes = [image.shape[0] for image in images]
            image_features = torch.split(image_features, split_sizes, dim=0)
            mm_patch_merge_type = getattr(self.config, 'mm_patch_merge_type', 'flat')
            image_aspect_ratio = getattr(self.config, 'image_aspect_ratio', 'square')
            if mm_patch_merge_type == 'flat':
                image_features = [x.flatten(0, 1) for x in image_features]
            elif mm_patch_merge_type.startswith('spatial'):
                new_image_features = []
                for image_idx, image_feature in enumerate(image_features):
                    if image_feature.shape[0] > 1:
                        base_image_feature = image_feature[0]
                        image_feature = image_feature[1:]
                        height = width = self.get_vision_tower().num_patches_per_side
                        assert height * width == base_image_feature.shape[0]
                        if image_aspect_ratio == 'anyres':
                            num_patch_width, num_patch_height = get_anyres_image_grid_shape(image_sizes[image_idx], self.config.image_grid_pinpoints, self.get_vision_tower().config.image_size)
                            image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1)
                        else:
                            raise NotImplementedError
                        if 'unpad' in mm_patch_merge_type:
                            image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
                            image_feature = image_feature.flatten(1, 2).flatten(2, 3)
                            image_feature = unpad_image(image_feature, image_sizes[image_idx])
                            image_feature = torch.cat((
                                image_feature,
                                self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device)
                            ), dim=-1)
                            image_feature = image_feature.flatten(1, 2).transpose(0, 1)
                        else:
                            image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous()
                            image_feature = image_feature.flatten(0, 3)
                        image_feature = torch.cat((base_image_feature, image_feature), dim=0)
                    else:
                        image_feature = image_feature[0]
                        if 'unpad' in mm_patch_merge_type:
                            image_feature = torch.cat((
                                image_feature,
                                self.model.image_newline[None].to(image_feature.device)
                            ), dim=0)
                    new_image_features.append(image_feature)
                image_features = new_image_features
            else:
                raise ValueError(f"Unexpected mm_patch_merge_type: {self.config.mm_patch_merge_type}")
        else:
            image_features = self.encode_images(images)

        # TODO: image start / end is not implemented here to support pretraining.
        if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False):
            raise NotImplementedError

        # Let's just add dummy tensors if they do not exist,
        # it is a headache to deal with None all the time.
        # But it is not ideal, and if you have a better idea,
        # please open an issue / submit a PR, thanks.
        _labels = labels
        _position_ids = position_ids
        _attention_mask = attention_mask
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
        else:
            attention_mask = attention_mask.bool()
        if position_ids is None:
            position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
        if labels is None:
            labels = torch.full_like(input_ids, IGNORE_INDEX)

        # remove the padding using attention_mask -- FIXME
        _input_ids = input_ids
        input_ids = [cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)]
        labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)]

        new_input_embeds = []
        new_labels = []
        cur_image_idx = 0
        for batch_idx, cur_input_ids in enumerate(input_ids):
            num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
            if num_images == 0:
                cur_image_features = image_features[cur_image_idx]
                cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids)
                cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0]], dim=0)
                new_input_embeds.append(cur_input_embeds)
                new_labels.append(labels[batch_idx])
                cur_image_idx += 1
                continue

            image_token_indices = [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
            cur_input_ids_noim = []
            cur_labels = labels[batch_idx]
            cur_labels_noim = []
            for i in range(len(image_token_indices) - 1):
                cur_input_ids_noim.append(cur_input_ids[image_token_indices[i]+1:image_token_indices[i+1]])
                cur_labels_noim.append(cur_labels[image_token_indices[i]+1:image_token_indices[i+1]])
            split_sizes = [x.shape[0] for x in cur_labels_noim]
            cur_input_embeds = self.get_model().embed_tokens(torch.cat(cur_input_ids_noim))
            cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0)
            cur_new_input_embeds = []
            cur_new_labels = []

            for i in range(num_images + 1):
                cur_new_input_embeds.append(cur_input_embeds_no_im[i])
                cur_new_labels.append(cur_labels_noim[i])
                if i < num_images:
                    try:
                        cur_image_features = image_features[cur_image_idx]
                    except Exception as e:
                        print(f'Index ERROR issue due to data/image mismatch/missing: {e}')
                    cur_image_idx += 1
                    cur_new_input_embeds.append(cur_image_features)
                    cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=cur_labels.device, dtype=cur_labels.dtype))

            cur_new_input_embeds = [x.to(self.device) for x in cur_new_input_embeds]

            cur_new_input_embeds = torch.cat(cur_new_input_embeds)
            cur_new_labels = torch.cat(cur_new_labels)

            new_input_embeds.append(cur_new_input_embeds)
            new_labels.append(cur_new_labels)

        # Truncate sequences to max length as image embeddings can make the sequence longer
        tokenizer_model_max_length = getattr(self.config, 'tokenizer_model_max_length', None)
        if tokenizer_model_max_length is not None:
            new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds]
            new_labels = [x[:tokenizer_model_max_length] for x in new_labels]

        # Combine them
        max_len = max(x.shape[0] for x in new_input_embeds)
        batch_size = len(new_input_embeds)

        new_input_embeds_padded = []
        new_labels_padded = torch.full((batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device)
        attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device)
        position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)

        for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
            cur_len = cur_new_embed.shape[0]
            if getattr(self.config, 'tokenizer_padding_side', 'right') == "left":
                new_input_embeds_padded.append(torch.cat((
                    torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device),
                    cur_new_embed
                ), dim=0))
                if cur_len > 0:
                    new_labels_padded[i, -cur_len:] = cur_new_labels
                    attention_mask[i, -cur_len:] = True
                    position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
            else:
                new_input_embeds_padded.append(torch.cat((
                    cur_new_embed,
                    torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)
                ), dim=0))
                if cur_len > 0:
                    new_labels_padded[i, :cur_len] = cur_new_labels
                    attention_mask[i, :cur_len] = True
                    position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)

        new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)

        if _labels is None:
            new_labels = None
        else:
            new_labels = new_labels_padded

        if _attention_mask is None:
            attention_mask = None
        else:
            attention_mask = attention_mask.to(dtype=_attention_mask.dtype)

        if _position_ids is None:
            position_ids = None

        return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels

    def initialize_vision_tokenizer(self, model_args, tokenizer):
        if model_args.mm_use_im_patch_token:
            tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
            self.resize_token_embeddings(len(tokenizer))

        if model_args.mm_use_im_start_end:
            num_new_tokens = tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
            self.resize_token_embeddings(len(tokenizer))

            if num_new_tokens > 0:
                input_embeddings = self.get_input_embeddings().weight.data
                output_embeddings = self.get_output_embeddings().weight.data

                input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
                    dim=0, keepdim=True)
                output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
                    dim=0, keepdim=True)

                input_embeddings[-num_new_tokens:] = input_embeddings_avg
                output_embeddings[-num_new_tokens:] = output_embeddings_avg

            if model_args.tune_mm_mlp_adapter:
                for p in self.get_input_embeddings().parameters():
                    p.requires_grad = True
                for p in self.get_output_embeddings().parameters():
                    p.requires_grad = False

            if model_args.pretrain_mm_mlp_adapter:
                mm_projector_weights = torch.load(model_args.pretrain_mm_mlp_adapter, map_location='cpu')
                embed_tokens_weight = mm_projector_weights['model.embed_tokens.weight']
                assert num_new_tokens == 2
                if input_embeddings.shape == embed_tokens_weight.shape:
                    input_embeddings[-num_new_tokens:] = embed_tokens_weight[-num_new_tokens:]
                elif embed_tokens_weight.shape[0] == num_new_tokens:
                    input_embeddings[-num_new_tokens:] = embed_tokens_weight
                else:
                    raise ValueError(f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. Number of new tokens: {num_new_tokens}.")
        elif model_args.mm_use_im_patch_token:
            if model_args.tune_mm_mlp_adapter:
                for p in self.get_input_embeddings().parameters():
                    p.requires_grad = False
                for p in self.get_output_embeddings().parameters():
                    p.requires_grad = False
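A small sanity check of unpad_image above, with made-up sizes: a 400x200 (width x height) image letterboxed into a square 10x10 feature grid should get its vertical padding rows cropped away. The import path is an assumption about this Space's layout.

import torch
from model.llava_arch import unpad_image  # assumed import path

feat = torch.zeros(3, 10, 10)         # CxHxW grid produced from a padded square input
out = unpad_image(feat, (400, 200))   # original size is given as (width, height)
print(out.shape)                      # torch.Size([3, 6, 10]): two padded rows cropped from top and bottom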
model/make_delta.py
ADDED
@@ -0,0 +1,52 @@
"""
Usage:
python3 -m llava.model.make_delta --base ~/model_weights/llama-7b --target ~/model_weights/llava-7b --delta ~/model_weights/llava-7b-delta --hub-repo-id liuhaotian/llava-7b-delta
"""
import argparse

import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from llava.model.utils import auto_upgrade


def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id):
    print("Loading base model")
    base = AutoModelForCausalLM.from_pretrained(
        base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)

    print("Loading target model")
    auto_upgrade(target_model_path)
    target = AutoModelForCausalLM.from_pretrained(target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)

    print("Calculating delta")
    for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"):
        if name not in base.state_dict():
            assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model'
            continue
        if param.data.shape == base.state_dict()[name].shape:
            param.data -= base.state_dict()[name]
        else:
            assert name in ['model.embed_tokens.weight', 'lm_head.weight'], f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}'
            bparam = base.state_dict()[name]
            param.data[:bparam.shape[0], :bparam.shape[1]] -= bparam

    print("Saving delta")
    if hub_repo_id:
        kwargs = {"push_to_hub": True, "repo_id": hub_repo_id}
    else:
        kwargs = {}
    target.save_pretrained(delta_path, **kwargs)
    target_tokenizer = AutoTokenizer.from_pretrained(target_model_path)
    target_tokenizer.save_pretrained(delta_path, **kwargs)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-model-path", type=str, required=True)
    parser.add_argument("--target-model-path", type=str, required=True)
    parser.add_argument("--delta-path", type=str, required=True)
    parser.add_argument("--hub-repo-id", type=str, default=None)
    args = parser.parse_args()

    make_delta(args.base_model_path, args.target_model_path, args.delta_path, args.hub_repo_id)
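make_delta is the inverse of apply_delta earlier in this upload: it subtracts the base weights from a target checkpoint so only the difference is distributed. A toy check of that round trip on random tensors (not tied to any real checkpoint):

import torch

base_w = torch.randn(4, 4)                     # stand-in for a base model weight
target_w = base_w + 0.01 * torch.randn(4, 4)   # stand-in for the fine-tuned weight

delta_w = target_w - base_w                    # what make_delta stores
recovered = base_w + delta_w                   # what apply_delta reconstructs
assert torch.allclose(recovered, target_w)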
model/multimodal_encoder/builder.py
ADDED
@@ -0,0 +1,16 @@
import os
from .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2
from .siglip_encoder import SiglipVisionTower

def build_vision_tower(vision_tower_cfg, **kwargs):
    vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None))
    is_absolute_path_exists = os.path.exists(vision_tower)
    use_s2 = getattr(vision_tower_cfg, 's2', False)
    if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower:
        if use_s2:
            return CLIPVisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs)
        else:
            return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
    elif 'siglip' in vision_tower:
        return SiglipVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
    raise ValueError(f'Unknown vision tower: {vision_tower}')
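A sketch of how the dispatch above resolves a tower name. The config object here is a stand-in (types.SimpleNamespace) with illustrative values, not the project's real training arguments:

from types import SimpleNamespace
from model.multimodal_encoder.builder import build_vision_tower  # assumed import path

cfg = SimpleNamespace(
    mm_vision_tower="google/siglip-base-patch16-256",  # any name containing 'siglip' hits the SigLIP branch
    mm_vision_select_layer=-2,
    mm_vision_select_feature="patch",
)
tower = build_vision_tower(cfg, delay_load=True)  # returns a SiglipVisionTower (weights not loaded yet)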
model/multimodal_encoder/clip_encoder.py
ADDED
@@ -0,0 +1,147 @@
import torch
import torch.nn as nn

from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig


class CLIPVisionTower(nn.Module):
    def __init__(self, vision_tower, args, delay_load=False):
        super().__init__()

        self.is_loaded = False

        self.vision_tower_name = vision_tower
        self.select_layer = args.mm_vision_select_layer
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')

        if not delay_load:
            self.load_model()
        elif getattr(args, 'unfreeze_mm_vision_tower', False):
            self.load_model()
        else:
            self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)

    def load_model(self, device_map=None):
        if self.is_loaded:
            print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
            return

        self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
        self.vision_tower.requires_grad_(False)

        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        image_features = image_forward_outs.hidden_states[self.select_layer]
        if self.select_feature == 'patch':
            image_features = image_features[:, 1:]
        elif self.select_feature == 'cls_patch':
            image_features = image_features
        else:
            raise ValueError(f'Unexpected select feature: {self.select_feature}')
        return image_features

    @torch.no_grad()
    def forward(self, images):
        if type(images) is list:
            image_features = []
            for image in images:
                image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
                image_feature = self.feature_select(image_forward_out).to(image.dtype)
                image_features.append(image_feature)
        else:
            image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
            image_features = self.feature_select(image_forward_outs).to(images.dtype)

        return image_features

    @property
    def dummy_feature(self):
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        if self.is_loaded:
            return self.vision_tower.config
        else:
            return self.cfg_only

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches_per_side(self):
        return self.config.image_size // self.config.patch_size

    @property
    def num_patches(self):
        return (self.config.image_size // self.config.patch_size) ** 2


class CLIPVisionTowerS2(CLIPVisionTower):
    def __init__(self, vision_tower, args, delay_load=False):
        super().__init__(vision_tower, args, delay_load)

        self.s2_scales = getattr(args, 's2_scales', '336,672,1008')
        self.s2_scales = list(map(int, self.s2_scales.split(',')))
        self.s2_scales.sort()
        self.s2_split_size = self.s2_scales[0]
        self.s2_image_size = self.s2_scales[-1]

        try:
            from s2wrapper import forward as multiscale_forward
        except ImportError:
            raise ImportError('Package s2wrapper not found! Please install by running: \npip install git+https://github.com/bfshi/scaling_on_scales.git')
        self.multiscale_forward = multiscale_forward

        # change resize/crop size in preprocessing to the largest image size in s2_scale
        if not delay_load or getattr(args, 'unfreeze_mm_vision_tower', False):
            self.image_processor.size['shortest_edge'] = self.s2_image_size
            self.image_processor.crop_size['height'] = self.image_processor.crop_size['width'] = self.s2_image_size

    def load_model(self, device_map=None):
        if self.is_loaded:
            print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
            return

        self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
        self.vision_tower.requires_grad_(False)

        self.image_processor.size['shortest_edge'] = self.s2_image_size
        self.image_processor.crop_size['height'] = self.image_processor.crop_size['width'] = self.s2_image_size

        self.is_loaded = True

    @torch.no_grad()
    def forward_feature(self, images):
        image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
        image_features = self.feature_select(image_forward_outs).to(images.dtype)
        return image_features

    @torch.no_grad()
    def forward(self, images):
        if type(images) is list:
            image_features = []
            for image in images:
                image_feature = self.multiscale_forward(self.forward_feature, image.unsqueeze(0), img_sizes=self.s2_scales, max_split_size=self.s2_split_size)
                image_features.append(image_feature)
        else:
            image_features = self.multiscale_forward(self.forward_feature, images, img_sizes=self.s2_scales, max_split_size=self.s2_split_size)

        return image_features

    @property
    def hidden_size(self):
        return self.config.hidden_size * len(self.s2_scales)
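As a worked example of the patch-count properties above, using a hypothetical CLIP tower at ViT-L/14 336px resolution (values illustrative, not necessarily the ones this Space ships):

# image_size=336, patch_size=14 in the vision config
num_patches_per_side = 336 // 14   # 24
num_patches = 24 ** 2              # 576 visual tokens per image handed to the projector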
model/multimodal_encoder/siglip_encoder.py
ADDED
@@ -0,0 +1,147 @@
import torch
import torch.nn as nn

from transformers import SiglipVisionModel, SiglipImageProcessor, SiglipVisionConfig

class SiglipVisionTower(nn.Module):
    def __init__(self, vision_tower, args, delay_load=False):
        super().__init__()

        self.is_loaded = False

        self.vision_tower_name = vision_tower
        self.select_layer = args.mm_vision_select_layer
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')

        if not delay_load:
            self.load_model()
        elif getattr(args, 'unfreeze_mm_vision_tower', False):
            self.load_model()
        else:
            self.cfg_only = SiglipVisionConfig.from_pretrained(self.vision_tower_name)

    def load_model(self, device_map=None):
        if self.is_loaded:
            print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
            return

        self.image_processor = SiglipImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = SiglipVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
        self.vision_tower.requires_grad_(False)

        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        image_features = image_forward_outs.hidden_states[self.select_layer]
        if self.select_feature == 'patch':
            image_features = image_features[:, 1:]
        elif self.select_feature == 'cls_patch':
            image_features = image_features
        else:
            raise ValueError(f'Unexpected select feature: {self.select_feature}')
        return image_features

    @torch.no_grad()
    def forward(self, images):
        if type(images) is list:
            image_features = []
            for image in images:
                image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
                image_feature = self.feature_select(image_forward_out).to(image.dtype)
                image_features.append(image_feature)
        else:
            image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
            image_features = self.feature_select(image_forward_outs).to(images.dtype)

        return image_features

    @property
    def dummy_feature(self):
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        if self.is_loaded:
            return self.vision_tower.config
        else:
            return self.cfg_only

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches_per_side(self):
        return self.config.image_size // self.config.patch_size

    @property
    def num_patches(self):
        return (self.config.image_size // self.config.patch_size) ** 2


class SiglipVisionTowerS2(SiglipVisionTower):
    def __init__(self, vision_tower, args, delay_load=False):
        super().__init__(vision_tower, args, delay_load)

        self.s2_scales = getattr(args, 's2_scales', '336,672,1008')
        self.s2_scales = list(map(int, self.s2_scales.split(',')))
        self.s2_scales.sort()
        self.s2_split_size = self.s2_scales[0]
        self.s2_image_size = self.s2_scales[-1]

        try:
            from s2wrapper import forward as multiscale_forward
        except ImportError:
            raise ImportError('Package s2wrapper not found! Please install by running: \npip install git+https://github.com/bfshi/scaling_on_scales.git')
        self.multiscale_forward = multiscale_forward

        # change resize/crop size in preprocessing to the largest image size in s2_scale
        if not delay_load or getattr(args, 'unfreeze_mm_vision_tower', False):
            self.image_processor.size['shortest_edge'] = self.s2_image_size
            self.image_processor.crop_size['height'] = self.image_processor.crop_size['width'] = self.s2_image_size

    def load_model(self, device_map=None):
        if self.is_loaded:
            print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
            return

        self.image_processor = SiglipImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = SiglipVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
        self.vision_tower.requires_grad_(False)

        self.image_processor.size['shortest_edge'] = self.s2_image_size
        self.image_processor.crop_size['height'] = self.image_processor.crop_size['width'] = self.s2_image_size

        self.is_loaded = True

    @torch.no_grad()
    def forward_feature(self, images):
        image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
        image_features = self.feature_select(image_forward_outs).to(images.dtype)
        return image_features

    @torch.no_grad()
    def forward(self, images):
        if type(images) is list:
            image_features = []
            for image in images:
                image_feature = self.multiscale_forward(self.forward_feature, image.unsqueeze(0), img_sizes=self.s2_scales, max_split_size=self.s2_split_size)
                image_features.append(image_feature)
        else:
            image_features = self.multiscale_forward(self.forward_feature, images, img_sizes=self.s2_scales, max_split_size=self.s2_split_size)

        return image_features

    @property
    def hidden_size(self):
        return self.config.hidden_size * len(self.s2_scales)
model/multimodal_projector/builder.py
ADDED
@@ -0,0 +1,51 @@
import torch
import torch.nn as nn
import re


class IdentityMap(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x, *args, **kwargs):
        return x

    @property
    def config(self):
        return {"mm_projector_type": 'identity'}


class SimpleResBlock(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.pre_norm = nn.LayerNorm(channels)

        self.proj = nn.Sequential(
            nn.Linear(channels, channels),
            nn.GELU(),
            nn.Linear(channels, channels)
        )
    def forward(self, x):
        x = self.pre_norm(x)
        return x + self.proj(x)


def build_vision_projector(config, delay_load=False, **kwargs):
    projector_type = getattr(config, 'mm_projector_type', 'linear')

    if projector_type == 'linear':
        return nn.Linear(config.mm_hidden_size, config.hidden_size)

    mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
    if mlp_gelu_match:
        mlp_depth = int(mlp_gelu_match.group(1))
        modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
        for _ in range(1, mlp_depth):
            modules.append(nn.GELU())
            modules.append(nn.Linear(config.hidden_size, config.hidden_size))
        return nn.Sequential(*modules)

    if projector_type == 'identity':
        return IdentityMap()

    raise ValueError(f'Unknown projector type: {projector_type}')
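A minimal sketch of build_vision_projector with the 'mlp2x_gelu' type; the hidden sizes are illustrative, not values taken from this Space's config:

from types import SimpleNamespace
from model.multimodal_projector.builder import build_vision_projector  # assumed import path

cfg = SimpleNamespace(mm_projector_type="mlp2x_gelu", mm_hidden_size=1152, hidden_size=4096)
proj = build_vision_projector(cfg)
print(proj)  # Sequential(Linear(1152 -> 4096), GELU(), Linear(4096 -> 4096))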
model/utils.py
ADDED
@@ -0,0 +1,20 @@
from transformers import AutoConfig


def auto_upgrade(config):
    cfg = AutoConfig.from_pretrained(config)
    if 'llava' in config and 'llava' not in cfg.model_type:
        assert cfg.model_type == 'llama'
        print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.")
        print("You must upgrade the checkpoint to the new code base (this can be done automatically).")
        confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]")
        if confirm.lower() in ["y", "yes"]:
            print("Upgrading checkpoint...")
            assert len(cfg.architectures) == 1
            setattr(cfg.__class__, "model_type", "llava")
            cfg.architectures[0] = 'LlavaLlamaForCausalLM'
            cfg.save_pretrained(config)
            print("Checkpoint upgraded.")
        else:
            print("Checkpoint upgrade aborted.")
            exit(1)
requirements.txt
ADDED
@@ -0,0 +1,31 @@
numpy==1.26.4
scikit-learn==1.2.2

# HuggingFace ecosystem
transformers==4.47.0
tokenizers==0.21
sentencepiece==0.1.99
accelerate==0.27.2
datasets==2.15.0
peft==0.12.0
huggingface_hub>=0.25.2

# Additional ML libraries
bitsandbytes==0.43.3
timm==0.6.13
einops==0.6.1
einops-exts==0.0.4

# Utilities
shortuuid==1.0.13
pydantic==2.8.2
markdown2[all]

# Web framework and API
gradio==5.1.0
gradio_client
fastapi
uvicorn
requests==2.32.3
httpx==0.27.2
torch_requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Core dependencies
setuptools>=45.0
wheel>=0.36.2
ninja>=1.10.0
packaging>=20.0

# PyTorch
torch==2.1.2
torchvision==0.16.2
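The two requirements files pin the stack tightly (torch 2.1.2 and torchvision 0.16.2 alongside transformers 4.47.0 and gradio 5.1.0). A small startup check such as the hedged sketch below can fail fast if the Space is rebuilt against drifted versions; the set of packages checked is an illustrative assumption, not part of this upload.

# Hypothetical version guard; the package list is an assumption.
from importlib.metadata import version

EXPECTED = {"torch": "2.1.2", "transformers": "4.47.0", "gradio": "5.1.0"}

for pkg, pinned in EXPECTED.items():
    installed = version(pkg)
    if installed != pinned:
        raise RuntimeError(f"{pkg} {installed} installed, but {pinned} is pinned in requirements")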
utils.py
ADDED
@@ -0,0 +1,125 @@
import logging
import logging.handlers
import os
import sys

import requests

from constants import LOGDIR

server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN."

handler = None


def build_logger(logger_name, logger_filename):
    global handler

    formatter = logging.Formatter(
        fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    # Set the format of root handlers
    if not logging.getLogger().handlers:
        logging.basicConfig(level=logging.INFO)
    logging.getLogger().handlers[0].setFormatter(formatter)

    # Redirect stdout and stderr to loggers
    stdout_logger = logging.getLogger("stdout")
    stdout_logger.setLevel(logging.INFO)
    sl = StreamToLogger(stdout_logger, logging.INFO)
    sys.stdout = sl

    stderr_logger = logging.getLogger("stderr")
    stderr_logger.setLevel(logging.ERROR)
    sl = StreamToLogger(stderr_logger, logging.ERROR)
    sys.stderr = sl

    # Get logger
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.INFO)

    # Add a file handler for all loggers
    if handler is None:
        os.makedirs(LOGDIR, exist_ok=True)
        filename = os.path.join(LOGDIR, logger_filename)
        handler = logging.handlers.TimedRotatingFileHandler(
            filename, when='D', utc=True, encoding='UTF-8')
        handler.setFormatter(formatter)

        for name, item in logging.root.manager.loggerDict.items():
            if isinstance(item, logging.Logger):
                item.addHandler(handler)

    return logger


class StreamToLogger(object):
    """
    Fake file-like stream object that redirects writes to a logger instance.
    """
    def __init__(self, logger, log_level=logging.INFO):
        self.terminal = sys.stdout
        self.logger = logger
        self.log_level = log_level
        self.linebuf = ''

    def __getattr__(self, attr):
        return getattr(self.terminal, attr)

    def write(self, buf):
        temp_linebuf = self.linebuf + buf
        self.linebuf = ''
        for line in temp_linebuf.splitlines(True):
            # From the io.TextIOWrapper docs:
            #   On output, if newline is None, any '\n' characters written
            #   are translated to the system default line separator.
            # By default sys.stdout.write() expects '\n' newlines and then
            # translates them so this is still cross platform.
            if line[-1] == '\n':
                self.logger.log(self.log_level, line.rstrip())
            else:
                self.linebuf += line

    def flush(self):
        if self.linebuf != '':
            self.logger.log(self.log_level, self.linebuf.rstrip())
        self.linebuf = ''


def disable_torch_init():
    """
    Disable the redundant torch default initialization to accelerate model creation.
    """
    import torch
    setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
    setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)


def violates_moderation(text):
    """
    Check whether the text violates OpenAI moderation API.
    """
    url = "https://api.openai.com/v1/moderations"
    headers = {"Content-Type": "application/json",
               "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]}
    text = text.replace("\n", "")
    data = "{" + '"input": ' + f'"{text}"' + "}"
    data = data.encode("utf-8")
    try:
        ret = requests.post(url, headers=headers, data=data, timeout=5)
        flagged = ret.json()["results"][0]["flagged"]
    except requests.exceptions.RequestException:
        flagged = False
    except KeyError:
        flagged = False

    return flagged


def pretty_print_semaphore(semaphore):
    if semaphore is None:
        return "None"
    return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})"
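A short, hedged sketch of how these helpers fit together at startup; the logger name and log file name are illustrative assumptions. build_logger routes stdout and stderr through StreamToLogger into a daily-rotating file under LOGDIR, and disable_torch_init skips torch's default Linear/LayerNorm initialization, which is safe when pretrained weights are loaded immediately afterwards.

# Hypothetical wiring of the helpers above; names are assumptions.
from utils import build_logger, disable_torch_init

disable_torch_init()  # skip redundant default init; pretrained weights overwrite it anyway

logger = build_logger("gradio_web_server", "gradio_web_server.log")
logger.info("Server starting...")  # also captured in the rotating log file under LOGDIR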