import dataclasses
from enum import auto, Enum
from typing import List, Tuple
import os


class SeparatorStyle(Enum):
    """Different separator styles."""
    SINGLE = auto()
    TWO = auto()
    MPT = auto()
    PLAIN = auto()
    LLAMA_2 = auto()
    MISTRAL = auto()

# video_helper_map = {
#     # 'Chips Making Deal Video' : {'path' : '/data/videos/ChipmakingDeal/sub-videos/', 'prefix' : 'ChipmakingDeal_split'},
#     'Keynote 2023' : {'path' : '/data/videos/PatsKeynote23/sub-videos/', 'prefix' : 'keynotes23_split'},
#     'Intel Behind the Bell' : {'path' : '/data/videos/BehindTheBell/sub-videos/', 'prefix' : 'Behind the Bell Intel_split'},
#     'CEOs Talk' : {'path' : '/data/videos/SamPatTalkAI/sub-videos/', 'prefix' : 'Sam Altman and Pat Gelsinger Talk Artificial Intelligence_split'},
#     'Chips Act Funding Announcement' : {'path' : '/data/videos/IntelChipsFundingAnnounce/sub-videos/', 'prefix' : 'Intel Celebrates CHIPS and Science Act Direct Funding Announcement (Replay)_split'},
#     '22nm-Chip Technology' : {'path' : '/data/videos/MarkBohrExplains22nm/sub-videos/', 'prefix' : 'Video Animation Mark Bohr Gets Small 22nm Explained Intel_split'},
#     '14nm-Chip Technology' : {'path' : '/data/videos/MarkBohrExplains14nm/sub-videos/', 'prefix' : 'Explanation of Intels 14nm Process_split'},
# }
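
# Maps a video title (Conversation.video_title) to the directory holding its pre-split
# sub-video clips and the filename prefix those clips were saved under; used by
# Conversation.get_path_to_subvideos() to resolve a retrieved frame back to its clip.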
video_helper_map = {
    # 'Chips Making Deal Video' : {'path' : '/data/videos/ChipmakingDeal/sub-videos/', 'prefix' : 'ChipmakingDeal_split'},
    'Innovation-2023' : {'path' : '/data1/tile_gh/Multimodal-RAG/videos/PatsKeynote23/sub-videos/', 'prefix' : 'keynotes23_split'},
    'Behind-the-Bell-Intel' : {'path' : '/data1/tile_gh/Multimodal-RAG/videos/BehindTheBell/sub-videos/', 'prefix' : 'Behind the Bell Intel_split'},
    'Foundry-Connect' : {'path' : '/data1/tile_gh/Multimodal-RAG/videos/SamPatTalkAI/sub-videos/', 'prefix' : 'Sam Altman and Pat Gelsinger Talk Artificial Intelligence_split'},
    'Chips Act Funding Announcement' : {'path' : '/data1/tile_gh/Multimodal-RAG/videos/IntelChipsFundingAnnounce/sub-videos/', 'prefix' : 'Intel Celebrates CHIPS and Science Act Direct Funding Announcement (Replay)_split'},
    '22nm-transistor-animation' : {'path' : '/data1/tile_gh/Multimodal-RAG/videos/MarkBohrExplains22nm/sub-videos/', 'prefix' : 'Video Animation Mark Bohr Gets Small 22nm Explained Intel_split'},
    '14nm-transistor-animation' : {'path' : '/data1/tile_gh/Multimodal-RAG/videos/MarkBohrExplains14nm/sub-videos/', 'prefix' : 'Explanation of Intels 14nm Process_split'},
}


@dataclasses.dataclass
class Conversation:
    """A class that keeps all conversation history."""
    system: str
    roles: List[str]
    messages: List[List[str]]
    offset: int
    sep_style: SeparatorStyle = SeparatorStyle.SINGLE
    sep: str = "\n"
    sep2: str = None
    version: str = "Unknown"
    path_to_img: str = None
    video_title: str = None
    caption: str = None
    skip_next: bool = False

    def _template_caption(self):
        out = ""
        if self.caption is not None:
            out = f"The caption associated with the image is '{self.caption}'. "
        return out

    def get_prompt(self):
        messages = self.messages
        if len(messages) > 1 and messages[1][1] is not None and "<image>" not in messages[0][1]:
            # If the history already has a reply and "<image>" is not yet in the first
            # user message, prepend "<image>\n" (plus the optional caption) to it.
            messages = self.messages.copy()
            init_role, init_msg = messages[0].copy()
            messages[0] = (init_role, "<image>\n" + self._template_caption() + init_msg)

        if len(messages) > 1 and messages[1][1] is None:
            # RAG retrieval step: the prompt is the user query only.
            ret = messages[0][1]
        else:
            if self.sep_style == SeparatorStyle.SINGLE:
                # Simple "ROLE: message<sep>" concatenation.
                ret = ""
                for role, message in messages:
                    if message:
                        ret += role + ": " + message + self.sep
                    else:
                        ret += role + ":"
            elif self.sep_style == SeparatorStyle.LLAMA_2:
                # LLaMA-2 / Mistral instruct formatting: user turns are wrapped in
                # [INST] ... [/INST] and assistant turns are closed with sep2 (e.g. </s>).
                wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n" if len(msg) > 0 else msg
                wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
                ret = ""
                for i, (role, message) in enumerate(messages):
                    if i == 0:
                        assert message, "first message should not be none"
                        assert role == self.roles[0], "first message should come from user"
                    if message:
                        if type(message) is tuple:
                            message, _, _ = message
                        if i == 0:
                            message = wrap_sys(self.system) + message
                        if i % 2 == 0:
                            message = wrap_inst(message)
                            ret += self.sep + message
                        else:
                            ret += " " + message + " " + self.sep2
                    else:
                        ret += ""
                ret = ret.lstrip(self.sep)
            else:
                raise ValueError(f"Invalid style: {self.sep_style}")
        return ret

    def append_message(self, role, message):
        self.messages.append([role, message])

    def get_images(self, return_pil=False):
        images = []
        if self.path_to_img is not None:
            path_to_image = self.path_to_img
            images.append(path_to_image)
            # import base64
            # from io import BytesIO
            # from PIL import Image
            # image = Image.open(path_to_image)
            # max_hw, min_hw = max(image.size), min(image.size)
            # aspect_ratio = max_hw / min_hw
            # max_len, min_len = 800, 400
            # shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
            # longest_edge = int(shortest_edge * aspect_ratio)
            # W, H = image.size
            # if longest_edge != max(image.size):
            #     if H > W:
            #         H, W = longest_edge, shortest_edge
            #     else:
            #         H, W = shortest_edge, longest_edge
            #     image = image.resize((W, H))
            # if return_pil:
            #     images.append(image)
            # else:
            #     # buffered = BytesIO()
            #     # # image.save(buffered, format="PNG")
            #     # img_b64_str = base64.b64encode(buffered.getvalue()).decode()
            #     images.append(path_to_image)
        return images

    def to_gradio_chatbot(self):
        ret = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                if type(msg) is tuple:
                    import base64
                    from io import BytesIO
                    msg, image, image_process_mode = msg
                    # Downscale the image so its longest edge fits the chat widget.
                    max_hw, min_hw = max(image.size), min(image.size)
                    aspect_ratio = max_hw / min_hw
                    max_len, min_len = 800, 400
                    shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
                    longest_edge = int(shortest_edge * aspect_ratio)
                    W, H = image.size
                    if H > W:
                        H, W = longest_edge, shortest_edge
                    else:
                        H, W = shortest_edge, longest_edge
                    image = image.resize((W, H))
                    buffered = BytesIO()
                    image.save(buffered, format="JPEG")
                    img_b64_str = base64.b64encode(buffered.getvalue()).decode()
                    # The buffer holds JPEG bytes, so declare the matching MIME type.
                    img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
                    msg = img_str + msg.replace('<image>', '').strip()
                    ret.append([msg, None])
                else:
                    ret.append([msg, None])
            else:
                ret[-1][-1] = msg
        return ret

    def copy(self):
        return Conversation(
            system=self.system,
            roles=self.roles,
            messages=[[x, y] for x, y in self.messages],
            offset=self.offset,
            sep_style=self.sep_style,
            sep=self.sep,
            sep2=self.sep2,
            version=self.version,
        )

    def dict(self):
        return {
            "system": self.system,
            "roles": self.roles,
            "messages": self.messages,
            "offset": self.offset,
            "sep": self.sep,
            "sep2": self.sep2,
            "path_to_img": self.path_to_img,
            "video_title": self.video_title,
            "caption": self.caption,
        }

    def get_path_to_subvideos(self):
        print(f"self.video_title {self.video_title}")
        print(f"self.path_to_img {self.path_to_img}")
        if self.video_title is not None and self.path_to_img is not None:
            # Recover the clip index from the retrieved frame filename and build the
            # path to the matching sub-video clip.
            info = video_helper_map[self.video_title]
            path = info['path']
            prefix = info['prefix']
            vid_index = self.path_to_img.split('/')[-1]
            vid_index = vid_index.split('_')[-1]
            vid_index = vid_index.replace('.jpg', '')
            ret = f"{prefix}{vid_index}.mp4"
            ret = os.path.join(path, ret)
            return ret
        elif self.path_to_img is not None:
            return self.path_to_img
        return None

multimodal_rag = Conversation(
    system="",
    roles=("USER", "ASSISTANT"),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.SINGLE,
    sep="\n",
    path_to_img=None,
    video_title=None,
    caption=None,
)

conv_mistral_instruct = Conversation(
    system="",
    roles=("USER", "ASSISTANT"),
    version="llama_v2",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.LLAMA_2,
    sep="",
    sep2="</s>",
    path_to_img=None,
    video_title=None,
    caption=None,
)

default_conversation = multimodal_rag
conv_templates = {
    "default": multimodal_rag,
    "multimodal_rag": multimodal_rag,
    "llavamed_rag": conv_mistral_instruct,
}

if __name__ == "__main__":
    print(default_conversation.get_prompt())
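
    # Minimal usage sketch (illustrative only): copy a template, fill it with
    # append_message, and render the prompt string. The question and answer
    # strings below are made-up placeholders, not part of the project.
    conv = conv_templates["llavamed_rag"].copy()
    conv.append_message(conv.roles[0], "What is shown in this frame?")
    conv.append_message(conv.roles[1], "A placeholder answer used only for this demo.")
    print(conv.get_prompt())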