# OneChart / modeling_OneChart.py
from transformers import OPTConfig, OPTModel, OPTForCausalLM, StoppingCriteria, TextStreamer
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from typing import List, Optional, Tuple, Union
import requests
from PIL import Image
from io import BytesIO
import json
import re
import torch
import numpy as np
import torch.nn as nn
from torch.nn import CrossEntropyLoss
import torch.nn.functional as F
from .sam_vision_b import build_SAM_vit_b
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
import dataclasses
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = '<imgpad>'
DEFAULT_IM_START_TOKEN = '<img>'
DEFAULT_IM_END_TOKEN = '</img>'
from enum import auto, Enum
class SeparatorStyle(Enum):
"""Different separator style."""
SINGLE = auto()
TWO = auto()
MPT = auto()
@dataclasses.dataclass
class Conversation:
"""A class that keeps all conversation history."""
system: str
roles: List[str]
messages: List[List[str]]
offset: int
sep_style: SeparatorStyle = SeparatorStyle.SINGLE
sep: str = "<|im_end|>"
sep2: str = None
version: str = "Unknown"
skip_next: bool = False
def get_prompt(self):
if self.sep_style == SeparatorStyle.SINGLE:
ret = self.system + self.sep + '\n'
for role, message in self.messages:
if message:
if type(message) is tuple:
message, _, _ = message
ret += role + ": " + message + self.sep
else:
ret += role + ":"
return ret
elif self.sep_style == SeparatorStyle.TWO:
seps = [self.sep, self.sep2]
ret = self.system + seps[0]
for i, (role, message) in enumerate(self.messages):
if message:
if type(message) is tuple:
message, _, _ = message
ret += role + ": " + message + seps[i % 2]
else:
ret += role + ":"
return ret
        elif self.sep_style == SeparatorStyle.MPT:
if self.system:
ret = self.system + self.sep
else:
ret = ''
for role, message in self.messages:
if message:
if type(message) is tuple:
message, _, _ = message
ret += role + message + self.sep
else:
ret += role
return ret
else:
raise ValueError(f"Invalid style: {self.sep_style}")
def append_message(self, role, message):
self.messages.append([role, message])
def copy(self):
return Conversation(
system=self.system,
roles=self.roles,
messages=[[x, y] for x, y in self.messages],
offset=self.offset,
sep_style=self.sep_style,
sep=self.sep,
sep2=self.sep2)
class KeywordsStoppingCriteria(StoppingCriteria):
def __init__(self, keywords, tokenizer, input_ids):
self.keywords = keywords
self.keyword_ids = [tokenizer(keyword).input_ids for keyword in keywords]
self.keyword_ids = [keyword_id[0] for keyword_id in self.keyword_ids if type(keyword_id) is list and len(keyword_id) == 1]
self.tokenizer = tokenizer
self.start_len = None
self.input_ids = input_ids
def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
if self.start_len is None:
self.start_len = self.input_ids.shape[1]
else:
for keyword_id in self.keyword_ids:
if output_ids[0, -1] == keyword_id:
return True
outputs = self.tokenizer.batch_decode(output_ids[:, self.start_len:], skip_special_tokens=True)[0]
for keyword in self.keywords:
if keyword in outputs:
return True
return False
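# KeywordsStoppingCriteria above ends generation as soon as any keyword (here the
# stop string '</s>') appears, either as the most recently generated token id or
# anywhere in the text decoded after the prompt. The first call only records the
# prompt length (start_len); keyword matching starts from the second call onward.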
conv_vicuna_v1_1 = Conversation(
system="A chat between a curious user and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the user's questions.",
roles=("USER", "ASSISTANT"),
version="v1",
messages=(),
offset=0,
sep_style=SeparatorStyle.TWO,
sep=" ",
sep2="</s>",
)
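# Illustrative note (not from the original file): with SeparatorStyle.TWO,
# sep=" " and sep2="</s>", a single user turn followed by an empty assistant
# slot renders as
#   "<system prompt> USER: <image tokens + question> ASSISTANT:"
# so generation continues right after "ASSISTANT:" and is expected to stop at "</s>".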
class OneChartImageEvalProcessor:
def __init__(self, image_size=1024):
mean = (0., 0., 0.)
std = (1., 1., 1.)
self.normalize = transforms.Normalize(mean, std)
self.transform = transforms.Compose(
[
transforms.Resize(
(image_size, image_size), interpolation=InterpolationMode.BICUBIC
),
transforms.ToTensor(),
self.normalize,
]
)
def __call__(self, item):
return self.transform(item)
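# Minimal usage sketch (illustrative): the processor resizes any PIL image to
# image_size x image_size and returns a float tensor of shape (3, H, W); with
# mean (0, 0, 0) and std (1, 1, 1) the normalization is a no-op, so pixel values
# stay in [0, 1].
#
#   processor = OneChartImageEvalProcessor(image_size=1024)
#   pixel_values = processor(Image.open("chart.png").convert("RGB"))  # torch.Size([3, 1024, 1024])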
class OneChartConfig(OPTConfig):
model_type = "OneChart"
class OneChartModel(OPTModel):
config_class = OneChartConfig
def __init__(self, config: OPTConfig):
super(OneChartModel, self).__init__(config)
self.vision_tower = build_SAM_vit_b()
self.mm_projector = nn.Linear(1024, 768)
def embed_tokens(self, x):
return self.get_input_embeddings()(x)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
images: Optional[torch.FloatTensor] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPast]:
if inputs_embeds is None:
inputs_embeds = self.embed_tokens(input_ids)
vision_tower_high = getattr(self, 'vision_tower', None)
if vision_tower_high is not None and (input_ids.shape[1] != 1 or self.training) and images is not None:
use_im_start_end = getattr(self.config, "use_im_start_end", -1)
vision_select_layer = getattr(self.config, "vision_select_layer", -1)
im_patch_token = getattr(self.config, "im_patch_token", -1)
im_start_token = getattr(self.config, "im_start_token", -1)
im_end_token = getattr(self.config, "im_end_token", -1)
freeze_vision_tower = getattr(self.config, "freeze_vision_tower", False)
image_features = []
for image in images:
P, C, H, W = image.shape
if P == 1:
with torch.set_grad_enabled(False):
cnn_feature = vision_tower_high(image)
cnn_feature = cnn_feature.flatten(2).permute(0, 2, 1) # 256*1024
image_feature = self.mm_projector(cnn_feature)
image_features.append(image_feature)
else:
raise NotImplementedError("Batch inference needs to be implemented.")
use_im_start_end = True
new_input_embeds = []
for cur_input_ids, cur_input_embeds, cur_image_features in zip(input_ids, inputs_embeds, image_features):
if use_im_start_end:
if (cur_input_ids == im_start_token).sum() != (cur_input_ids == im_end_token).sum():
raise ValueError("The number of image start tokens and image end tokens should be the same.")
image_start_tokens = torch.where(cur_input_ids == im_start_token)[0]
for image_start_token_pos, per_cur_image_features in zip(image_start_tokens, cur_image_features):
per_cur_image_features = per_cur_image_features.to(device=cur_input_embeds.device)
num_patches = per_cur_image_features.shape[0]
if cur_input_ids[image_start_token_pos + num_patches + 1] != im_end_token:
raise ValueError("The image end token should follow the image start token.")
cur_input_embeds = torch.cat(
(
cur_input_embeds[:image_start_token_pos+1],
per_cur_image_features,
cur_input_embeds[image_start_token_pos + num_patches + 1:]
),
dim=0
)
new_input_embeds.append(cur_input_embeds)
else:
raise NotImplementedError
inputs_embeds = torch.stack(new_input_embeds, dim=0)
return super(OneChartModel, self).forward(
input_ids=None, attention_mask=attention_mask, past_key_values=past_key_values,
inputs_embeds=inputs_embeds, use_cache=use_cache,
output_attentions=output_attentions, output_hidden_states=output_hidden_states,
return_dict=return_dict
)
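# How image features enter the sequence (summary of OneChartModel.forward above):
# each 1024x1024 chart image is encoded by the SAM-ViT-b tower (run without
# gradients here) into a feature map that is flattened to 256 tokens of dimension
# 1024, projected to the 768-dim text embedding space by mm_projector, and spliced
# into inputs_embeds right after the <img> start token, replacing the 256 <imgpad>
# placeholder embeddings that precede the </img> end token.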
class OneChartOPTForCausalLM(OPTForCausalLM):
config_class = OneChartConfig
def __init__(self, config):
super(OneChartOPTForCausalLM, self).__init__(config)
self.model = OneChartModel(config)
self.vocab_size = config.vocab_size
self.num_decoder = nn.Sequential(
nn.Linear(config.hidden_size, config.hidden_size // 2),
nn.ReLU(),
nn.Linear(config.hidden_size // 2, config.hidden_size // 2),
nn.ReLU(),
nn.Linear(config.hidden_size // 2, 256),
)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.pred_locs = []
# Initialize weights and apply final processing
self.post_init()
def get_model(self):
return self.model
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
images: Optional[torch.FloatTensor] = None,
return_dict: Optional[bool] = None,
loc_labels=None,
) -> Union[Tuple, CausalLMOutputWithPast]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.model(
input_ids=input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
images=images,
return_dict=return_dict
)
hidden_states = outputs[0]
if (loc_labels is not None) and len(loc_labels) > 0:
det_patch_token = torch.where(input_ids == self.config.number_token)[1][0]
pred_locs = self.num_decoder(hidden_states[:, det_patch_token, :]) # shape: [batch_size, 256]
        # At inference, also expose the values predicted by the numerical head (num_decoder).
if not self.training:
try:
det_patch_token = torch.where(input_ids == self.config.number_token)[1][0]
pred_locs = self.num_decoder(hidden_states[:, det_patch_token, :]) # shape: [batch_size, 256]
self.pred_locs = pred_locs[0][:100].cpu().tolist()
except Exception as e:
pass
logits = self.lm_head(hidden_states)
logits = logits.float()
loss = None
if labels is not None:
# Shift so that tokens < n predict n
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
loss_fct = CrossEntropyLoss()
shift_logits = shift_logits.view(-1, self.config.vocab_size)
shift_labels = shift_labels.view(-1)
# Enable model parallelism
shift_labels = shift_labels.to(shift_logits.device)
loss = loss_fct(shift_logits, shift_labels)
if not return_dict:
output = (logits,) + outputs[1:]
return (loss,) + output if loss is not None else output
return CausalLMOutputWithPast(
loss=loss,
logits=logits,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
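    # Note on the numerical head: forward() locates the position of the special
    # number token (self.config.number_token) in input_ids and passes the decoder
    # hidden state at that position through num_decoder, which regresses a vector
    # of 256 values. At inference the first 100 of them are cached in
    # self.pred_locs so that chat() can compare them with the min-max normalized
    # numbers parsed from the generated python dict (the reliability check).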
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs
):
token_type_ids = kwargs.get("token_type_ids", None)
if past_key_values:
input_ids = input_ids[:, -1].unsqueeze(-1)
if token_type_ids is not None:
token_type_ids = token_type_ids[:, -1].unsqueeze(-1)
attention_mask = kwargs.get("attention_mask", None)
position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -1].unsqueeze(-1)
else:
position_ids = None
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
model_inputs.update(
{
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"position_ids": position_ids,
"attention_mask": attention_mask,
"token_type_ids": token_type_ids,
"images": kwargs.get("images", None),
}
)
return model_inputs
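    # During incremental decoding (past_key_values present) only the last token id
    # and position are fed back in; the images kwarg is still forwarded, but
    # OneChartModel.forward skips re-encoding once the input length is 1 outside
    # of training.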
def load_image(self, image_file):
        if image_file.startswith(('http://', 'https://')):
response = requests.get(image_file)
image = Image.open(BytesIO(response.content)).convert('RGB')
else:
image = Image.open(image_file).convert('RGB')
return image
def disable_torch_init(self):
"""
Disable the redundant torch default initialization to accelerate model creation.
"""
setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)
def chat(self, tokenizer, image_file, reliable_check=True, print_prompt=False):
device = "cuda" if torch.cuda.is_available() else "cpu"
# dtype = torch.bfloat16 if device=="cuda" else next(self.get_model().parameters()).dtype
        dtype = torch.float16 if device == "cuda" else torch.float32
# print(device, dtype)
def list_json_value(json_dict):
rst_str = []
sort_flag = True
try:
for key, value in json_dict.items():
if isinstance(value, dict):
decimal_out = list_json_value(value)
rst_str = rst_str + decimal_out
sort_flag = False
elif isinstance(value, list):
return []
else:
if isinstance(value, float) or isinstance(value, int):
rst_str.append(value)
else:
# num_value = value.replace("%", "").replace("$", "").replace(" ", "").replace(",", "")
value = re.sub(r'\(\d+\)|\[\d+\]', '', value)
num_value = re.sub(r'[^\d.-]', '', str(value))
if num_value not in ["-", "*", "none", "None", ""]:
rst_str.append(float(num_value))
except Exception as e:
print(f"Error: {e}")
# print(json_dict)
return []
# if len(rst_str) > 0:
# rst_str = rst_str + [float(-1)]
return rst_str
def norm_(rst_list):
if len(rst_list) < 2:
return rst_list
min_vals = min(rst_list)
max_vals = max(rst_list)
rst_list = np.array(rst_list)
normalized_tensor = (rst_list - min_vals) / (max_vals - min_vals + 1e-9)
return list(normalized_tensor)
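        # Worked example (illustrative): norm_([3, 7, 5]) min-max normalizes to
        # approximately [0.0, 1.0, 0.5], the same scale as the num_decoder output.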
self.disable_torch_init()
image_processor_high = OneChartImageEvalProcessor(image_size=1024)
use_im_start_end = True
image_token_len = 256
image = self.load_image(image_file)
image_tensor_1 = image_processor_high(image).to(dtype=dtype, device=device)
query = 'Convert the key information of the chart to a python dict:'
qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_PATCH_TOKEN * image_token_len + DEFAULT_IM_END_TOKEN + query + '\n'
conv = conv_vicuna_v1_1.copy()
conv.append_message(conv.roles[0], qs)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
if print_prompt:
print(prompt)
inputs = tokenizer([prompt])
input_ids = torch.as_tensor(inputs.input_ids).to(device=device)
stop_str = '</s>'
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
if device=='cuda':
with torch.autocast(device, dtype=dtype):
output_ids = self.generate(
input_ids,
images=[image_tensor_1.unsqueeze(0)],
do_sample=False,
num_beams = 1,
# no_repeat_ngram_size = 20,
# streamer=streamer,
max_new_tokens=4096,
stopping_criteria=[stopping_criteria]
)
else:
output_ids = self.generate(
input_ids,
images=[image_tensor_1.unsqueeze(0)],
do_sample=False,
num_beams = 1,
# no_repeat_ngram_size = 20,
# streamer=streamer,
max_new_tokens=4096,
stopping_criteria=[stopping_criteria]
)
outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:], skip_special_tokens=True)
outputs = outputs.replace("<Number>", "")
outputs = outputs.strip()
if outputs.endswith(stop_str):
outputs = outputs[:-len(stop_str)]
response_str = outputs
if reliable_check:
pred_nums = self.pred_locs
try:
outputs_json = json.loads(outputs)
list_v = list_json_value(outputs_json['values'])
list_v = [round(x,4) for x in norm_(list_v)]
gt_nums = torch.tensor(list_v).reshape(1,-1)
response_str = response_str + "\n<Chart>: " + str(pred_nums[:len(list_v)])
pred_nums_ = torch.tensor(pred_nums[:len(list_v)]).reshape(1,-1)
                reliable_distance = F.l1_loss(pred_nums_, gt_nums)
                response_str = response_str + "\nreliable_distance: " + str(reliable_distance)
                if reliable_distance < 0.1:
                    response_str = response_str + "\nAfter OneChart checking, this prediction is reliable."
                else:
                    response_str = response_str + "\nThis prediction may contain errors!"
            except Exception as e:
                response_str = response_str + "\nThis prediction may contain errors!"
                response_str = response_str + "\n" + str(e)
return response_str
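# ---------------------------------------------------------------------------
# End-to-end usage sketch (illustrative, not part of the original file). The
# repo id "kppkkp/OneChart" and the AutoModel/auto_map wiring are assumptions;
# adjust them to the checkpoint you actually use.
#
#   from transformers import AutoTokenizer, AutoModel
#   tokenizer = AutoTokenizer.from_pretrained("kppkkp/OneChart", trust_remote_code=True)
#   model = AutoModel.from_pretrained("kppkkp/OneChart", trust_remote_code=True).eval()
#   if torch.cuda.is_available():
#       model = model.to("cuda", dtype=torch.float16)
#   print(model.chat(tokenizer, "chart.png", reliable_check=True, print_prompt=False))
# ---------------------------------------------------------------------------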