"""Serving handler for a LLaVA-Mistral multimodal model: each request carries
msgpack-packed image/video frames and returns generated text descriptions."""

import sys
import os
import random
import base64
import logging
import time
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from io import BytesIO

import msgpack
import numpy as np
import torch
from PIL import Image
from transformers import AutoTokenizer, CLIPImageProcessor

from llava.constants import (
    MM_TOKEN_INDEX,
    DEFAULT_VIDEO_START_TOKEN,
    DEFAULT_VIDEO_END_TOKEN,
    DEFAULT_VIDEO_TOKEN,
    DEFAULT_IMAGE_TOKEN,
    DEFAULT_IM_START_TOKEN,
    DEFAULT_IM_END_TOKEN,
)
from llava.conversation import conv_templates, SeparatorStyle
from llava.utils import disable_torch_init
from llava.mm_utils import (
    tokenizer_image_token,
    get_model_name_from_path,
    KeywordsStoppingCriteria,
    process_images_v2,
)
from llava.model.builder import load_pretrained_model
from llava.model.multimodal_encoder.processor import Blip2ImageTrainProcessor
from llava.model import LlavaMistralForCausalLM


def select_frames(input_frames, num_segments=10):
    # Uniformly sample `num_segments` frames across the whole clip.
    indices = np.linspace(start=0, stop=len(input_frames) - 1, num=num_segments).astype(int)
    frames = [input_frames[ind] for ind in indices]
    return frames


def load_model(model_path, device_map):
    kwargs = {"device_map": device_map}
    kwargs['torch_dtype'] = torch.float16
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = LlavaMistralForCausalLM.from_pretrained(
        model_path,
        low_cpu_mem_usage=True,
        **kwargs
    )

    # Register the image/video boundary tokens and grow the embedding table to match.
    tokenizer.add_tokens(
        [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_VIDEO_START_TOKEN, DEFAULT_VIDEO_END_TOKEN],
        special_tokens=True,
    )
    model.resize_token_embeddings(len(tokenizer))

    # Load the vision tower weights if the checkpoint did not ship them pre-loaded.
    vision_tower = model.get_vision_tower()
    if not vision_tower.is_loaded:
        vision_tower.load_model(device_map=device_map)

    return model, tokenizer


class EndpointHandler:

    def __init__(self):
        model_path = './masp_094_v2'
        disable_torch_init()
        model_path = os.path.expanduser(model_path)

        model_name = get_model_name_from_path(model_path)

        model, tokenizer = load_model(model_path, device_map={"": 0})

        image_processor = Blip2ImageTrainProcessor(
            image_size=model.config.img_size,
            is_training=False)

        # Local preprocessing smoke test, kept for reference (a bare string literal, so it has no effect).
        """
        import os
        from PIL import Image
        input_dir = './v12044gd0000clg1n4fog65p7pag5n6g/video'
        image_paths = os.listdir(input_dir)
        images = [Image.open(os.path.join(input_dir, item)) for item in image_paths]
        num_segments = 10
        images = images[:num_segments]

        import torch
        device = torch.device('cuda:0')
        image_processor = Blip2ImageTrainProcessor(
            image_size=224,
            is_training=False)
        images_tensor = [image_processor.preprocess(image).cpu().to(device) for image in images]
        """

        self.tokenizer = tokenizer
        self.device = torch.device('cuda:0')
        self.model = model.to(self.device)

        self.image_processor = image_processor
        self.conv_mode = 'v1'

    def inference_frames_batch(self, batch_image_lists, batch_prompts, batch_temperatures):
        start_time = time.perf_counter()

        images_tensors_list = []
        input_ids_list = []
        for images, prompt in zip(batch_image_lists, batch_prompts):
            # Normalize every request to exactly 10 frames: subsample longer clips
            # uniformly, pad shorter ones by repeating the last frame.
            if len(images) > 10:
                images = select_frames(images)
            if len(images) < 10:
                images += [images[-1]] * (10 - len(images))

            images_tensor = process_images_v2(images, self.image_processor, self.model.config)
            images_tensor = images_tensor.half().to(self.device)
            images_tensors_list.append(images_tensor)

            # Choose image or video delimiters depending on the frame count.
            if len(images) == 1:
                qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + prompt
            else:
                qs = DEFAULT_VIDEO_START_TOKEN + DEFAULT_VIDEO_TOKEN + DEFAULT_VIDEO_END_TOKEN + '\n' + prompt

            conv = conv_templates[self.conv_mode].copy()
            conv.append_message(conv.roles[0], qs)
            conv.append_message(conv.roles[1], None)
            prompt_text = conv.get_prompt()

            input_ids = tokenizer_image_token(prompt_text, self.tokenizer, MM_TOKEN_INDEX, return_tensors='pt').squeeze(0)
            input_ids_list.append(input_ids)

        # Pad all prompts in the batch to the longest one.
        input_ids_padded = torch.nn.utils.rnn.pad_sequence(
            input_ids_list,
            batch_first=True,
            padding_value=self.tokenizer.pad_token_id
        ).to(self.device)

        conv = conv_templates[self.conv_mode].copy()
        stop_str = conv.sep if conv.sep2 is None else conv.sep2
        keywords = [stop_str]
        stopping_criteria = KeywordsStoppingCriteria(keywords, self.tokenizer, input_ids_padded)

        # generate() takes a single temperature per call, so the whole batch is
        # sampled with the first request's value.
        temperature = batch_temperatures[0]

        with torch.inference_mode():
            output_ids = self.model.generate(
                input_ids_padded,
                images=images_tensors_list,
                temperature=temperature,
                do_sample=True,
                top_p=None,
                num_beams=1,
                no_repeat_ngram_size=3,
                max_new_tokens=1024,
                use_cache=True,
                stopping_criteria=[stopping_criteria],
            )

        outputs = []
        for output_id in output_ids:
            output = self.tokenizer.decode(output_id, skip_special_tokens=True).strip()
            # str.rstrip(stop_str) would strip characters, not the suffix, so trim it explicitly.
            if output.endswith(stop_str):
                output = output[:-len(stop_str)]
            outputs.append(output.strip())

        end_time = time.perf_counter()
        latency = end_time - start_time
        print(f"Latency for this batch inference: {latency:.4f} seconds")

        return outputs

    def __call__(self, request):
        # Each entry in request['images'] is a msgpack-packed list of encoded image bytes;
        # 'prompt' and 'temperature' are optional byte strings, one per request.
        packed_data_list = request['images']
        prompt_list = request.get('prompt', [''.encode()] * len(packed_data_list))
        temperature_list = request.get('temperature', ['0.01'.encode()] * len(packed_data_list))

        all_image_lists = []
        all_prompts = []
        all_temperatures = []

        for packed_data, prompt_encoded, temperature_encoded in zip(packed_data_list, prompt_list, temperature_list):
            unpacked_data = msgpack.unpackb(packed_data, raw=False)
            image_list = [Image.open(BytesIO(byte_data)).convert('RGB') for byte_data in unpacked_data]
            all_image_lists.append(image_list)

            # Fall back to a default description prompt when none is supplied.
            prompt = prompt_encoded.decode()
            if prompt == '':
                if len(image_list) == 1:
                    prompt = "Please describe this image in detail."
                else:
                    prompt = "Describe the following video in detail."
            all_prompts.append(prompt)

            temperature = float(temperature_encoded.decode())
            all_temperatures.append(temperature)

        with torch.no_grad():
            outputs = self.inference_frames_batch(all_image_lists, all_prompts, all_temperatures)

        return {'output': outputs}


def benchmark_qps_batched(handler, batched_request, num_batches=10):
    # Replay the same batch repeatedly and report end-to-end samples per second.
    start_time = time.perf_counter()
    completed_samples = 0

    for _ in range(num_batches):
        handler(batched_request)
        completed_samples += len(batched_request['images'])

    end_time = time.perf_counter()
    total_time = end_time - start_time
    qps = completed_samples / total_time
    print(f"Processed {completed_samples} samples in {total_time:.2f} seconds. QPS: {qps:.2f}")


if __name__ == "__main__":
    video_dir = './v12044gd0000cl5c6rfog65i2eoqcqig'

    # Load the extracted frames in numeric order (file names are frame indices).
    frames = [(int(os.path.splitext(item)[0]), os.path.join(video_dir, item)) for item in os.listdir(video_dir)]
    frames = [item[1] for item in sorted(frames, key=lambda x: x[0])]
    out_frames = [Image.open(frame).convert('RGB') for frame in frames]

    batch_size = 4

    # Build a batched request: each sample is a msgpack-packed list of JPEG bytes,
    # plus byte-encoded prompt and temperature fields.
    batched_packed_data = []
    batched_prompts = []
    batched_temperatures = []

    for _ in range(batch_size):
        byte_images = []
        for img in out_frames:
            byte_io = BytesIO()
            img.save(byte_io, format='JPEG')
            byte_images.append(byte_io.getvalue())

        packed_data = msgpack.packb(byte_images)
        batched_packed_data.append(packed_data)

        batched_prompts.append(''.encode())
        batched_temperatures.append('0.01'.encode())

    batched_request = {
        'images': batched_packed_data,
        'prompt': batched_prompts,
        'temperature': batched_temperatures,
    }

    handler = EndpointHandler()

    response = handler(batched_request)
    print(response)
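    # Optional throughput check using the same batched_request built above;
    # numbers will vary with GPU, batch size, and frame count.
    # benchmark_qps_batched(handler, batched_request, num_batches=10)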