# +
import base64
import io
import os
from io import BytesIO
from typing import Any, Dict, List

import torch
from peft import PeftConfig, PeftModel
from PIL import Image
from transformers import AutoProcessor, Blip2ForConditionalGeneration
# -

# Device the preprocessed image tensors are moved to before generation.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class EndpointHandler():
    """Image-captioning handler backed by a PEFT-adapted BLIP-2 model.

    The instance is callable: it receives a request payload carrying one
    image and returns the caption generated for it.
    """

    def __init__(self, path=""):
        """Load the base model, the PEFT adapter and the processor.

        Args:
            path: Accepted for interface compatibility; not used. The
                adapter and processor are loaded from the hard-coded hub
                ids below.
        """
        print("####### Start Deploying #####")

        peft_model_id = "ChirathD/Blip-2-test-4"
        config = PeftConfig.from_pretrained(peft_model_id)

        # The adapter config names the base checkpoint it was trained on.
        base_model = Blip2ForConditionalGeneration.from_pretrained(
            config.base_model_name_or_path,
            load_in_8bit=True,
            device_map="auto",
        )
        self.model = PeftModel.from_pretrained(base_model, peft_model_id)
        self.processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")

    @staticmethod
    def _load_image(payload: Any) -> Image.Image:
        """Turn a request payload into a PIL image.

        Args:
            payload: A base64-encoded image (``str``, ``bytes`` or
                ``bytearray``), optionally prefixed with a
                ``data:...;base64,`` URI header, or an already decoded
                ``PIL.Image.Image``.

        Returns:
            The opened image.

        Raises:
            TypeError: If ``payload`` is none of the supported types.
        """
        if isinstance(payload, Image.Image):
            return payload
        if not isinstance(payload, (str, bytes, bytearray)):
            raise TypeError(
                "'inputs' must be a base64-encoded image (str or bytes), "
                f"not {type(payload).__name__}"
            )
        # b64decode silently drops characters outside the base64 alphabet,
        # so a data-URI header would corrupt the decoded bytes; strip it.
        if isinstance(payload, str) and payload.startswith("data:"):
            payload = payload.partition(",")[2]
        return Image.open(io.BytesIO(base64.b64decode(payload)))

    def __call__(self, data: Any) -> Dict[str, Any]:
        """Generate a caption for the image contained in ``data``.

        Args:
            data: Either a dict with an ``"inputs"`` entry holding the image
                (see ``_load_image`` for accepted forms) and an optional
                ``"parameters"`` dict of keyword arguments forwarded to
                ``model.generate``, or the image payload itself.

        Returns:
            A dict of the form ``{"captions": "<generated caption>"}``;
            the value is a single string, not a list.

        Raises:
            TypeError: If the image payload or ``"parameters"`` has an
                unsupported type.
        """
        # The request payload is deliberately not printed: it contains the
        # whole base64 image and would flood the logs on every call.
        if isinstance(data, dict):
            # Read with .get() so the caller's dict is left untouched.
            payload = data.get("inputs", data)
            parameters = data.get("parameters") or {}
        else:
            payload = data
            parameters = {}

        if not isinstance(parameters, dict):
            raise TypeError(
                f"'parameters' must be a dict, not {type(parameters).__name__}"
            )

        image = self._load_image(payload)

        generate_kwargs = dict(parameters)
        # Keep the historical default length cap unless the caller sets
        # their own limit.
        if "max_length" not in generate_kwargs and "max_new_tokens" not in generate_kwargs:
            generate_kwargs["max_length"] = 100

        model_inputs = self.processor(images=image, return_tensors="pt").to(
            device, torch.float16
        )
        generated_ids = self.model.generate(
            pixel_values=model_inputs.pixel_values, **generate_kwargs
        )
        generated_caption = self.processor.batch_decode(
            generated_ids, skip_special_tokens=True
        )[0]

        return {"captions": generated_caption}