florentgbelidji (HF staff) committed
Commit: baa2ff5
Parent: 993e825

Updating docstring, loading local model weights and adding parameters

Files changed (1)
  pipeline.py  +21 -11
pipeline.py CHANGED
@@ -5,7 +5,7 @@ import torch
 import base64
 import os
 from io import BytesIO
-from blip import blip_decoder
+from models.blip_decoder import blip_decoder
 from torchvision import transforms
 from torchvision.transforms.functional import InterpolationMode
 
@@ -13,10 +13,15 @@ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 print(device)
 
 class PreTrainedPipeline():
-    def __init__(self, path=""):
+    def __init__(self):
         # load the optimized model
-        self.model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption.pth'
-        self.model = blip_decoder(pretrained=self.model_url, image_size=384, vit='large',med_config=os.path.join(path, 'configs/med_config.json'))
+        self.model_path = 'model_base_capfilt_large.pth'
+        self.model = blip_decoder(
+            pretrained=self.model_path,
+            image_size=384,
+            vit='large',
+            med_config=os.path.join(path, 'configs/med_config.json')
+        )
         self.model.eval()
         self.model = self.model.to(device)
 
@@ -29,23 +34,28 @@ class PreTrainedPipeline():
 
 
 
-    def __call__(self, data: Any) -> List[List[Dict[str, float]]]:
+    def __call__(self, data: Any) -> Dict[str]:
         """
         Args:
             data (:obj:):
                 includes the input data and the parameters for the inference.
         Return:
-            A :obj:`list`:. The object returned should be a list of one list like [[{"label": 0.9939950108528137}]] containing :
-                - "label": A string representing what the label/class is. There can be multiple labels.
-                - "score": A score between 0 and 1 describing how confident the model is for this label/class.
+            A :obj:`dict`:. The object returned should be a dict of one list like [[{"label": 0.9939950108528137}]] containing :
+                - "caption": A string corresponding to the generated caption.
         """
         inputs = data.pop("inputs", data)
-        parameters = data.pop("parameters", None)
+        parameters = data.pop("parameters", {})
 
         # decode base64 image to PIL
         image = Image.open(BytesIO(base64.b64decode(inputs['image'])))
         image = self.transform(image).unsqueeze(0).to(device)
         with torch.no_grad():
-            caption = self.model.generate(image, sample=True, top_p=0.9, max_length=20, min_length=5)
+            caption = self.model.generate(
+                image,
+                sample=parameters.get('sample',True),
+                top_p=parameters.get('top_p',0.9),
+                max_length=parameters.get('max_length',20),
+                min_length=parameters.get('min_length',5)
+            )
         # postprocess the prediction
-        return caption
+        return {"caption": caption}