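"""Gradio demo that uses OpenAI's CLIP to score how well each of several
text captions matches an input image."""
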
import torch
import gradio as gr

## CLIP 
from transformers import CLIPProcessor, CLIPModel

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
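
# CLIP embeds images and texts into a shared space; logits_per_image below are
# the (temperature-scaled) similarity scores between the image and each caption.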

def inference(input_img=None, input_text=None):
    if input_img is None or input_text is None:
        return None

    # Split the comma-separated captions and preprocess them with the image.
    captions = [t.strip() for t in input_text.split(",")]
    inputs = processor(text=captions, images=input_img,
                       return_tensors="pt", padding=True)
    with torch.no_grad():  # no gradients needed at inference time
        outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image  # image-text similarity scores, shape (1, n_captions)
    probs = logits_per_image.softmax(dim=1)  # softmax over captions gives label probabilities

    # Report one probability per caption as a comma-separated string.
    return ', '.join(str(p) for p in probs[0].tolist())
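
# Example (hypothetical local path): the function can also be called directly, e.g.
#   from PIL import Image
#   inference(Image.open("examples/test_0.jpg"), "A man and a dog, A train")
# which returns a comma-separated string with one probability per caption.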


title = "CLIP OpenAI model"
description = "A simple Gradio interface that scores the similarity between an image and a set of text captions"
text_examples = ["A man and a dog, A man wearing a blue coat with a dog inside",
                 "Train tracks and a train, A dog playing in the field",
                 "An outdoor seating glass box, A movie theater",
                 "A building, A building and multiple cars on the road",
                 "A living area, Planet earth",
                 "A dining room, A football stadium",
                 "A red car, A yellow car",
                 "A chair and a book, A building falling",
                 "A man and a horse, A child playing with a dog",
                 "A man and a horse, A child playing with a dog"
                 ]
examples = [[f'examples/test_{i}.jpg', text] for i, text in enumerate(text_examples)]

demo = gr.Interface(
    inference,
    inputs=[gr.Image(label="Input image"),
            gr.Textbox(placeholder="Input text (multiple entries separated by commas)")],
    outputs=[gr.Textbox(label="Similarity score between the input image and input text")],
    title=title,
    description=description,
    examples=examples,
)
demo.launch()