LN1996 committed on
Commit
f512565
1 Parent(s): 09b9d77

Create app.py

Files changed (1)
  1. app.py +51 -0
app.py ADDED
@@ -0,0 +1,51 @@
+ import torch
+ import gradio as gr
+
+ from transformers import CLIPProcessor, CLIPModel
+
+ # Load the pretrained CLIP model and its processor once at startup.
+ model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+
+ def inference(input_img=None, input_text=None):
+     """Score an image against comma-separated text prompts with CLIP."""
+     if input_img is None or not input_text:
+         return None
+
+     # Tokenize the prompts and preprocess the image into model inputs.
+     prompts = [t.strip() for t in input_text.split(",")]
+     inputs = processor(text=prompts, images=input_img,
+                        return_tensors="pt", padding=True)
+     with torch.no_grad():
+         outputs = model(**inputs)
+     logits_per_image = outputs.logits_per_image  # image-text similarity scores
+     probs = logits_per_image.softmax(dim=1)  # softmax over prompts gives label probabilities
+
+     return ', '.join(str(p) for p in probs[0].numpy())
+
+
+ title = "CLIP OpenAI model"
+ description = "A simple Gradio interface to find similarity between images and text"
+ text_examples = ["A man and a dog, A man wearing a blue coat with a dog inside",
+                  "Train tracks and a train, A dog playing in the field",
+                  "An outdoor seating glass box, A movie theater",
+                  "A building",
+                  "A building and multiple cars on the road",
+                  "A living area",
+                  "A dining room, A football stadium",
+                  "A red car",
+                  "A chair and a book, A book and a chair",
+                  "A man and a horse",
+                  ]
+ # Pair each example image with its text prompts (text_examples was previously unused).
+ examples = [['examples/test_' + str(i) + '.jpg', text_examples[i]] for i in range(10)]
+
+ demo = gr.Interface(inference,
+                     inputs=[gr.Image(label="Input image"),
+                             gr.Textbox(placeholder="Input text (Multiple entries separated by commas)")],
+                     outputs=[gr.Textbox(label="Similarity score between the input image and input text")],
+                     title=title,
+                     description=description,
+                     examples=examples)
+ demo.launch()
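
For a quick local sanity check, inference() can also be called directly in a Python session (a minimal sketch, assuming the app's dependencies are installed; the COCO image URL and the prompt string are arbitrary test values, not part of this repo's examples/ folder):

import requests
from PIL import Image

# Fetch a public test photo over HTTP (hypothetical choice; any PIL image works).
image = Image.open(requests.get(
    "http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)

# Prints one probability per comma-separated prompt, e.g. "0.99..., 0.00...".
print(inference(image, "two cats on a couch, a dog in a field"))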