import requests
import numpy as np
import gradio as gr
## CLIP model and processor (ViT-B/32 checkpoint) from Hugging Face Transformers
from transformers import CLIPProcessor, CLIPModel
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
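
# The inference function below does zero-shot image-text matching: CLIP embeds the
# image and each comma-separated text entry into a shared space, and the image-text
# similarity logits are turned into probabilities over the provided text entries.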
def inference(input_img=None, input_text=None):
    if input_img is not None and input_text is not None:
        # Tokenize the comma-separated text entries and preprocess the image
        inputs = processor(text=input_text.split(","), images=input_img, return_tensors="pt", padding=True)
        outputs = model(**inputs)
        logits_per_image = outputs.logits_per_image  # image-text similarity scores
        probs = logits_per_image.softmax(dim=1)  # softmax over text entries gives label probabilities
        # Format the probabilities as a comma-separated string, one value per text entry
        output_prob = ', '.join(str(p) for p in probs.detach().numpy()[0])
    else:
        # If either input is missing, return None so the output textbox stays empty
        output_prob = None
    return output_prob
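
# Example of calling inference() directly, outside the Gradio UI (a sketch that
# assumes PIL is installed and uses a placeholder image URL):
#   from PIL import Image
#   img = Image.open(requests.get("https://example.com/cat.jpg", stream=True).raw)
#   inference(img, "a photo of a cat, a photo of a dog")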
title = "CLIP OpenAI model"
description = "A simple Gradio interface to find similarity between images and text"
text_examples = ["A man and a dog, A man wearing a blue coat with a dog inside",
                 "Train tracks and a train, A dog playing in the field",
                 "An outdoor seating glass box, A movie theater",
                 "A building, A building and multiple cars on the road",
                 "A living area, Planet earth",
                 "A dining room, A football stadium",
                 "A red car, A yellow car",
                 "A chair and a book, A building falling",
                 "A man and a horse, A child playing with a dog",
                 "A man and a horse, A child playing with a dog"
                 ]
# Pair each bundled example image with its comma-separated candidate captions
examples = [['examples/test_' + str(i) + '.jpg', text_examples[i]] for i in range(10)]
demo = gr.Interface(inference,
                    inputs=[gr.Image(label="Input image"),
                            gr.Textbox(placeholder="Input text (Multiple entries separated by commas)")],
                    outputs=[gr.Textbox(label="Similarity score between the input image and input text")],
                    title=title,
                    description=description,
                    examples=examples
                    )
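
# launch() serves the app locally (Gradio defaults to http://127.0.0.1:7860);
# passing share=True would create a temporary public link if needed.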
demo.launch()