Spaces:
Sleeping
Sleeping
from PIL import Image | |
from transformers import VisionEncoderDecoderModel , ViTFeatureExtractor , PreTrainedTokenizerFast | |
import gradio as gr | |
# Pretrained checkpoint identifiers used by the captioning pipeline.
CAPTION_MODEL_ID = "ydshieh/vit-gpt2-coco-en"
FEATURE_EXTRACTOR_ID = "google/vit-base-patch32-224-in21k"
TOKENIZER_ID = "distilgpt2"

# Loaded once at import time so every request reuses the same objects.
# NOTE(review): the feature extractor and tokenizer are loaded from different
# checkpoints than the caption model itself -- confirm they match the
# preprocessing and vocabulary the caption model expects.
model = VisionEncoderDecoderModel.from_pretrained(CAPTION_MODEL_ID)
vit_feature_extractor = ViTFeatureExtractor.from_pretrained(FEATURE_EXTRACTOR_ID)
tokenizer = PreTrainedTokenizerFast.from_pretrained(TOKENIZER_ID)
def caption_images(image):
    """Generate a caption for a single image.

    Args:
        image: The picture to describe, as delivered by the Gradio image
            input (declared in this file with ``type='pil'``). ``None`` is
            accepted and means that no image was supplied.

    Returns:
        The first decoded caption with surrounding whitespace stripped, or
        an empty string when ``image`` is ``None``.
    """
    # Gradio passes None when the form is submitted without an image; return
    # an empty caption instead of failing inside the feature extractor.
    if image is None:
        return ""

    pixel_values = vit_feature_extractor(
        images=image, return_tensors="pt"
    ).pixel_values
    # Beam search with 5 beams; the input tensor is explicitly placed on CPU.
    generated_ids = model.generate(pixel_values.to("cpu"), num_beams=5)
    captions = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return captions[0].strip()
# Static text and the sample input shown in the web UI.
title = "Simple Image captioning Application"
description = "Upload an image to see the caption generated"
example = ["messi.jpg"]

# One image input (handed to the callback as a PIL image) and one text output.
inputs = [gr.components.Image(type="pil", label="Original Image")]
outputs = [gr.components.Textbox(label="Caption")]

# Wire the captioning function to the components declared above.
demo = gr.Interface(
    fn=caption_images,
    inputs=inputs,
    outputs=outputs,
    title=title,
    description=description,
    examples=example,
)

# Start the app with Gradio's debug mode enabled.
demo.launch(debug=True)