test-image-Caption

Sleeping

File size: 1,639 Bytes

70af6b0
 
947d2f8
70af6b0
c36694a
70af6b0
 
 
947d2f8
 
 
70af6b0
 
 
6bb6d88
 
947d2f8
c36694a
6bb6d88
 
 
70af6b0
d688b4b
 
947d2f8
70af6b0
6bb6d88
70af6b0
6bb6d88
70af6b0
6bb6d88
70af6b0
6bb6d88
70af6b0
6bb6d88
 
 
 
 
 
 
 
 
 
 
c36694a

import torch 
import gradio as gr
from transformers import AutoTokenizer, ViTImageProcessor, VisionEncoderDecoderModel 

# All inference in this script runs on the CPU.
device = "cpu"

# The image processor, the tokenizer and the model weights are all loaded
# from the same checkpoint identifier; the three names are kept separate so
# each loader below states which role it is filling.
model_checkpoint = "nlpconnect/vit-gpt2-image-captioning"
encoder_checkpoint = "nlpconnect/vit-gpt2-image-captioning"
decoder_checkpoint = "nlpconnect/vit-gpt2-image-captioning"

# Captioning model, moved onto `device` once at start-up.
model = VisionEncoderDecoderModel.from_pretrained(model_checkpoint).to(device)
# Tokenizer used to turn generated token ids back into text.
tokenizer = AutoTokenizer.from_pretrained(decoder_checkpoint)
# ViTImageProcessor is used here in place of the older ViTFeatureExtractor.
feature_extractor = ViTImageProcessor.from_pretrained(encoder_checkpoint)

def predict(image, max_length=64, num_beams=4):
    """Generate a single-line caption for an image.

    Args:
        image: Image to caption; it must provide ``convert('RGB')`` (the
            Gradio input in this file is configured with ``type='pil'``).
            May be ``None`` when no image was supplied.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        num_beams: Forwarded to ``model.generate`` as ``num_beams``.

    Returns:
        The first line of the decoded caption, or an empty string when
        ``image`` is ``None``.
    """
    # An empty image input arrives as None; return an empty caption instead
    # of failing with AttributeError on ``None.convert``.
    if image is None:
        return ""

    # Normalise to 3-channel RGB before preprocessing so greyscale/RGBA
    # uploads go through the same path.
    rgb_image = image.convert('RGB')
    pixel_values = feature_extractor(images=rgb_image, return_tensors="pt").pixel_values.to(device)

    # A single image is passed in, so only the first generated sequence is used.
    caption_ids = model.generate(pixel_values, max_length=max_length, num_beams=num_beams)[0]

    # Special tokens are dropped by the tokenizer itself, so no manual
    # token stripping is needed; keep only the first line of the text.
    decoded = tokenizer.decode(caption_ids, skip_special_tokens=True)
    return decoded.split('\n')[0]

# Input and output components. `type='pil'` makes Gradio hand the uploaded
# picture to `predict` as a PIL image ('optional=True' is no longer passed
# to gr.Image).
input_image = gr.Image(label="Upload your Image", type='pil')
output_text = gr.Textbox(label="Captions")

# Example inputs offered in the UI: example1.jpg ... example6.jpg
# (relative paths).
examples = [f"example{i}.jpg" for i in range(1, 7)]

# Text shown around the demo.
title = "Image Captioning 🖼️"
description = "Image captioning application made using transformers"
article = "Created By : Shreyas Dixit"

# Create the Gradio interface.
# No `theme` argument is passed: "grass" is a legacy theme name that current
# Gradio releases do not provide, so requesting it only produced a warning
# and a fallback to the default theme.
interface = gr.Interface(
    fn=predict,
    inputs=input_image,
    outputs=output_text,
    examples=examples,
    title=title,
    description=description,
    article=article,
)

# Launch the interface; share=True asks Gradio to create a public share link.
interface.launch(share=True)