Spaces:
Runtime error
Runtime error
File size: 5,638 Bytes
1e84921 98fe1c9 1e84921 8e35138 1e84921 a6dac9a 1e84921 a6dac9a 1e84921 a6dac9a c258b27 1e84921 c258b27 1e84921 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
import string
import gradio as gr
import requests
import torch
from transformers import T5Tokenizer
from model import T5ForMultimodalGeneration
from PIL import Image
import timm
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform
# Checkpoint identifier handed to both `from_pretrained` calls below.
rationale_model_dir = "cooelf/MM-CoT-UnifiedQA-Base-Rationale-Joint"

# Image encoder used to turn the uploaded picture into feature vectors.
# It is switched to eval mode right away because it is only used for inference.
vit_model = timm.create_model(
    "vit_base_patch16_384",
    pretrained=True,
    num_classes=0,
)
vit_model.eval()

# Build the preprocessing transform from the encoder's own data config so the
# input images are prepared the way this particular model expects.
config = resolve_data_config({}, model=vit_model)
transform = create_transform(**config)

# Text tokenizer and the multimodal generator, both loaded from the same
# checkpoint.
# NOTE(review): patch_size=(577, 768) presumably mirrors the shape of the ViT
# feature map fed in as `image_ids` -- confirm against model.py.
tokenizer = T5Tokenizer.from_pretrained(rationale_model_dir)
r_model = T5ForMultimodalGeneration.from_pretrained(
    rationale_model_dir,
    patch_size=(577, 768),
)
def inference_chat(input_image, input_text):
    """Generate the model's text output for a question about an image.

    Args:
        input_image: The picture to reason about. May be a ``PIL.Image.Image``
            (which is what the ``gr.Image(type="pil")`` component in this file
            passes to its callback) or anything ``PIL.Image.open`` accepts,
            such as a file path or a binary file object.
        input_text: The prompt/question string fed to the tokenizer.

    Returns:
        A 3-tuple holding the same decoded string three times, one entry per
        output textbox wired to this callback.

    Raises:
        ValueError: If ``input_image`` is ``None`` (no image was supplied).
    """
    if input_image is None:
        raise ValueError("Please provide an image before submitting a question.")

    # The UI hands over an already-decoded PIL image; calling Image.open() on
    # it would fail, so only open the input when it is not an image yet.
    if isinstance(input_image, Image.Image):
        img = input_image.convert("RGB")
    else:
        img = Image.open(input_image).convert("RGB")

    # Feature extraction is inference-only, so skip autograd bookkeeping.
    with torch.no_grad():
        pixel_batch = transform(img).unsqueeze(0)  # add a leading batch axis
        image_features = vit_model.forward_features(pixel_batch).detach()

    # Tokenize once, padded/truncated to a fixed 512 tokens.
    # `padding="max_length"` alone is enough; the deprecated
    # `pad_to_max_length` flag requested the same thing.
    source = tokenizer.batch_encode_plus(
        [input_text],
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

    # Greedy decoding (single beam, no sampling).
    rationale = r_model.generate(
        input_ids=source["input_ids"],
        attention_mask=source["attention_mask"],
        image_ids=image_features,
        max_length=512,
        num_beams=1,
        do_sample=False,
    )
    answer = tokenizer.batch_decode(rationale, skip_special_tokens=True)[0]

    # All three output boxes display the same generated text.
    return answer, answer, answer
# Markdown copy rendered on the demo page: the heading, the intro paragraph
# and the explanation shown above the output boxes.
# NOTE(review): this text describes VLE / VLEForVQA, while the checkpoint
# loaded in this file is an MM-CoT rationale model -- confirm the wording is
# still what should be shown to users.
title = """# VQA with VLE and LLM"""
description = """**VLE** (Visual-Language Encoder) is an image-text multimodal understanding model built on the pre-trained text and image encoders. See https://github.com/iflytek/VLE for more details.
We demonstrate visual question answering systems built with VLE and LLM."""
description1 = """**VQA**: The image and the question are fed to a VQA model (VLEForVQA) and the model predicts the answer.
**VQA+LLM**: We feed the caption, question, and answers predicted by the VQA model to the LLM and ask the LLM to generate the final answer. The outputs from VQA+LLM may vary due to the decoding strategy of the LLM."""
# Page layout and event wiring for the demo.
with gr.Blocks(
    css="""
.message.svelte-w6rprc.svelte-w6rprc.svelte-w6rprc {font-size: 20px; margin-top: 20px}
#component-21 > div.wrap.svelte-w6rprc {height: 600px;}
"""
) as iface:
    # Extra state slot; it is only ever reset by the Clear button below.
    state = gr.State([])

    gr.Markdown(title)
    gr.Markdown(description)

    with gr.Row():
        # Left column: image upload, question box and the action buttons.
        with gr.Column(scale=1):
            image_input = gr.Image(type="pil", label="VQA Image Input")
            with gr.Row():
                with gr.Column(scale=1):
                    chat_input = gr.Textbox(lines=1, label="VQA Question Input")
                    with gr.Row():
                        # `width` is not a documented Button argument, so it
                        # is no longer passed.
                        clear_button = gr.Button(value="Clear", interactive=True)
                        submit_button = gr.Button(
                            value="Submit", interactive=True, variant="primary"
                        )
        # Right column: explanation text and the three result boxes.
        with gr.Column():
            gr.Markdown(description1)
            caption_output = gr.Textbox(lines=0, label="VQA")
            caption_output_v1 = gr.Textbox(lines=0, label="VQA + LLM (short answer)")
            gpt3_output_v1 = gr.Textbox(lines=0, label="VQA+LLM (long answer)")

    # Pressing Enter in the question box and clicking Submit run the same
    # callback with the same inputs and outputs.
    qa_inputs = [image_input, chat_input]
    qa_outputs = [caption_output, gpt3_output_v1, caption_output_v1]

    chat_input.submit(inference_chat, qa_inputs, qa_outputs)
    # Clear empties the question, the state and every result box. It bypasses
    # the queue because it does no model work.
    clear_button.click(
        lambda: ("", [], "", "", ""),
        [],
        [chat_input, state, caption_output, gpt3_output_v1, caption_output_v1],
        queue=False,
    )
    submit_button.click(inference_chat, qa_inputs, qa_outputs)

    # One ready-made example row: image path, question, then the text to show
    # in each of the three result boxes.
    examples = gr.Examples(
        examples=[
            [
                'api/61.png',
                "Question: Think about the magnetic force between the magnets in each pair. Which of the following statements is true?\n"
                "Context: The images below show two pairs of magnets. The magnets in different pairs do not affect each other. All the magnets shown are made of the same material, but some of them are different sizes and shapes.\n"
                "Options: (A) The magnitude of the magnetic force is the same in both pairs. (B) The magnitude of the magnetic force is smaller in Pair 1. (C) The magnitude of the magnetic force is smaller in Pair 2.\n"
                "Solution:",
                "2",
                "2",
                "2",
            ],
        ],
        inputs=[image_input, chat_input, caption_output, caption_output_v1, gpt3_output_v1],
    )
# Route requests through Gradio's queue with a single worker and at most ten
# pending jobs; `api_open=False` is passed so the queue is not opened up for
# direct API use.
iface.queue(concurrency_count=1, api_open=False, max_size=10)
# The queue is already enabled by the call above, so launch() is invoked
# without the deprecated `enable_queue` flag.
iface.launch()