import base64
import os
import pathlib
import time

import PIL.Image
import google.generativeai as genai
import gradio as gr

# Configure the Gemini API key from the environment before building the models
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

# Text-only and vision-capable Gemini models
txt_model = genai.GenerativeModel('gemini-pro')
vis_model = genai.GenerativeModel('gemini-pro-vision')
# Image to Base64 converter
def image_to_base64(image_path):
    with open(image_path, 'rb') as img:
        encoded_string = base64.b64encode(img.read())
    return encoded_string.decode('utf-8')
# Function that takes the user inputs and displays them in the chat UI
def query_message(history, txt, img):
    if not img:
        history += [(txt, None)]
        return history
    encoded_image = image_to_base64(img)
    data_url = f"data:image/jpeg;base64,{encoded_image}"
    history += [(f"{txt} ![]({data_url})", None)]
    return history
# Function that takes the user inputs, generates a response, and displays it in the chat UI
def llm_response(history, text, img):
    if not img:
        response = txt_model.generate_content(text)
        history += [(None, response.text)]
        return history
    else:
        img = PIL.Image.open(img)
        response = vis_model.generate_content([text, img])
        history += [(None, response.text)]
        return history
# Function that takes the user inputs and echoes them for the output textbox
def output_query_message(txt, img):
    if not img:
        return txt
    encoded_image = image_to_base64(img)
    data_url = f"data:image/jpeg;base64,{encoded_image}"
    output_text = [(f"{txt} ![]({data_url})", None)]
    return output_text
# Function that takes the user inputs, generates a response, and returns the text
def output_llm_response(text, img):
    if not img:
        response = txt_model.generate_content(text)
        return response.text
    else:
        img = PIL.Image.open(img)
        response = vis_model.generate_content([text, img])
        return response.text
# Interface code - selector helper (defined but not wired into the UI below)
def sentence_builder(animal, place):
    return f"""how many {animal}s from the {place} are shown in the picture?"""
# Gradio block: app1
with gr.Blocks(theme='snehilsanyal/scikit-learn') as app1:
    title = gr.Markdown("## COT ##")
    with gr.Column():
        outputbox = gr.Textbox(label="AI prediction here...")
        text_box = gr.Dropdown(
            ["what is in the image",
             "provide alternative title for the image",
             "how many birds can be seen in the picture?"],
            label="Prompts", info="Will add more animals later!"
        )
        image_box = gr.Image(type="filepath")

    btn = gr.Button("Submit")
    clicked = btn.click(
        output_query_message,
        [text_box, image_box],
        outputbox,
    ).then(
        output_llm_response,
        [text_box, image_box],
        outputbox,
    )
    gr.Markdown("""
# Multimodal Chain-of-Thought Reasoning in Language Models

<h5 align="center"><i>"Imagine learning a textbook without figures or tables."</i></h5>

Multimodal-CoT incorporates vision features in a decoupled training framework. The framework consists of two training stages: (i) rationale generation and (ii) answer inference. Both stages share the same model architecture but differ in the input and output.
""")
with gr.Blocks(theme='snehilsanyal/scikit-learn') as app2:
    gr.Markdown("## MM 2BB ##")
    with gr.Row():
        image_box = gr.Image(type="filepath")
        chatbot = gr.Chatbot(
            scale=2,
            height=750
        )
    text_box = gr.Dropdown(
        ["what is in the image",
         "provide alternative title for the image",
         "how many birds can be seen in the picture?"],
        label="Select--",
        info="Will add more animals later!"
    )
    btn = gr.Button("Submit")
    clicked = btn.click(
        query_message,
        [chatbot, text_box, image_box],
        chatbot,
    ).then(
        llm_response,
        [chatbot, text_box, image_box],
        chatbot,
    )
with gr.Blocks(theme='snehilsanyal/scikit-learn') as demo:
    gr.Markdown("# DEMO #")
    gr.TabbedInterface([app1, app2], ["APP #1", "APP #2"])

demo.queue()
demo.launch()