# gemini-mm-cot / app.py
# hsienchen's picture
# Update app.py
# 24d0656 verified
# raw
# history blame
# 2.37 kB
import PIL.Image
import gradio as gr
import base64
import time
import os
import google.generativeai as genai
import pathlib
# --- Gemini configuration ---------------------------------------------------
# Configure the SDK before any model object is created. The key is read from
# the GOOGLE_API_KEY environment variable; os.getenv yields None when it is
# unset, and that value is handed to genai.configure unchanged.
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

# Model used by output_llm_response when no image is supplied.
txt_model = genai.GenerativeModel('gemini-pro')
# Model used by output_llm_response when an image is supplied.
vis_model = genai.GenerativeModel('gemini-pro-vision')

# Prompt sent to the model for every request (with or without an image).
txt_prompt_1 = 'I have upload the image. The image contains two sample images, A images contains 4 objects--Lens, Aducam Board, Anti-Static Strap, and Raspberry Pi Board. Determine if all 4 objects are also in Image B. If missing, list the names.'
# Short label shown next to the uploaded image by output_query_message.
txt_display_1 = 'name the missing items on B'
# Image to Base 64 Converter
def image_to_base64(image_path):
    """Return the contents of the file at *image_path* as Base64 text.

    The file is opened in binary mode and read in full; the bytes are
    Base64-encoded and the encoded bytes are decoded to ``str`` as UTF-8.
    """
    with open(image_path, 'rb') as image_file:
        raw_bytes = image_file.read()
    return base64.b64encode(raw_bytes).decode('utf-8')
def output_query_message(img):
    """Build the message echoed to the UI before the model is queried.

    Args:
        img: Path to the uploaded image file, or a falsy value (``None`` or
            an empty string) when nothing was uploaded.

    Returns:
        ``txt_prompt_1`` unchanged when no image was supplied. Otherwise a
        one-element list holding a ``(message, None)`` tuple, where
        ``message`` is ``txt_display_1`` followed by a Markdown image tag
        that embeds the file as a Base64 ``data:`` URL.
    """
    # Imported locally so this function carries its own dependency.
    import mimetypes

    if not img:
        return txt_prompt_1

    # Deliberately not named ``base64``: that would shadow the imported module.
    encoded_image = image_to_base64(img)

    # Label the data URL with the type implied by the file extension instead
    # of always claiming JPEG; keep JPEG as the fallback for unknown types.
    mime_type, _ = mimetypes.guess_type(img)
    if not mime_type or not mime_type.startswith('image/'):
        mime_type = 'image/jpeg'
    data_url = f"data:{mime_type};base64,{encoded_image}"

    # NOTE(review): this list-of-tuples shape looks like chat history, yet the
    # click handler in this file routes it to a Textbox - confirm the intent.
    return [(f"{txt_display_1} ![]({data_url})", None)]
# Function that takes User Inputs, generates Response and displays on Chat UI
def output_llm_response(img):
    """Query Gemini with the fixed prompt and return the reply text.

    Args:
        img: Path to the uploaded image file, or a falsy value when no image
            was uploaded.

    Returns:
        The ``text`` attribute of the model response. Without an image the
        prompt alone goes to ``txt_model``; with an image the prompt and the
        opened image go to ``vis_model``.

    Exceptions raised by PIL or by the model call are not caught here and
    propagate to the caller.
    """
    if not img:
        response = txt_model.generate_content(txt_prompt_1)
        return response.text

    # Open the image in a context manager so its file handle is released as
    # soon as the request has been sent, instead of leaking one per click.
    with PIL.Image.open(img) as image:
        response = vis_model.generate_content([txt_prompt_1, image])
    return response.text
# Single-page UI: a result textbox, an image upload and a trigger button,
# followed by a short descriptive Markdown section.
with gr.Blocks(theme='snehilsanyal/scikit-learn') as app1:
    with gr.Column():
        outputbox = gr.Textbox(label="line clearance...")
        image_box = gr.Image(type="filepath")
        btn = gr.Button("Check This")

        # On click: first write the query echo into the textbox, then
        # overwrite it with the model's answer once that call returns.
        clicked = btn.click(
            fn=output_query_message,
            inputs=[image_box],
            outputs=outputbox,
        ).then(
            fn=output_llm_response,
            inputs=[image_box],
            outputs=outputbox,
        )

    gr.Markdown("""
## SOP-302: Line Clearance ##
<h5 align="center"><i>"XXXX here here."</i></h5>
Multimodal-CoT incorporates vision features in a decoupled training framework. The framework consists of two training stages: (i) rationale generation and (ii) answer inference. Both stages share the same model architecture but differ in the input and output.
""")

# Enable request queuing, then start the web server.
app1.queue()
app1.launch()