Spaces:

hsienchen
/

gemini-mm-cot

Sleeping

File size: 2,366 Bytes

1dd1da1
 
 
 
 
 
 
 
 
 
 
 
9720f9c
ae730de
6b2e788
1dd1da1
 
 
 
 
 
 
 
 
 
 
 
6b2e788
1dd1da1
 
6b2e788
1dd1da1
 
6b2e788
1dd1da1
 
 
 
 
6b2e788
1dd1da1
 
 
 
5fed0bb
1dd1da1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eb46552
24d0656

import PIL.Image
import gradio as gr
import base64
import time
import os
import google.generativeai as genai

import pathlib

# Model handles: one for text-only prompts, one that also accepts images.
txt_model = genai.GenerativeModel("gemini-pro")
vis_model = genai.GenerativeModel("gemini-pro-vision")

# txt_prompt_1 is the instruction sent to the model; txt_display_1 is the
# short caption used when echoing the query back to the UI.
txt_prompt_1 = (
    "I have upload the image. The image contains two sample images, "
    "A images contains 4 objects--Lens, Aducam Board, Anti-Static Strap, "
    "and Raspberry Pi Board. Determine if all 4 objects are also in "
    "Image B. If missing, list the names."
)
txt_display_1 = "name the missing items on B"

import os

# The API key is read from the environment (None when the variable is unset)
# and handed to the SDK.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

genai.configure(api_key=GOOGLE_API_KEY)

# Image to Base 64 Converter
def image_to_base64(image_path):
    """Return the contents of the file at *image_path* as Base64 text.

    The file is opened in binary mode and read in one go; the Base64-encoded
    bytes are then decoded to a ``str`` using UTF-8.
    """
    with open(image_path, mode="rb") as image_file:
        raw_bytes = image_file.read()
    return base64.b64encode(raw_bytes).decode("utf-8")


def output_query_message(img):
    """Build the message shown in the output box right after the click.

    Args:
        img: Path of the uploaded image file, or a falsy value (``None`` or
            an empty string) when no image was supplied.

    Returns:
        ``txt_prompt_1`` unchanged when there is no image. Otherwise a
        one-element list holding a ``(message, None)`` tuple, where the
        message is ``txt_display_1`` followed by a Markdown image tag whose
        source is a Base64 ``data:`` URL of the file.
    """
    # Function-scope import so this block does not rely on a file-level one.
    import mimetypes

    if not img:
        return txt_prompt_1

    # Label the data URL with the type implied by the file name instead of
    # always claiming JPEG; JPEG remains the fallback when the type is
    # unknown or is not an image type.
    mime_type, _ = mimetypes.guess_type(img)
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    # Named `encoded` (not `base64`) so the imported module is not shadowed.
    encoded = image_to_base64(img)
    data_url = f"data:{mime_type};base64,{encoded}"
    return [(f"{txt_display_1} ![]({data_url})", None)]

# Function that takes User Inputs, generates Response and displays on Chat UI
def output_llm_response(img):
    """Send ``txt_prompt_1`` to Gemini and return the text of the reply.

    Args:
        img: Path of the uploaded image file, or a falsy value when no image
            was supplied.

    Returns:
        The ``.text`` attribute of the model response. Without an image the
        prompt alone is sent to ``txt_model``; with an image, the prompt and
        the opened image are sent together to ``vis_model``.
    """
    if not img:
        return txt_model.generate_content(txt_prompt_1).text

    # Open the image with a context manager so the underlying file handle is
    # released once the request has been made; previously it was never closed.
    with PIL.Image.open(img) as image:
        response = vis_model.generate_content([txt_prompt_1, image])
    return response.text


    
# Build the UI: an output text box and an image input stacked in a column,
# a button that triggers the check, and a Markdown footer.
with gr.Blocks(theme='snehilsanyal/scikit-learn') as app1:
    with gr.Column():    
        outputbox = gr.Textbox(label="line clearance...")
        # type="filepath": the callbacks below treat the value as a file path
        # (it is passed to open() and PIL.Image.open()).
        image_box = gr.Image(type="filepath")
        
    btn = gr.Button("Check This")
    # On click, first write the query message into the output box, then
    # replace it with the model's answer. Both steps read image_box and write
    # to outputbox.
    clicked = btn.click(output_query_message,
                        [image_box],
                        outputbox
                        ).then(output_llm_response,
                                [image_box],
                                outputbox
                                )
    # Static descriptive text rendered with the rest of the UI.
    gr.Markdown("""
    ## SOP-302: Line Clearance ##
    
    <h5 align="center"><i>"XXXX here here."</i></h5>
    
    Multimodal-CoT incorporates vision features in a decoupled training framework. The framework consists of two training stages: (i) rationale generation and (ii) answer inference. Both stages share the same model architecture but differ in the input and output.
    """)


# Turn on queuing for the app, then launch it.
app1.queue()
app1.launch()