# Web page residue captured together with the source; not part of the program.
# Spaces:
# Sleeping
# Sleeping
# File size: 2,366 Bytes
# 1dd1da1 9720f9c ae730de 6b2e788 1dd1da1 6b2e788 1dd1da1 6b2e788 1dd1da1 6b2e788 1dd1da1 6b2e788 1dd1da1 5fed0bb 1dd1da1 eb46552 24d0656 |
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 |
import PIL.Image
import gradio as gr
import base64
import time
import os
import google.generativeai as genai
import pathlib
# Hand the API key from the environment to the Gemini SDK.
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

# Text-only model for requests without an image, vision model for requests
# that carry one.
txt_model = genai.GenerativeModel('gemini-pro')
vis_model = genai.GenerativeModel('gemini-pro-vision')

# Instruction sent to the model on every request.
txt_prompt_1 = (
    'I have upload the image. '
    'The image contains two sample images, '
    'A images contains 4 objects--Lens, Aducam Board, Anti-Static Strap, '
    'and Raspberry Pi Board. '
    'Determine if all 4 objects are also in Image B. '
    'If missing, list the names.'
)
# Short caption placed in front of the embedded image in the interim message.
txt_display_1 = 'name the missing items on B'
# Image to Base 64 Converter
def image_to_base64(image_path) -> str:
    """Return the contents of the file at *image_path* as a Base64 string.

    Args:
        image_path: Path of the file to read; it is opened in binary mode.

    Returns:
        The Base64 encoding of the file's bytes, decoded to ``str``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(image_path, 'rb') as image_file:
        raw_bytes = image_file.read()
    # b64encode yields bytes; decode so the result can be embedded in text.
    return base64.b64encode(raw_bytes).decode('utf-8')
def output_query_message(img):
    """Build the interim message produced right after the button is clicked.

    Args:
        img: File path of the uploaded image, or a falsy value when no image
            was provided.

    Returns:
        ``txt_prompt_1`` when no image was provided. Otherwise a one-element
        list holding a ``(message, None)`` tuple, where the message is
        ``txt_display_1`` followed by a Markdown image tag that embeds the
        picture as a Base64 data URL.
    """
    if not img:
        return txt_prompt_1
    # Named so that it does not shadow the imported ``base64`` module.
    encoded_image = image_to_base64(img)
    # The MIME type is fixed to JPEG regardless of the actual file format.
    data_url = f"data:image/jpeg;base64,{encoded_image}"
    # NOTE(review): the click handler routes this value into a Textbox, yet
    # this branch returns a list of tuples -- confirm that is intended.
    return [(f"{txt_display_1} ![]({data_url})", None)]
# Sends the prompt (plus the uploaded image, if any) to the model and returns
# the text of its reply.
def output_llm_response(img):
    """Query the model with ``txt_prompt_1`` and return the reply text.

    Args:
        img: File path of the uploaded image, or a falsy value when no image
            was provided.

    Returns:
        The ``text`` attribute of the response: from ``txt_model`` when there
        is no image, from ``vis_model`` (prompt plus image) otherwise.
    """
    if not img:
        return txt_model.generate_content(txt_prompt_1).text
    # Open through a context manager so the image's file handle is released
    # as soon as the request has been made instead of lingering per call.
    with PIL.Image.open(img) as picture:
        response = vis_model.generate_content([txt_prompt_1, picture])
    return response.text
# Page layout: a result textbox, an image upload, a trigger button and a
# Markdown footer.
with gr.Blocks(theme='snehilsanyal/scikit-learn') as app1:
    # NOTE(review): the original nesting was lost together with the
    # indentation; the Column is assumed to wrap only the two widgets
    # below -- confirm against the intended layout.
    with gr.Column():
        outputbox = gr.Textbox(label="line clearance...")
        image_box = gr.Image(type="filepath")
    btn = gr.Button("Check This")
    # Two chained steps on click, both writing to the same textbox: first the
    # interim message, then the model's reply.
    clicked = btn.click(
        output_query_message,
        [image_box],
        outputbox,
    ).then(
        output_llm_response,
        [image_box],
        outputbox,
    )
    # The Markdown body stays flush-left so the rendered text is unchanged.
    gr.Markdown("""
## SOP-302: Line Clearance ##
<h5 align="center"><i>"XXXX here here."</i></h5>
Multimodal-CoT incorporates vision features in a decoupled training framework. The framework consists of two training stages: (i) rationale generation and (ii) answer inference. Both stages share the same model architecture but differ in the input and output.
""")

# Turn on request queuing, then start serving the app.
app1.queue()
app1.launch()