Spaces:

hsienchen
/

gemini-mm-cot

Sleeping

App Files Files Community

gemini-mm-cot / app3.py

hsienchen

Create app3.py

89a716b verified 10 months ago

raw

history blame

3.68 kB

	import PIL.Image
	import gradio as gr
	import base64
	import time
	import os
	import google.generativeai as genai

	import pathlib

	# Gemini model handles: txt_model is used for text-only prompts and
	# vis_model for prompts that include an image (see llm_response).
	txt_model = genai.GenerativeModel('gemini-pro')
	vis_model = genai.GenerativeModel('gemini-pro-vision')

	import os

	# The API key is read from the environment; os.getenv returns None when unset.
	GOOGLE_API_KEY=os.getenv('GOOGLE_API_KEY')

	# NOTE(review): configure() runs after the models above are constructed --
	# confirm the SDK resolves the key lazily at request time.
	genai.configure(api_key=GOOGLE_API_KEY)

	# Image to Base 64 Converter
	def image_to_base64(image_path):
	    """Return the contents of the file at ``image_path`` as Base64 text.

	    The file is read in binary mode; the Base64 bytes are decoded as UTF-8
	    so the caller receives a plain ``str``.
	    """
	    with open(image_path, 'rb') as source:
	        raw_bytes = source.read()
	    return base64.b64encode(raw_bytes).decode('utf-8')

	# Takes the user's inputs and displays them in the chat UI
	def query_message(history,txt,img):
	    """Append the user's turn to the chat history and return the history.

	    Args:
	        history: List of ``(user_message, bot_message)`` tuples; ``None`` is
	            treated as an empty history.
	        txt: Prompt text chosen by the user.
	        img: Path of the uploaded image, or a falsy value when none was given.

	    Returns:
	        The history with one new ``(user_message, None)`` entry. When an
	        image is supplied it is embedded after the text as a Markdown image
	        whose source is a Base64 data URL.
	    """
	    import mimetypes  # local import keeps this block self-contained

	    if history is None:
	        history = []
	    if not img:
	        history += [(txt, None)]
	        return history
	    # Label the data URL with the file's actual type; uploads are not always
	    # JPEG, which is what was previously hard-coded.
	    mime_type, _ = mimetypes.guess_type(img)
	    if not mime_type or not mime_type.startswith("image/"):
	        mime_type = "image/jpeg"  # fall back to the previous fixed value
	    # Named ``encoded`` so the ``base64`` module is not shadowed locally.
	    encoded = image_to_base64(img)
	    data_url = f"data:{mime_type};base64,{encoded}"
	    history += [(f"{txt} ![]({data_url})", None)]
	    return history

	# Takes the user's inputs, generates a response, and displays it in the chat UI
	def llm_response(history,text,img):
	    """Generate the model's reply and append it to the chat history.

	    ``txt_model`` is used when no image is given; otherwise the image is
	    opened and ``[text, image]`` is sent to ``vis_model``.

	    Args:
	        history: List of ``(user_message, bot_message)`` tuples.
	        text: Prompt text to send to the model.
	        img: Path of the uploaded image, or a falsy value when none was given.

	    Returns:
	        The history with one new ``(None, reply)`` entry.
	    """
	    if not img:
	        response = txt_model.generate_content(text)
	    else:
	        # Open the image in a context manager so its file handle is closed
	        # once the request has been made, instead of leaking per request.
	        with PIL.Image.open(img) as image:
	            response = vis_model.generate_content([text, image])
	    try:
	        reply = response.text
	    except ValueError:
	        # ``response.text`` raises ValueError when the response carries no
	        # usable text part (for example a blocked prompt); show a message
	        # in the chat rather than failing the whole event handler.
	        reply = "The model did not return a text response."
	    history += [(None, reply)]
	    return history

	# Interface Code- Selector method

	def sentence_builder(animal, place):
	    """Build the counting question for the given ``animal`` and ``place``."""
	    template = "how many {}s from the {} are shown in the picture?"
	    return template.format(animal, place)


	# gradio block

	# First page (app1): image input, chat window, preset-prompt dropdown,
	# a submit button and an explanatory Markdown section.
	# NOTE(review): indentation is flattened in this view, so which components
	# sit inside the gr.Row() cannot be determined here -- confirm in the file.
	with gr.Blocks(theme='freddyaboulton/dracula_revamped') as app1:
	# NOTE(review): `title` is assigned but not referenced in the visible code.
	title ="-COT-"
	with gr.Row():
	# type="filepath": the handlers receive the image as a path on disk.
	image_box = gr.Image(type="filepath")

	chatbot = gr.Chatbot(
	scale = 2,
	height=750
	)
	# NOTE(review): label/info mention animals while the choices are image
	# prompts -- confirm the intended wording.
	text_box = gr.Dropdown(
	["what is in the image", "provide alternative title for the image", "how many birds can be seen in the picture?"], label="Animal", info="Will add more animals later!"
	)

	btn = gr.Button("Submit")
	# On click: query_message adds the user's turn to the chat, then
	# llm_response appends the model's reply; both write back to `chatbot`.
	clicked = btn.click(query_message,
	[chatbot,text_box,image_box],
	chatbot
	).then(llm_response,
	[chatbot,text_box,image_box],
	chatbot
	)
	gr.Markdown("""
	# Multimodal Chain-of-Thought Reasoning in Language Models

	<h5 align="center"><i>"Imagine learning a textbook without figures or tables."</i></h5>

	Multimodal-CoT incorporates vision features in a decoupled training framework. The framework consists of two training stages: (i) rationale generation and (ii) answer inference. Both stages share the same model architecture but differ in the input and output.
	""")

	# Second page (app2): same components and event wiring as app1, with a
	# different theme and a Markdown heading instead of the long description.
	# NOTE(review): indentation is flattened in this view, so which components
	# sit inside the gr.Row() cannot be determined here -- confirm in the file.
	with gr.Blocks(theme='snehilsanyal/scikit-learn') as app2:
	gr.Markdown("## MM 2BB ##")
	with gr.Row():
	# type="filepath": the handlers receive the image as a path on disk.
	image_box = gr.Image(type="filepath")

	chatbot = gr.Chatbot(
	scale = 2,
	height=750
	)
	# NOTE(review): label/info mention animals while the choices are image
	# prompts -- confirm the intended wording.
	text_box = gr.Dropdown(
	["what is in the image", "provide alternative title for the image", "how many birds can be seen in the picture?"], label="Animal", info="Will add more animals later!"
	)

	btn = gr.Button("Submit")
	# On click: query_message adds the user's turn to the chat, then
	# llm_response appends the model's reply; both write back to `chatbot`.
	clicked = btn.click(query_message,
	[chatbot,text_box,image_box],
	chatbot
	).then(llm_response,
	[chatbot,text_box,image_box],
	chatbot
	)
	# Top-level app: shows app1 and app2 as tabs labelled "APP #1" and "APP #2".
	with gr.Blocks(theme='snehilsanyal/scikit-learn') as demo:
	gr.Markdown("# DEMO #")
	gr.TabbedInterface([app1, app2], ["APP #1", "APP #2"])

	# Runs at import time: set up the queue, then start the app.
	demo.queue()
	demo.launch()