Spaces:

hsienchen
/

gemini-mm-cot

Sleeping

App Files Files Community

hsienchen committed on Jan 16

Commit

77fbc21

•

1 Parent(s): dac4eed

Create app2.py

Browse files

Files changed (1) hide show

app2.py +86 -0

app2.py ADDED Viewed

	@@ -0,0 +1,86 @@

+import PIL.Image
+import gradio as gr
+import base64
+import time
+import os
+import google.generativeai as genai
+import pathlib
import os
# Configure the SDK first, then build the model handles, so the set-up order
# matches the order of use. The key is read from the environment; os.getenv
# returns None when GOOGLE_API_KEY is unset.
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)
# Text-only model and vision-capable model used by llm_response.
txt_model = genai.GenerativeModel('gemini-pro')
vis_model = genai.GenerativeModel('gemini-pro-vision')
# Image to Base 64 Converter
def image_to_base64(image_path):
    """Return the contents of the file at *image_path* as a base64 string.

    The file is read in binary mode, base64-encoded, and the resulting bytes
    are decoded as UTF-8 so the value can be embedded in text.
    """
    with open(image_path, mode='rb') as image_file:
        raw_bytes = image_file.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')
# Function that takes User Inputs and displays it on ChatUI
def query_message(history,txt,img):
    """Append the user's turn to the chat history and return the history.

    Args:
        history: List of ``(user_message, bot_message)`` tuples shown in the
            chat UI. ``None`` is treated as an empty history.
        txt: Prompt text chosen by the user; may be ``None`` or empty when
            nothing was selected.
        img: Path of the uploaded image file, or a falsy value when no image
            was provided.

    Returns:
        The history list with one ``(user_message, None)`` entry appended.
        When an image is given, the user message embeds it as a Markdown
        image backed by a base64 ``data:`` URL.
    """
    import mimetypes

    if history is None:
        history = []
    if not img:
        history += [(txt, None)]
        return history

    # Derive the MIME type from the file name instead of always claiming
    # JPEG; fall back to JPEG when the extension is unknown or not an image.
    mime_type, _ = mimetypes.guess_type(img)
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    # Local name deliberately differs from the imported ``base64`` module.
    encoded_image = image_to_base64(img)
    data_url = f"data:{mime_type};base64,{encoded_image}"
    # Avoid rendering the literal text "None" when no prompt was selected.
    message = f"{txt} ![]({data_url})" if txt else f"![]({data_url})"
    history += [(message, None)]
    return history
# Function that takes User Inputs, generates Response and displays on Chat UI
def _response_text(response):
    """Return ``response.text``, or a fallback notice when it is unavailable."""
    try:
        return response.text
    except ValueError:
        # The SDK's ``text`` accessor raises ValueError when the response
        # carries no usable text part (for example when it was blocked).
        return "The model returned no text for this request (the response may have been blocked)."


def llm_response(history,text,img):
    """Generate the model's reply and append it to the chat history.

    Args:
        history: List of ``(user_message, bot_message)`` tuples shown in the
            chat UI.
        text: Prompt text to send to the model.
        img: Path of the uploaded image file, or a falsy value for a
            text-only request.

    Returns:
        The history list with one ``(None, reply_text)`` entry appended.
        Text-only requests go to ``txt_model``; requests with an image go to
        ``vis_model`` together with the opened image.
    """
    if not img:
        reply = _response_text(txt_model.generate_content(text))
    else:
        # Keep the image file open for the duration of the request and close
        # it afterwards instead of leaking the file handle.
        with PIL.Image.open(img) as image:
            reply = _response_text(vis_model.generate_content([text, image]))
    history += [(None, reply)]
    return history
# Interface Code- Selector method
def sentence_builder(animal, place):
    """Build a counting question about *animal* and *place* for the picture."""
    question = f"how many {animal}s from the {place} are shown in the picture?"
    return question
# gradio block: image upload + chat display, a prompt selector and a submit
# button that first echoes the user's turn and then appends the model's reply.
with gr.Blocks(theme='snehilsanyal/scikit-learn') as app:
    gr.Markdown("## MM COT ##")
    with gr.Row():
        # type="filepath" makes the callbacks receive the upload as a path.
        image_box = gr.Image(type="filepath")
        chatbot = gr.Chatbot(
            scale = 2,
            height=750
        )
    # The dropdown holds image prompts, so label it as such (it was
    # previously labelled "Animal").
    text_box = gr.Dropdown(
                ["what is in the image", "provide alternative title for the image", "how many birds can be seen in the picture?"], label="Prompt", info="Choose a question to ask about the image."
            )
    btn = gr.Button("Submit")
    # Two chained steps: show the user's message, then the model's response.
    btn.click(query_message,
              [chatbot,text_box,image_box],
              chatbot
              ).then(llm_response,
                     [chatbot,text_box,image_box],
                     chatbot
                     )
    gr.Markdown("""
    # Multimodal Chain-of-Thought Reasoning in Language Models
    <h5 align="center"><i>"Imagine learning a textbook without figures or tables."</i></h5>
    Multimodal-CoT incorporates vision features in a decoupled training framework. The framework consists of two training stages: (i) rationale generation and (ii) answer inference. Both stages share the same model architecture but differ in the input and output.
    """)
app.queue()
app.launch()