File size: 4,498 Bytes
b472dbe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7ec0cf0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b472dbe
 
 
 
 
a3581d4
b472dbe
 
11e0f9b
 
b472dbe
 
 
11e0f9b
7ec0cf0
11e0f9b
 
 
 
b472dbe
 
 
7ec0cf0
 
11e0f9b
7ec0cf0
 
11e0f9b
b472dbe
c6232d6
 
 
 
 
 
 
 
a7e8129
4958f5d
a7e8129
 
c6232d6
a7e8129
 
 
 
 
1d00385
 
 
 
 
a7e8129
 
 
 
 
 
 
 
 
 
e0ebd29
 
 
a7e8129
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import PIL.Image
import gradio as gr
import base64
import time
import os
import google.generativeai as genai

import pathlib

# Model handles used by the response functions in this file: a text-only
# model, and a vision model that is sent a prompt together with an image.
# NOTE(review): both handles are created before genai.configure() is called
# a few lines down; presumably the library only needs the credentials at
# request time, so this ordering works — confirm.
txt_model = genai.GenerativeModel('gemini-pro')
vis_model = genai.GenerativeModel('gemini-pro-vision')

import os

# The API key comes from the environment; os.getenv returns None when the
# variable is not set, and that value is passed to configure() unchanged.
GOOGLE_API_KEY=os.getenv('GOOGLE_API_KEY')

genai.configure(api_key=GOOGLE_API_KEY)

# Image file -> Base64 text
def image_to_base64(image_path):
    """Return the content of the file at ``image_path`` as Base64 text.

    The file is opened in binary mode and read in full; the Base64-encoded
    bytes are then decoded as UTF-8 so the result is a ``str``.
    """
    with open(image_path, mode='rb') as image_file:
        raw_bytes = image_file.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')

# Function that takes User Inputs and displays it on ChatUI
def query_message(history, txt, img):
    """Append the user's turn to the chat history and return the history.

    Args:
        history: List of ``(user_message, bot_message)`` tuples shown in the
            chat UI. ``None`` is treated as an empty history.
        txt: Prompt text chosen by the user.
        img: Path of the uploaded image, or a falsy value when there is none.

    Returns:
        The history with one ``(user_message, None)`` entry added. Without an
        image the user message is ``txt`` itself; with an image it is ``txt``
        followed by a Markdown image whose source is a Base64 ``data:`` URL
        built from the file.
    """
    import mimetypes  # local import keeps this function self-contained

    if history is None:
        history = []
    if not img:
        history += [(txt, None)]
        return history

    # Label the data URL with the type suggested by the file extension
    # instead of always claiming JPEG; fall back to JPEG when the extension
    # is unknown or does not map to an image type.
    mime_type = mimetypes.guess_type(img)[0]
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    # Named ``encoded`` so the imported ``base64`` module is not shadowed.
    encoded = image_to_base64(img)
    history += [(f"{txt} ![](data:{mime_type};base64,{encoded})", None)]
    return history

# Function that takes User Inputs, generates Response and displays on Chat UI
def llm_response(history, text, img):
    """Ask the model for a reply and append it to the chat history.

    Args:
        history: List of ``(user_message, bot_message)`` tuples shown in the
            chat UI. ``None`` is treated as an empty history.
        text: Prompt text sent to the model.
        img: Path of the uploaded image, or a falsy value when there is none.

    Returns:
        The history with one ``(None, reply_text)`` entry added. The text
        model is used when there is no image; otherwise the vision model is
        given the prompt and the opened image.
    """
    if history is None:
        history = []
    if not img:
        response = txt_model.generate_content(text)
    else:
        # The context manager closes the image file once the request has
        # been made, instead of leaving the handle open.
        with PIL.Image.open(img) as image:
            response = vis_model.generate_content([text, image])
    history += [(None, response.text)]
    return history

# Function that takes User Inputs and returns them as text for the output Textbox
def output_query_message(txt, img):
    """Return the user's query as a single string.

    The result is wired to a ``gr.Textbox``, so a plain string is returned in
    every case (previously the image case returned a chat-style list of
    tuples, which a textbox can only show as its ``repr``).

    Args:
        txt: Prompt text chosen by the user.
        img: Path of the uploaded image, or a falsy value when there is none.

    Returns:
        ``txt`` unchanged when there is no image; otherwise ``txt`` followed
        by a Markdown image whose source is a Base64 ``data:`` URL built from
        the file.
    """
    import mimetypes  # local import keeps this function self-contained

    if not img:
        return txt

    # Label the data URL with the type suggested by the file extension
    # instead of always claiming JPEG; fall back to JPEG when the extension
    # is unknown or does not map to an image type.
    mime_type = mimetypes.guess_type(img)[0]
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    # Named ``encoded`` so the imported ``base64`` module is not shadowed.
    encoded = image_to_base64(img)
    return f"{txt} ![](data:{mime_type};base64,{encoded})"

# Function that takes User Inputs, generates Response and returns it as text for the output Textbox
def output_llm_response(text, img):
    """Ask the model for a reply and return the reply text.

    The result is wired to a ``gr.Textbox``, so the reply is returned as a
    plain string (previously it was wrapped in a chat-style
    ``[(None, reply)]`` list, which a textbox can only show as its ``repr``).

    Args:
        text: Prompt text sent to the model.
        img: Path of the uploaded image, or a falsy value when there is none.

    Returns:
        ``response.text`` from the text model when there is no image,
        otherwise from the vision model given the prompt and the opened
        image.
    """
    if not img:
        response = txt_model.generate_content(text)
    else:
        # The context manager closes the image file once the request has
        # been made, instead of leaving the handle open.
        with PIL.Image.open(img) as image:
            response = vis_model.generate_content([text, image])
    return response.text

        
# Interface Code- Selector method

def sentence_builder(animal, place):
    """Build the counting question for the given ``animal`` and ``place``.

    ``animal`` is pluralised by appending ``s``; both values are inserted
    using their default string formatting.
    """
    template = "how many {}s from the {} are shown in the picture?"
    return template.format(animal, place)


# gradio block
# First app ("COT"): an image input, an output textbox and a dropdown of
# preset prompts in one row, a Submit button, and a descriptive Markdown blurb.
with gr.Blocks(theme='snehilsanyal/scikit-learn') as app1:
    title = gr.Markdown("## COT ##")
    with gr.Row():
        # type="filepath": the handlers wired below treat the image value as
        # a path on disk.
        image_box = gr.Image(type="filepath")
    
        outputbox = gr.Textbox(label="Output")
        # Fixed list of prompts; the selected entry is the text passed to the handlers.
        text_box = gr.Dropdown(
                ["what is in the image", 
                 "provide alternative title for the image", 
                 "how many birds can be seen in the picture?"], 
                 label="Prompts", info="Will add more animals later!"
            )

    btn = gr.Button("Submit")
    # Two chained steps on click, both taking (prompt, image) and both
    # writing to outputbox: output_query_message first, then
    # output_llm_response.
    clicked = btn.click(output_query_message,
                        [text_box,image_box],
                        outputbox
                        ).then(output_llm_response,
                                [text_box,image_box],
                                outputbox
                                )
    gr.Markdown("""
    # Multimodal Chain-of-Thought Reasoning in Language Models
    
    <h5 align="center"><i>"Imagine learning a textbook without figures or tables."</i></h5>
    
    Multimodal-CoT incorporates vision features in a decoupled training framework. The framework consists of two training stages: (i) rationale generation and (ii) answer inference. Both stages share the same model architecture but differ in the input and output.
    """)

# Second app ("MM 2BB"): an image input next to a chatbot panel, with the
# prompt dropdown and Submit button placed below the row.
# Note: image_box, text_box, btn and clicked are rebound here; the names used
# for the first app's components are reused.
with gr.Blocks(theme='snehilsanyal/scikit-learn') as app2:
    gr.Markdown("## MM 2BB ##")
    with gr.Row():
        # type="filepath": the handlers wired below treat the image value as
        # a path on disk.
        image_box = gr.Image(type="filepath")
    
        chatbot = gr.Chatbot(
            scale = 2,
            height=750
        )
    # Same fixed prompt list as the first app, but outside the row.
    text_box = gr.Dropdown(
                ["what is in the image", 
                 "provide alternative title for the image", 
                 "how many birds can be seen in the picture?"], 
                 label="Select--", 
                 info="Will add more animals later!"
            )

    btn = gr.Button("Submit")
    # Two chained steps on click, both taking (history, prompt, image) and
    # both writing back to the chatbot: query_message appends the user turn,
    # then llm_response appends the model reply.
    clicked = btn.click(query_message,
                        [chatbot,text_box,image_box],
                        chatbot
                        ).then(llm_response,
                                [chatbot,text_box,image_box],
                                chatbot
                                )
# Top-level app: a heading plus the two apps above shown as tabs
# "APP #1" and "APP #2".
with gr.Blocks(theme='snehilsanyal/scikit-learn') as demo:
    gr.Markdown("# DEMO #")
    gr.TabbedInterface([app1, app2], ["APP #1", "APP #2"])

# Runs at import time: there is no `if __name__ == "__main__"` guard, so
# importing this module also starts the app.
demo.queue()
demo.launch()