hsienchen committed on
Commit
89a716b
1 Parent(s): 4958f5d

Create app3.py

Browse files
Files changed (1) hide show
  1. app3.py +111 -0
app3.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PIL.Image
2
+ import gradio as gr
3
+ import base64
4
+ import time
5
+ import os
6
+ import google.generativeai as genai
7
+
8
+ import pathlib
9
+
10
# Text-only and image-capable Gemini model handles; llm_response picks one
# depending on whether an image was supplied.
# NOTE(review): both are constructed before genai.configure() runs below —
# confirm the library defers client creation until the first request.
txt_model = genai.GenerativeModel('gemini-pro')
vis_model = genai.GenerativeModel('gemini-pro-vision')

import os

# The API key comes from the environment; os.getenv returns None when the
# variable is unset, and that None is passed straight to genai.configure.
GOOGLE_API_KEY=os.getenv('GOOGLE_API_KEY')

genai.configure(api_key=GOOGLE_API_KEY)
18
+
19
+ # Image to Base 64 Converter
20
def image_to_base64(image_path):
    """Return the contents of the file at *image_path* as a Base64 string.

    The file is read in binary mode, Base64-encoded, and the resulting
    bytes are decoded as UTF-8 so the value can be embedded in text.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    return base64.b64encode(raw_bytes).decode("utf-8")
24
+
25
+ # Function that takes User Inputs and displays it on ChatUI
26
def query_message(history, txt, img):
    """Append the user's turn to the chat history and return the history.

    Args:
        history: List of ``(user_message, bot_message)`` tuples. ``None`` is
            treated as an empty history.
        txt: The user's prompt text.
        img: Path of the selected image file, or a falsy value when no
            image was provided.

    Returns:
        The history list with one ``(user_message, None)`` entry appended.
        When an image is given, it is inlined into the user message as a
        Markdown image backed by a Base64 data URL.
    """
    # Local import keeps this block self-contained; mimetypes is stdlib.
    import mimetypes

    if history is None:
        history = []

    if not img:
        history += [(txt, None)]
        return history

    # Named so it does not shadow the imported ``base64`` module.
    encoded_image = image_to_base64(img)

    # Derive the MIME type from the file name instead of always claiming
    # JPEG; fall back to the previous value when it cannot be determined.
    guessed_type = mimetypes.guess_type(img)[0]
    if not guessed_type or not guessed_type.startswith("image/"):
        guessed_type = "image/jpeg"

    data_url = f"data:{guessed_type};base64,{encoded_image}"
    history += [(f"{txt} ![]({data_url})", None)]
    return history
34
+
35
+ # Function that takes User Inputs, generates Response and displays on Chat UI
36
def llm_response(history, text, img):
    """Generate the model's reply and append it to the chat history.

    Args:
        history: List of ``(user_message, bot_message)`` tuples.
        text: The prompt sent to the model.
        img: Path of the image to send along with the prompt, or a falsy
            value for a text-only request.

    Returns:
        The history list with one ``(None, reply_text)`` entry appended.
    """
    if not img:
        response = txt_model.generate_content(text)
    else:
        # Open the image in a context manager so the underlying file
        # handle is released once the request has been made.
        with PIL.Image.open(img) as image:
            response = vis_model.generate_content([text, image])

    history += [(None, response.text)]
    return history
47
+
48
+ # Interface Code- Selector method
49
+
50
def sentence_builder(animal, place):
    """Build the counting question for the given *animal* and *place*."""
    template = "how many {}s from the {} are shown in the picture?"
    return template.format(animal, place)
52
+
53
+
54
+ # gradio block
55
+
56
# First chat UI: image input, chat display, a dropdown of canned prompts and
# a submit button, followed by a short Markdown description.
# NOTE(review): the nesting below was reconstructed from a flattened diff
# view — confirm which components belong inside the Row.
with gr.Blocks(theme='freddyaboulton/dracula_revamped') as app1:
    # NOTE(review): `title` is assigned but never used in the visible code.
    title ="-COT-"
    with gr.Row():
        # type="filepath" hands the handlers a path string, which is what
        # query_message and llm_response open.
        image_box = gr.Image(type="filepath")

        chatbot = gr.Chatbot(
            scale = 2,
            height=750
        )
    # NOTE(review): the label/info text mentions animals while the choices
    # are image prompts — confirm the intended wording.
    text_box = gr.Dropdown(
        ["what is in the image", "provide alternative title for the image", "how many birds can be seen in the picture?"], label="Animal", info="Will add more animals later!"
    )

    btn = gr.Button("Submit")
    # On click: first show the user's turn (query_message), then append the
    # model's reply (llm_response); both write back to the chatbot.
    clicked = btn.click(query_message,
                        [chatbot,text_box,image_box],
                        chatbot
                        ).then(llm_response,
                               [chatbot,text_box,image_box],
                               chatbot
                               )
    gr.Markdown("""
    # Multimodal Chain-of-Thought Reasoning in Language Models

    <h5 align="center"><i>"Imagine learning a textbook without figures or tables."</i></h5>

    Multimodal-CoT incorporates vision features in a decoupled training framework. The framework consists of two training stages: (i) rationale generation and (ii) answer inference. Both stages share the same model architecture but differ in the input and output.
    """)
84
+
85
# Second chat UI: same components and event wiring as the first one, with a
# different theme and a Markdown heading instead of the description text.
# NOTE(review): the nesting below was reconstructed from a flattened diff
# view — confirm which components belong inside the Row.
with gr.Blocks(theme='snehilsanyal/scikit-learn') as app2:
    gr.Markdown("## MM 2BB ##")
    with gr.Row():
        # type="filepath" hands the handlers a path string, which is what
        # query_message and llm_response open.
        image_box = gr.Image(type="filepath")

        chatbot = gr.Chatbot(
            scale = 2,
            height=750
        )
    # NOTE(review): the label/info text mentions animals while the choices
    # are image prompts — confirm the intended wording.
    text_box = gr.Dropdown(
        ["what is in the image", "provide alternative title for the image", "how many birds can be seen in the picture?"], label="Animal", info="Will add more animals later!"
    )

    btn = gr.Button("Submit")
    # On click: first show the user's turn (query_message), then append the
    # model's reply (llm_response); both write back to the chatbot.
    clicked = btn.click(query_message,
                        [chatbot,text_box,image_box],
                        chatbot
                        ).then(llm_response,
                               [chatbot,text_box,image_box],
                               chatbot
                               )
106
# Top-level app: a heading plus a tabbed view that hosts the two chat UIs
# under the tab names "APP #1" and "APP #2".
# NOTE(review): the nesting was reconstructed from a flattened diff view —
# confirm the TabbedInterface is meant to be created inside this Blocks.
with gr.Blocks(theme='snehilsanyal/scikit-learn') as demo:
    gr.Markdown("# DEMO #")
    gr.TabbedInterface([app1, app2], ["APP #1", "APP #2"])

# Enable request queuing, then start the server.
demo.queue()
demo.launch()