hsienchen committed on
Commit
77fbc21
1 Parent(s): dac4eed

Create app2.py

Browse files
Files changed (1) hide show
  1. app2.py +86 -0
app2.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PIL.Image
2
+ import gradio as gr
3
+ import base64
4
+ import time
5
+ import os
6
+ import google.generativeai as genai
7
+
8
+ import pathlib
9
+
10
import os

# Read the Gemini API key from the environment (None when the variable is
# unset) and hand it to the SDK.
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

# Model handles: `txt_model` serves text-only prompts, `vis_model` serves
# prompts that are accompanied by an image.
txt_model = genai.GenerativeModel('gemini-pro')
vis_model = genai.GenerativeModel('gemini-pro-vision')
18
+
19
+ # Image to Base 64 Converter
20
def image_to_base64(image_path):
    """Return the contents of the file at *image_path* as a base64 string.

    The file is read in binary mode, base64-encoded, and the encoded bytes
    are decoded to a UTF-8 ``str``.
    """
    with open(image_path, 'rb') as image_file:
        raw_bytes = image_file.read()
    return base64.b64encode(raw_bytes).decode('utf-8')
24
+
25
+ # Function that takes User Inputs and displays it on ChatUI
26
def query_message(history, txt, img):
    """Append the user's turn (prompt plus optional image) to the chat history.

    Args:
        history: Existing chat history as a sequence of ``(user, bot)``
            pairs. ``None`` is treated as an empty history.
        txt: The prompt text chosen by the user.
        img: Filesystem path of the uploaded image, or a falsy value when
            no image was provided.

    Returns:
        A new history list whose last entry is the user's message with the
        bot slot set to ``None``. The list passed in is not modified.
    """
    import mimetypes  # local import: only needed to label the data URL

    # Work on a copy so the caller's list is never mutated.
    history = list(history or [])

    if not img:
        history.append((txt, None))
        return history

    # Label the data URL with the image's real type (guessed from the file
    # extension) instead of always claiming JPEG; fall back to JPEG when the
    # extension is missing or not an image type.
    mime_type, _ = mimetypes.guess_type(img)
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    # Named `encoded_image` rather than `base64` so the imported module is
    # not shadowed.
    encoded_image = image_to_base64(img)
    data_url = f"data:{mime_type};base64,{encoded_image}"

    # Skip the caption when no prompt was given so "None" is not displayed.
    caption = f"{txt} " if txt else ""
    history.append((f"{caption}![]({data_url})", None))
    return history
34
+
35
+ # Function that takes User Inputs, generates Response and displays on Chat UI
36
def llm_response(history, text, img):
    """Generate the model's reply and append it to the chat history.

    Args:
        history: Existing chat history as a sequence of ``(user, bot)``
            pairs. ``None`` is treated as an empty history.
        text: The prompt to send to the model.
        img: Filesystem path of the uploaded image, or a falsy value to use
            the text-only model.

    Returns:
        A new history list whose last entry is ``(None, reply)``. The list
        passed in is not modified.
    """
    # Work on a copy so the caller's list is never mutated.
    history = list(history or [])

    # Nothing to ask the model: answer in the chat instead of sending an
    # empty prompt to the API.
    if not text:
        history.append((None, "Please select a prompt before submitting."))
        return history

    if img:
        # The context manager closes the image file once the request is done.
        with PIL.Image.open(img) as image:
            response = vis_model.generate_content([text, image])
    else:
        response = txt_model.generate_content(text)

    try:
        reply = response.text
    except ValueError:
        # `response.text` raises ValueError when the response carries no
        # usable text part (for example when the request was blocked).
        reply = "The model returned no text for this request (it may have been blocked)."

    history.append((None, reply))
    return history
47
+
48
+ # Interface Code- Selector method
49
+
50
def sentence_builder(animal, place):
    """Build the question asking how many *animal*s from *place* are pictured."""
    question = f"how many {animal}s from the {place} are shown in the picture?"
    return question
52
+
53
+ # gradio block
54
+
55
# NOTE(review): the nesting below was reconstructed from a flattened listing.
# The image and the chatbot are assumed to share one row (the chatbot sets
# `scale`); confirm against the original layout.
with gr.Blocks(theme='snehilsanyal/scikit-learn') as app:
    gr.Markdown("## MM COT ##")

    # Uploaded image next to the conversation.
    with gr.Row():
        image_box = gr.Image(type="filepath")
        chatbot = gr.Chatbot(scale=2, height=750)

    # Fixed set of prompts the user can pick from.
    prompt_choices = [
        "what is in the image",
        "provide alternative title for the image",
        "how many birds can be seen in the picture?",
    ]
    text_box = gr.Dropdown(
        prompt_choices,
        label="Animal",
        info="Will add more animals later!",
    )

    btn = gr.Button("Submit")

    # Both callbacks take the same inputs and write back to the chatbot.
    handler_inputs = [chatbot, text_box, image_box]
    # Step 1: show the user's message. Step 2: append the model's reply.
    user_turn = btn.click(query_message, handler_inputs, chatbot)
    clicked = user_turn.then(llm_response, handler_inputs, chatbot)

    gr.Markdown("""
    # Multimodal Chain-of-Thought Reasoning in Language Models

    <h5 align="center"><i>"Imagine learning a textbook without figures or tables."</i></h5>

    Multimodal-CoT incorporates vision features in a decoupled training framework. The framework consists of two training stages: (i) rationale generation and (ii) answer inference. Both stages share the same model architecture but differ in the input and output.
    """)


app.queue()
app.launch()