Update app.py
app.py
CHANGED
@@ -25,30 +25,6 @@ def image_to_base64(image_path):
         encoded_string = base64.b64encode(img.read())
     return encoded_string.decode('utf-8')
 
-# Function that takes User Inputs and displays it on ChatUI
-def query_message(history,txt,img):
-    if not img:
-        history += [(txt,None)]
-        return history
-    base64 = image_to_base64(img)
-    data_url = f"data:image/jpeg;base64,{base64}"
-    history += [(f"{txt} ![]({data_url})", None)]
-    return history
-
-# Function that takes User Inputs, generates Response and displays on Chat UI
-def llm_response(history,text,img):
-    if not img:
-        response = txt_model.generate_content(text)
-        history += [(None,response.text)]
-        return history
-
-    else:
-        img = PIL.Image.open(img)
-        response = vis_model.generate_content([text,img])
-        history += [(None,response.text)]
-        return history
-
-# Function that takes User Inputs and displays it on ChatUI
 
 def output_query_message(img):
     if not img:
@@ -69,13 +45,7 @@ def output_llm_response(img):
     response = vis_model.generate_content([txt_prompt_1,img])
     return response.text
 
-
-# Interface Code- Selector method
-
-def sentence_builder(animal, place):
-    return f"""how many {animal}s from the {place} are shown in the picture?"""
 
-# gradio block
 
 with gr.Blocks(theme='snehilsanyal/scikit-learn') as app1:
     with gr.Column():
@@ -98,34 +68,10 @@ with gr.Blocks(theme='snehilsanyal/scikit-learn') as app1:
         Multimodal-CoT incorporates vision features in a decoupled training framework. The framework consists of two training stages: (i) rationale generation and (ii) answer inference. Both stages share the same model architecture but differ in the input and output.
         """)
 
-
-    gr.Markdown("## MM 2BB ##")
-    with gr.Row():
-        image_box = gr.Image(type="filepath")
-
-        chatbot = gr.Chatbot(
-            scale = 2,
-            height=750
-        )
-        text_box = gr.Dropdown(
-            ["what is in the image",
-             "provide alternative title for the image",
-             "how many birds can be seen in the picture?"],
-            label="Select--",
-            info="Will add more animals later!"
-        )
-
-    btn = gr.Button("Submit")
-    clicked = btn.click(query_message,
-                        [chatbot,text_box,image_box],
-                        chatbot
-                        ).then(llm_response,
-                               [chatbot,text_box],
-                               chatbot
-                               )
+
 with gr.Blocks(theme='snehilsanyal/scikit-learn') as demo:
     gr.Markdown("## SOP Camera ##")
-    gr.TabbedInterface([app1,
+    gr.TabbedInterface([app1, app1], ["Check #1", "Check #2"])
 
 demo.queue()
 demo.launch()
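
For reference, the change drops the in-tab chat wiring and composes the final demo with gr.TabbedInterface. The sketch below is a minimal, self-contained illustration of that Gradio pattern, not code from this Space: the two placeholder Blocks (tab_one, tab_two) and their Markdown content are assumptions standing in for app1, which the commit passes twice.

# Minimal sketch (assumed names, not from this repo): combining two Blocks apps
# into one tabbed demo with gr.TabbedInterface, then serving it with queue/launch.
import gradio as gr

with gr.Blocks() as tab_one:
    gr.Markdown("Placeholder content for the first tab.")   # stands in for app1

with gr.Blocks() as tab_two:
    gr.Markdown("Placeholder content for the second tab.")  # stands in for the second app1 copy

# TabbedInterface takes a list of Interface/Blocks objects and a matching list of tab names.
demo = gr.TabbedInterface([tab_one, tab_two], ["Check #1", "Check #2"])

if __name__ == "__main__":
    demo.queue()   # enable request queuing, as the Space does
    demo.launch()

In the commit itself, the equivalent call sits inside the with gr.Blocks(...) as demo: block under the "SOP Camera" heading, so the tabs render as part of that parent layout.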