Update app.py
app.py
CHANGED
@@ -25,30 +25,6 @@ def image_to_base64(image_path):
         encoded_string = base64.b64encode(img.read())
     return encoded_string.decode('utf-8')
 
-# Function that takes User Inputs and displays it on ChatUI
-def query_message(history,txt,img):
-    if not img:
-        history += [(txt,None)]
-        return history
-    base64 = image_to_base64(img)
-    data_url = f"data:image/jpeg;base64,{base64}"
-    history += [(f"{txt} ![]({data_url})", None)]
-    return history
-
-# Function that takes User Inputs, generates Response and displays on Chat UI
-def llm_response(history,text,img):
-    if not img:
-        response = txt_model.generate_content(text)
-        history += [(None,response.text)]
-        return history
-
-    else:
-        img = PIL.Image.open(img)
-        response = vis_model.generate_content([text,img])
-        history += [(None,response.text)]
-        return history
-
-# Function that takes User Inputs and displays it on ChatUI
 
 def output_query_message(img):
     if not img:
@@ -69,13 +45,7 @@ def output_llm_response(img):
     response = vis_model.generate_content([txt_prompt_1,img])
     return response.text
 
-
-# Interface Code- Selector method
-
-def sentence_builder(animal, place):
-    return f"""how many {animal}s from the {place} are shown in the picture?"""
 
-# gradio block
 
 with gr.Blocks(theme='snehilsanyal/scikit-learn') as app1:
     with gr.Column():
@@ -98,34 +68,10 @@ with gr.Blocks(theme='snehilsanyal/scikit-learn') as app1:
         Multimodal-CoT incorporates vision features in a decoupled training framework. The framework consists of two training stages: (i) rationale generation and (ii) answer inference. Both stages share the same model architecture but differ in the input and output.
         """)
 
-
-    gr.Markdown("## MM 2BB ##")
-    with gr.Row():
-        image_box = gr.Image(type="filepath")
-
-        chatbot = gr.Chatbot(
-            scale = 2,
-            height=750
-        )
-        text_box = gr.Dropdown(
-            ["what is in the image",
-             "provide alternative title for the image",
-             "how many birds can be seen in the picture?"],
-            label="Select--",
-            info="Will add more animals later!"
-        )
-
-    btn = gr.Button("Submit")
-    clicked = btn.click(query_message,
-                        [chatbot,text_box,image_box],
-                        chatbot
-                        ).then(llm_response,
-                               [chatbot,text_box],
-                               chatbot
-                               )
+
 with gr.Blocks(theme='snehilsanyal/scikit-learn') as demo:
     gr.Markdown("## SOP Camera ##")
-    gr.TabbedInterface([app1,
+    gr.TabbedInterface([app1, app1], ["Check #1", "Check #2"])
 
 demo.queue()
 demo.launch()
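
For reference, the change drops the in-tab chat wiring and composes the final demo with gr.TabbedInterface. The sketch below is a minimal, self-contained illustration of that Gradio pattern, not code from this Space: the two placeholder Blocks (tab_one, tab_two) and their Markdown content are assumptions standing in for app1, which the commit passes twice.

# Minimal sketch (assumed names, not from this repo): combining two Blocks apps
# into one tabbed demo with gr.TabbedInterface, then serving it with queue/launch.
import gradio as gr

with gr.Blocks() as tab_one:
    gr.Markdown("Placeholder content for the first tab.")   # stands in for app1

with gr.Blocks() as tab_two:
    gr.Markdown("Placeholder content for the second tab.")  # stands in for the second app1 copy

# TabbedInterface takes a list of Interface/Blocks objects and a matching list of tab names.
demo = gr.TabbedInterface([tab_one, tab_two], ["Check #1", "Check #2"])

if __name__ == "__main__":
    demo.queue()   # enable request queuing, as the Space does
    demo.launch()

In the commit itself, the equivalent call sits inside the with gr.Blocks(...) as demo: block under the "SOP Camera" heading, so the tabs render as part of that parent layout.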