Spaces:

shi-labs
/

VCoder

Build error

App Files Community

praeclarumjj3 committed on Dec 20, 2023

Commit

016e4dd

•

1 Parent(s): a97500b

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -58

app.py CHANGED Viewed

@@ -80,48 +80,51 @@ def flag_last_response(state, model_selector, request: gr.Request):
     vote_last_response(state, "flag", model_selector, request)
     return ("",) + (disable_btn,) * 3
-def regenerate(state, image_process_mode, seg_process_mode):
     state.messages[-1][-1] = None
     prev_human_msg = state.messages[-2]
     if type(prev_human_msg[1]) in (tuple, list):
-        prev_human_msg[1] = (*prev_human_msg[1][:2], image_process_mode, prev_human_msg[1][3], seg_process_mode, None, None)
     state.skip_next = False
-    return (state, state.to_gradio_chatbot(), "", None, None) + (disable_btn,) * 5
 def clear_history(request: gr.Request):
     state = default_conversation.copy()
-    return (state, state.to_gradio_chatbot(), "", None, None) + (disable_btn,) * 5
 def add_text(state, text, image, image_process_mode, seg, seg_process_mode, depth, depth_process_mode, request: gr.Request):
     logger.info(f"add_text. len: {len(text)}")
     if len(text) <= 0 and image is None:
         state.skip_next = True
-        return (state, state.to_gradio_chatbot(), "", None, None) + (no_change_btn,) * 5
     if args.moderate:
         flagged = violates_moderation(text)
         if flagged:
             state.skip_next = True
-            return (state, state.to_gradio_chatbot(), moderation_msg, None, None) + (
                 no_change_btn,) * 5
-    text = text[:1576]  # Hard cut-off
     if image is not None:
-        text = text[:1200]  # Hard cut-off for images
         if '<image>' not in text:
             text = '<image>\n' + text
         if seg is not None:
             if '<seg>' not in text:
                 text = '<seg>\n' + text
-        text = (text, image, image_process_mode, seg, seg_process_mode, None, None)
         if len(state.get_images(return_pil=True)) > 0:
             state = default_conversation.copy()
     state.append_message(state.roles[0], text)
     state.append_message(state.roles[1], None)
     state.skip_next = False
-    return (state, state.to_gradio_chatbot(), "", None, None) + (disable_btn,) * 5
 def http_bot(state, model_selector, temperature, top_p, max_new_tokens, request: gr.Request):
@@ -145,24 +148,6 @@ def http_bot(state, model_selector, temperature, top_p, max_new_tokens, request:
     # Construct prompt
     prompt = state.get_prompt()
-    all_images = state.get_images(return_pil=True)
-    all_image_hash = [hashlib.md5(image.tobytes()).hexdigest() for image in all_images]
-    for image, hash in zip(all_images, all_image_hash):
-        t = datetime.datetime.now()
-        filename = os.path.join(LOGDIR, "serve_images", f"{t.year}-{t.month:02d}-{t.day:02d}", f"{hash}.jpg")
-        if not os.path.isfile(filename):
-            os.makedirs(os.path.dirname(filename), exist_ok=True)
-            image.save(filename)
-    all_segs = state.get_segs(return_pil=True)
-    all_seg_hash = [hashlib.md5(seg.tobytes()).hexdigest() for seg in all_segs]
-    for seg, hash in zip(all_segs, all_seg_hash):
-        t = datetime.datetime.now()
-        filename = os.path.join(LOGDIR, "serve_segs", f"{t.year}-{t.month:02d}-{t.day:02d}", f"{hash}.jpg")
-        if not os.path.isfile(filename):
-            os.makedirs(os.path.dirname(filename), exist_ok=True)
-            seg.save(filename)
     # Make requests
     pload = {
         "model": model_name,
@@ -171,13 +156,15 @@ def http_bot(state, model_selector, temperature, top_p, max_new_tokens, request:
         "top_p": float(top_p),
         "max_new_tokens": min(int(max_new_tokens), 1536),
         "stop": state.sep if state.sep_style in [SeparatorStyle.SINGLE, SeparatorStyle.MPT] else state.sep2,
-        "images": f'List of {len(state.get_images())} images: {all_image_hash}',
-        "segs": f'List of {len(state.get_segs())} segs: {all_seg_hash}',
     }
     logger.info(f"==== request ====\n{pload}")
     pload['images'] = state.get_images()
     pload['segs'] = state.get_segs()
     state.messages[-1][-1] = "▌"
     yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5
@@ -207,24 +194,8 @@ def http_bot(state, model_selector, temperature, top_p, max_new_tokens, request:
     state.messages[-1][-1] = state.messages[-1][-1][:-1]
     yield (state, state.to_gradio_chatbot()) + (enable_btn,) * 5
-    finish_tstamp = time.time()
     logger.info(f"{output}")
-    with open(get_conv_log_filename(), "a") as fout:
-        data = {
-            "tstamp": round(finish_tstamp, 4),
-            "type": "chat",
-            "model": model_name,
-            "start": round(start_tstamp, 4),
-            "finish": round(start_tstamp, 4),
-            "state": state.dict(),
-            "images": all_image_hash,
-            "segs": all_seg_hash,
-            "ip": request.client.host,
-        }
-        fout.write(json.dumps(data) + "\n")
 title = "<h1 style='margin-bottom: -10px; text-align: center'>VCoder: Versatile Vision Encoders for Multimodal Large Language Models</h1>"
 # style='
@@ -284,6 +255,12 @@ def build_demo(embed_mode):
                     ["Crop", "Resize", "Pad", "Default"],
                     value="Default",
                     label="Preprocess for non-square Seg Map", visible=False)
                 with gr.Accordion("Parameters", open=False) as parameter_row:
                     temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.8, step=0.1, interactive=True, label="Temperature",)
@@ -307,13 +284,8 @@ def build_demo(embed_mode):
         cur_dir = os.path.dirname(os.path.abspath(__file__))
         gr.Examples(examples=[
-            [f"{cur_dir}/examples/people.jpg", f"{cur_dir}/examples/people_pan.png", "What objects can be seen in the image?", "0.9", "1.0"],
-            [f"{cur_dir}/examples/corgi.jpg", f"{cur_dir}/examples/corgi_pan.png", "What objects can be seen in the image?", "0.6", "0.7"],
-            [f"{cur_dir}/examples/friends.jpg", f"{cur_dir}/examples/friends_pan.png", "Can you count the number of people in the image?", "0.8", "0.9"],
-            [f"{cur_dir}/examples/friends.jpg", f"{cur_dir}/examples/friends_pan.png", "What is happening in the image?", "0.8", "0.9"],
-            [f"{cur_dir}/examples/suits.jpg", f"{cur_dir}/examples/suits_pan.png", "What objects can be seen in the image?", "0.5", "0.5"],
-            [f"{cur_dir}/examples/suits.jpg", f"{cur_dir}/examples/suits_ins.png", "What objects can be seen in the image?", "0.5", "0.5"],
-        ], inputs=[imagebox, segbox, textbox, temperature, top_p])
         if not embed_mode:
             gr.Markdown(tos_markdown)
@@ -327,16 +299,16 @@ def build_demo(embed_mode):
             [state, model_selector], [textbox, upvote_btn, downvote_btn, flag_btn])
         flag_btn.click(flag_last_response,
             [state, model_selector], [textbox, upvote_btn, downvote_btn, flag_btn])
-        regenerate_btn.click(regenerate, [state, image_process_mode, seg_process_mode],
-            [state, chatbot, textbox, imagebox, segbox] + btn_list).then(
             http_bot, [state, model_selector, temperature, top_p, max_output_tokens],
             [state, chatbot] + btn_list)
-        clear_btn.click(clear_history, None, [state, chatbot, textbox, imagebox, segbox] + btn_list)
-        textbox.submit(add_text, [state, textbox, imagebox, image_process_mode, segbox, seg_process_mode], [state, chatbot, textbox, imagebox, segbox] + btn_list
             ).then(http_bot, [state, model_selector, temperature, top_p, max_output_tokens],
                    [state, chatbot] + btn_list)
-        submit_btn.click(add_text, [state, textbox, imagebox, image_process_mode, segbox, seg_process_mode], [state, chatbot, textbox, imagebox, segbox] + btn_list
             ).then(http_bot, [state, model_selector, temperature, top_p, max_output_tokens],
                    [state, chatbot] + btn_list)

     vote_last_response(state, "flag", model_selector, request)
     return ("",) + (disable_btn,) * 3
+def regenerate(state, image_process_mode, seg_process_mode, depth_process_mode):
     state.messages[-1][-1] = None
     prev_human_msg = state.messages[-2]
     if type(prev_human_msg[1]) in (tuple, list):
+        prev_human_msg[1] = (*prev_human_msg[1][:2], image_process_mode, prev_human_msg[1][3], seg_process_mode, prev_human_msg[1][5], depth_process_mode)
     state.skip_next = False
+    return (state, state.to_gradio_chatbot(), "", None, None, None, None) + (disable_btn,) * 5
 def clear_history(request: gr.Request):
     state = default_conversation.copy()
+    return (state, state.to_gradio_chatbot(), "", None, None, None, None) + (disable_btn,) * 5
 def add_text(state, text, image, image_process_mode, seg, seg_process_mode, depth, depth_process_mode, request: gr.Request):
     logger.info(f"add_text. len: {len(text)}")
     if len(text) <= 0 and image is None:
         state.skip_next = True
+        return (state, state.to_gradio_chatbot(), "", None, None, None, None) + (no_change_btn,) * 5
     if args.moderate:
         flagged = violates_moderation(text)
         if flagged:
             state.skip_next = True
+            return (state, state.to_gradio_chatbot(), moderation_msg, None, None, None, None) + (
                 no_change_btn,) * 5
+    text = text[:1200]  # Hard cut-off
     if image is not None:
+        text = text[:864]  # Hard cut-off for images
         if '<image>' not in text:
             text = '<image>\n' + text
         if seg is not None:
             if '<seg>' not in text:
                 text = '<seg>\n' + text
+        if depth is not None:
+            if '<depth>' not in text:
+                text = '<depth>\n' + text
+        text = (text, image, image_process_mode, seg, seg_process_mode, depth, depth_process_mode)
         if len(state.get_images(return_pil=True)) > 0:
             state = default_conversation.copy()
     state.append_message(state.roles[0], text)
     state.append_message(state.roles[1], None)
     state.skip_next = False
+    return (state, state.to_gradio_chatbot(), "", None, None, None, None) + (disable_btn,) * 5
 def http_bot(state, model_selector, temperature, top_p, max_new_tokens, request: gr.Request):
     # Construct prompt
     prompt = state.get_prompt()
     # Make requests
     pload = {
         "model": model_name,
         "top_p": float(top_p),
         "max_new_tokens": min(int(max_new_tokens), 1536),
         "stop": state.sep if state.sep_style in [SeparatorStyle.SINGLE, SeparatorStyle.MPT] else state.sep2,
+        "images": f'List of {len(state.get_images())}',
+        "segs": f'List of {len(state.get_segs())}',
+        "depths": f'List of {len(state.get_depths())}',
     }
     logger.info(f"==== request ====\n{pload}")
     pload['images'] = state.get_images()
     pload['segs'] = state.get_segs()
+    pload['depths'] = state.get_depths()
     state.messages[-1][-1] = "▌"
     yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5
     state.messages[-1][-1] = state.messages[-1][-1][:-1]
     yield (state, state.to_gradio_chatbot()) + (enable_btn,) * 5
     logger.info(f"{output}")
 title = "<h1 style='margin-bottom: -10px; text-align: center'>VCoder: Versatile Vision Encoders for Multimodal Large Language Models</h1>"
 # style='
                     ["Crop", "Resize", "Pad", "Default"],
                     value="Default",
                     label="Preprocess for non-square Seg Map", visible=False)
+                depthbox = gr.Image(type="pil", label="Depth Map")
+                depth_process_mode = gr.Radio(
+                    ["Crop", "Resize", "Pad", "Default"],
+                    value="Default",
+                    label="Preprocess for non-square Depth Map", visible=False)
                 with gr.Accordion("Parameters", open=False) as parameter_row:
                     temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.8, step=0.1, interactive=True, label="Temperature",)
         cur_dir = os.path.dirname(os.path.abspath(__file__))
         gr.Examples(examples=[
+            [f"{cur_dir}/examples/suits.jpg", f"{cur_dir}/examples/suits_pan.png", f"{cur_dir}/examples/suits_depth.jpeg", "Can you describe the depth order of the objects in this image, from closest to farthest?", "0.5", "0.5"],
+        ], inputs=[imagebox, segbox, depthbox, textbox, temperature, top_p])
         if not embed_mode:
             gr.Markdown(tos_markdown)
             [state, model_selector], [textbox, upvote_btn, downvote_btn, flag_btn])
         flag_btn.click(flag_last_response,
             [state, model_selector], [textbox, upvote_btn, downvote_btn, flag_btn])
+        regenerate_btn.click(regenerate, [state, image_process_mode, seg_process_mode, depth_process_mode],
+            [state, chatbot, textbox, imagebox, segbox, depthbox] + btn_list).then(
             http_bot, [state, model_selector, temperature, top_p, max_output_tokens],
             [state, chatbot] + btn_list)
+        clear_btn.click(clear_history, None, [state, chatbot, textbox, imagebox, segbox, depthbox] + btn_list)
+        textbox.submit(add_text, [state, textbox, imagebox, image_process_mode, segbox, seg_process_mode, depthbox, depth_process_mode], [state, chatbot, textbox, imagebox, segbox, depthbox] + btn_list
             ).then(http_bot, [state, model_selector, temperature, top_p, max_output_tokens],
                    [state, chatbot] + btn_list)
+        submit_btn.click(add_text, [state, textbox, imagebox, image_process_mode, segbox, seg_process_mode, depthbox, depth_process_mode], [state, chatbot, textbox, imagebox, segbox, depthbox] + btn_list
             ).then(http_bot, [state, model_selector, temperature, top_p, max_output_tokens],
                    [state, chatbot] + btn_list)