Spaces: Build error

praeclarumjj3 committed
Commit • d3cee44
1 Parent(s): 0bd6903

:zap: Build space
Files changed:
- .DS_Store +0 -0
- .gitattributes +3 -0
- LICENSE +201 -0
- README.md +3 -3
- app.py +389 -0
- chat.py +205 -0
- examples/3.jpg +3 -0
- examples/3_ins.png +3 -0
- examples/3_pan.png +3 -0
- requirements.txt +30 -0
- vcoder_llava/.DS_Store +0 -0
- vcoder_llava/__init__.py +1 -0
- vcoder_llava/constants.py +12 -0
- vcoder_llava/data_utils.py +157 -0
- vcoder_llava/mm_utils.py +151 -0
- vcoder_llava/model/.DS_Store +0 -0
- vcoder_llava/model/__init__.py +3 -0
- vcoder_llava/model/apply_delta.py +48 -0
- vcoder_llava/model/builder.py +152 -0
- vcoder_llava/model/consolidate.py +29 -0
- vcoder_llava/model/language_model/llava_llama.py +165 -0
- vcoder_llava/model/language_model/vcoder_ds_llava_llama.py +145 -0
- vcoder_llava/model/language_model/vcoder_llava_llama.py +142 -0
- vcoder_llava/model/llava_arch.py +200 -0
- vcoder_llava/model/make_delta.py +52 -0
- vcoder_llava/model/multimodal_adapter/builder.py +49 -0
- vcoder_llava/model/multimodal_depth_adapter/builder.py +50 -0
- vcoder_llava/model/multimodal_encoder/builder.py +11 -0
- vcoder_llava/model/multimodal_encoder/clip_encoder.py +78 -0
- vcoder_llava/model/multimodal_projector/builder.py +51 -0
- vcoder_llava/model/utils.py +20 -0
- vcoder_llava/model/vcd/vcd_add_noise.py +28 -0
- vcoder_llava/model/vcd/vcd_sample.py +250 -0
- vcoder_llava/model/vcoder_ds_llava_arch.py +323 -0
- vcoder_llava/model/vcoder_llava_arch.py +254 -0
- vcoder_llava/questions.py +110 -0
- vcoder_llava/utils.py +126 -0
- vcoder_llava/vcoder_conversation.py +374 -0
.DS_Store
ADDED
Binary file (6.15 kB)
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
LICENSE
ADDED
@@ -0,0 +1,201 @@
(Standard Apache License, Version 2.0, January 2004 — full 201-line text added verbatim; see http://www.apache.org/licenses/LICENSE-2.0)
README.md
CHANGED
@@ -1,8 +1,8 @@
 ---
 title: VCoder
-emoji:
-colorFrom:
-colorTo:
+emoji: ✌️
+colorFrom: yellow
+colorTo: orange
 sdk: gradio
 sdk_version: 4.8.0
 app_file: app.py
app.py
ADDED
@@ -0,0 +1,389 @@
1 |
+
import argparse
|
2 |
+
import datetime
|
3 |
+
import json
|
4 |
+
import os
|
5 |
+
import time
|
6 |
+
|
7 |
+
import gradio as gr
|
8 |
+
import requests
|
9 |
+
import hashlib
|
10 |
+
|
11 |
+
from vcoder_llava.vcoder_conversation import (default_conversation, conv_templates,
|
12 |
+
SeparatorStyle)
|
13 |
+
from vcoder_llava.constants import LOGDIR
|
14 |
+
from vcoder_llava.utils import (build_logger, server_error_msg,
|
15 |
+
violates_moderation, moderation_msg)
|
16 |
+
from chat import Chat
|
17 |
+
|
18 |
+
|
19 |
+
logger = build_logger("gradio_app", "gradio_web_server.log")
|
20 |
+
|
21 |
+
headers = {"User-Agent": "VCoder Client"}
|
22 |
+
|
23 |
+
no_change_btn = gr.Button.update()
|
24 |
+
enable_btn = gr.Button.update(interactive=True)
|
25 |
+
disable_btn = gr.Button.update(interactive=False)
|
26 |
+
|
27 |
+
priority = {
|
28 |
+
"vicuna-13b": "aaaaaaa",
|
29 |
+
"koala-13b": "aaaaaab",
|
30 |
+
}
|
31 |
+
|
32 |
+
|
33 |
+
def get_conv_log_filename():
|
34 |
+
t = datetime.datetime.now()
|
35 |
+
name = os.path.join(LOGDIR, f"{t.year}-{t.month:02d}-{t.day:02d}-conv.json")
|
36 |
+
return name
|
37 |
+
|
38 |
+
|
39 |
+
get_window_url_params = """
|
40 |
+
function() {
|
41 |
+
const params = new URLSearchParams(window.location.search);
|
42 |
+
url_params = Object.fromEntries(params);
|
43 |
+
console.log(url_params);
|
44 |
+
return url_params;
|
45 |
+
}
|
46 |
+
"""
|
47 |
+
|
48 |
+
|
49 |
+
def load_demo_refresh_model_list(request: gr.Request):
|
50 |
+
logger.info(f"load_demo. ip: {request.client.host}")
|
51 |
+
state = default_conversation.copy()
|
52 |
+
dropdown_update = gr.Dropdown.update(
|
53 |
+
choices=models,
|
54 |
+
value=models[0] if len(models) > 0 else ""
|
55 |
+
)
|
56 |
+
return state, dropdown_update
|
57 |
+
|
58 |
+
|
59 |
+
def vote_last_response(state, vote_type, model_selector, request: gr.Request):
|
60 |
+
with open(get_conv_log_filename(), "a") as fout:
|
61 |
+
data = {
|
62 |
+
"tstamp": round(time.time(), 4),
|
63 |
+
"type": vote_type,
|
64 |
+
"model": model_selector,
|
65 |
+
"state": state.dict(),
|
66 |
+
}
|
67 |
+
fout.write(json.dumps(data) + "\n")
|
68 |
+
|
69 |
+
|
70 |
+
def upvote_last_response(state, model_selector, request: gr.Request):
|
71 |
+
vote_last_response(state, "upvote", model_selector, request)
|
72 |
+
return ("",) + (disable_btn,) * 3
|
73 |
+
|
74 |
+
|
75 |
+
def downvote_last_response(state, model_selector, request: gr.Request):
|
76 |
+
vote_last_response(state, "downvote", model_selector, request)
|
77 |
+
return ("",) + (disable_btn,) * 3
|
78 |
+
|
79 |
+
|
80 |
+
def flag_last_response(state, model_selector, request: gr.Request):
|
81 |
+
vote_last_response(state, "flag", model_selector, request)
|
82 |
+
return ("",) + (disable_btn,) * 3
|
83 |
+
|
84 |
+
def regenerate(state, image_process_mode, seg_process_mode):
|
85 |
+
state.messages[-1][-1] = None
|
86 |
+
prev_human_msg = state.messages[-2]
|
87 |
+
if type(prev_human_msg[1]) in (tuple, list):
|
88 |
+
prev_human_msg[1] = (*prev_human_msg[1][:2], image_process_mode, prev_human_msg[1][3], seg_process_mode, None, None)
|
89 |
+
state.skip_next = False
|
90 |
+
return (state, state.to_gradio_chatbot(), "", None, None) + (disable_btn,) * 5
|
91 |
+
|
92 |
+
|
93 |
+
def clear_history(request: gr.Request):
|
94 |
+
state = default_conversation.copy()
|
95 |
+
return (state, state.to_gradio_chatbot(), "", None, None) + (disable_btn,) * 5
|
96 |
+
|
97 |
+
|
98 |
+
def add_text(state, text, image, image_process_mode, seg, seg_process_mode, depth, depth_process_mode, request: gr.Request):
|
99 |
+
logger.info(f"add_text. len: {len(text)}")
|
100 |
+
if len(text) <= 0 and image is None:
|
101 |
+
state.skip_next = True
|
102 |
+
return (state, state.to_gradio_chatbot(), "", None, None) + (no_change_btn,) * 5
|
103 |
+
if args.moderate:
|
104 |
+
flagged = violates_moderation(text)
|
105 |
+
if flagged:
|
106 |
+
state.skip_next = True
|
107 |
+
return (state, state.to_gradio_chatbot(), moderation_msg, None, None) + (
|
108 |
+
no_change_btn,) * 5
|
109 |
+
|
110 |
+
text = text[:1576] # Hard cut-off
|
111 |
+
if image is not None:
|
112 |
+
text = text[:1200] # Hard cut-off for images
|
113 |
+
if '<image>' not in text:
|
114 |
+
text = '<image>\n' + text
|
115 |
+
if seg is not None:
|
116 |
+
if '<seg>' not in text:
|
117 |
+
text = '<seg>\n' + text
|
118 |
+
|
119 |
+
text = (text, image, image_process_mode, seg, seg_process_mode, None, None)
|
120 |
+
if len(state.get_images(return_pil=True)) > 0:
|
121 |
+
state = default_conversation.copy()
|
122 |
+
state.append_message(state.roles[0], text)
|
123 |
+
state.append_message(state.roles[1], None)
|
124 |
+
state.skip_next = False
|
125 |
+
return (state, state.to_gradio_chatbot(), "", None, None) + (disable_btn,) * 5
|
126 |
+
|
127 |
+
|
128 |
+
def http_bot(state, model_selector, temperature, top_p, max_new_tokens, request: gr.Request):
|
129 |
+
start_tstamp = time.time()
|
130 |
+
model_name = model_selector
|
131 |
+
|
132 |
+
if state.skip_next:
|
133 |
+
# This generate call is skipped due to invalid inputs
|
134 |
+
yield (state, state.to_gradio_chatbot()) + (no_change_btn,) * 5
|
135 |
+
return
|
136 |
+
|
137 |
+
if len(state.messages) == state.offset + 2:
|
138 |
+
# First round of conversation
|
139 |
+
if "llava" in model_name.lower():
|
140 |
+
template_name = "llava_v1"
|
141 |
+
new_state = conv_templates[template_name].copy()
|
142 |
+
new_state.append_message(new_state.roles[0], state.messages[-2][1])
|
143 |
+
new_state.append_message(new_state.roles[1], None)
|
144 |
+
state = new_state
|
145 |
+
|
146 |
+
# Construct prompt
|
147 |
+
prompt = state.get_prompt()
|
148 |
+
|
149 |
+
all_images = state.get_images(return_pil=True)
|
150 |
+
all_image_hash = [hashlib.md5(image.tobytes()).hexdigest() for image in all_images]
|
151 |
+
for image, hash in zip(all_images, all_image_hash):
|
152 |
+
t = datetime.datetime.now()
|
153 |
+
filename = os.path.join(LOGDIR, "serve_images", f"{t.year}-{t.month:02d}-{t.day:02d}", f"{hash}.jpg")
|
154 |
+
if not os.path.isfile(filename):
|
155 |
+
os.makedirs(os.path.dirname(filename), exist_ok=True)
|
156 |
+
image.save(filename)
|
157 |
+
|
158 |
+
all_segs = state.get_segs(return_pil=True)
|
159 |
+
all_seg_hash = [hashlib.md5(seg.tobytes()).hexdigest() for seg in all_segs]
|
160 |
+
for seg, hash in zip(all_segs, all_seg_hash):
|
161 |
+
t = datetime.datetime.now()
|
162 |
+
filename = os.path.join(LOGDIR, "serve_segs", f"{t.year}-{t.month:02d}-{t.day:02d}", f"{hash}.jpg")
|
163 |
+
if not os.path.isfile(filename):
|
164 |
+
os.makedirs(os.path.dirname(filename), exist_ok=True)
|
165 |
+
seg.save(filename)
|
166 |
+
|
167 |
+
# Make requests
|
168 |
+
pload = {
|
169 |
+
"model": model_name,
|
170 |
+
"prompt": prompt,
|
171 |
+
"temperature": float(temperature),
|
172 |
+
"top_p": float(top_p),
|
173 |
+
"max_new_tokens": min(int(max_new_tokens), 1536),
|
174 |
+
"stop": state.sep if state.sep_style in [SeparatorStyle.SINGLE, SeparatorStyle.MPT] else state.sep2,
|
175 |
+
"images": f'List of {len(state.get_images())} images: {all_image_hash}',
|
176 |
+
"segs": f'List of {len(state.get_segs())} segs: {all_seg_hash}',
|
177 |
+
}
|
178 |
+
logger.info(f"==== request ====\n{pload}")
|
179 |
+
|
180 |
+
pload['images'] = state.get_images()
|
181 |
+
pload['segs'] = state.get_segs()
|
182 |
+
|
183 |
+
state.messages[-1][-1] = "▌"
|
184 |
+
yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5
|
185 |
+
|
186 |
+
|
187 |
+
try:
|
188 |
+
# Stream output
|
189 |
+
response = chat.generate_stream_gate(pload)
|
190 |
+
for chunk in response:
|
191 |
+
if chunk:
|
192 |
+
data = json.loads(chunk.decode())
|
193 |
+
if data["error_code"] == 0:
|
194 |
+
output = data["text"][len(prompt):].strip()
|
195 |
+
state.messages[-1][-1] = output + "▌"
|
196 |
+
yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5
|
197 |
+
else:
|
198 |
+
output = data["text"] + f" (error_code: {data['error_code']})"
|
199 |
+
state.messages[-1][-1] = output
|
200 |
+
yield (state, state.to_gradio_chatbot()) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
|
201 |
+
return
|
202 |
+
time.sleep(0.03)
|
203 |
+
except:
|
204 |
+
state.messages[-1][-1] = server_error_msg
|
205 |
+
yield (state, state.to_gradio_chatbot()) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
|
206 |
+
return
|
207 |
+
|
208 |
+
state.messages[-1][-1] = state.messages[-1][-1][:-1]
|
209 |
+
yield (state, state.to_gradio_chatbot()) + (enable_btn,) * 5
|
210 |
+
|
211 |
+
finish_tstamp = time.time()
|
212 |
+
logger.info(f"{output}")
|
213 |
+
|
214 |
+
with open(get_conv_log_filename(), "a") as fout:
|
215 |
+
data = {
|
216 |
+
"tstamp": round(finish_tstamp, 4),
|
217 |
+
"type": "chat",
|
218 |
+
"model": model_name,
|
219 |
+
"start": round(start_tstamp, 4),
|
220 |
+
"finish": round(start_tstamp, 4),
|
221 |
+
"state": state.dict(),
|
222 |
+
"images": all_image_hash,
|
223 |
+
"segs": all_seg_hash,
|
224 |
+
"ip": request.client.host,
|
225 |
+
}
|
226 |
+
fout.write(json.dumps(data) + "\n")
|
227 |
+
|
228 |
+
title_markdown = ("""
|
229 |
+
# 🌋 LLaVA: Large Language and Vision Assistant
|
230 |
+
[[Project Page]](https://llava-vl.github.io) [[Paper]](https://arxiv.org/abs/2304.08485) [[Code]](https://github.com/haotian-liu/LLaVA) [[Model]](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)
|
231 |
+
""")
|
232 |
+
|
233 |
+
tos_markdown = ("""
|
234 |
+
### Terms of use
|
235 |
+
By using this service, users are required to agree to the following terms:
|
236 |
+
The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. The service may collect user dialogue data for future research.
|
237 |
+
Please click the "Flag" button if you get any inappropriate answer! We will collect those to keep improving our moderator.
|
238 |
+
For an optimal experience, please use desktop computers for this demo, as mobile devices may compromise its quality.
|
239 |
+
""")
|
240 |
+
|
241 |
+
|
242 |
+
learn_more_markdown = ("""
|
243 |
+
### License
|
244 |
+
The service is a research preview intended for non-commercial use only, subject to the model [License](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) of LLaMA, [Terms of Use](https://openai.com/policies/terms-of-use) of the data generated by OpenAI, and [Privacy Practices](https://chrome.google.com/webstore/detail/sharegpt-share-your-chatg/daiacboceoaocpibfodeljbdfacokfjb) of ShareGPT. Please contact us if you find any potential violation.
|
245 |
+
""")
|
246 |
+
|
247 |
+
block_css = """
|
248 |
+
|
249 |
+
#buttons button {
|
250 |
+
min-width: min(120px,100%);
|
251 |
+
}
|
252 |
+
|
253 |
+
"""
|
254 |
+
|
255 |
+
def build_demo(embed_mode):
|
256 |
+
|
257 |
+
textbox = gr.Textbox(show_label=False, placeholder="Enter text and press ENTER", container=False)
|
258 |
+
with gr.Blocks(title="LLaVA", theme=gr.themes.Default(), css=block_css) as demo:
|
259 |
+
state = gr.State()
|
260 |
+
|
261 |
+
if not embed_mode:
|
262 |
+
gr.Markdown(title_markdown)
|
263 |
+
|
264 |
+
with gr.Row():
|
265 |
+
with gr.Column(scale=3):
|
266 |
+
with gr.Row(elem_id="model_selector_row"):
|
267 |
+
model_selector = gr.Dropdown(
|
268 |
+
choices=models,
|
269 |
+
value=models[0] if len(models) > 0 else "",
|
270 |
+
interactive=True,
|
271 |
+
show_label=False,
|
272 |
+
container=False)
|
273 |
+
|
274 |
+
# with gr.Row():
|
275 |
+
imagebox = gr.Image(type="pil", label="Image Input")
|
276 |
+
image_process_mode = gr.Radio(
|
277 |
+
["Crop", "Resize", "Pad", "Default"],
|
278 |
+
value="Default",
|
279 |
+
label="Preprocess for non-square image", visible=False)
|
280 |
+
|
281 |
+
segbox = gr.Image(type="pil", label="Seg Map")
|
282 |
+
seg_process_mode = gr.Radio(
|
283 |
+
["Crop", "Resize", "Pad", "Default"],
|
284 |
+
value="Default",
|
285 |
+
label="Preprocess for non-square Seg Map", visible=False)
|
286 |
+
|
287 |
+
cur_dir = os.path.dirname(os.path.abspath(__file__))
|
288 |
+
gr.Examples(examples=[
|
289 |
+
[f"{cur_dir}/examples/3.jpg", f"{cur_dir}/examples/3_pan.png", "What objects can be seen in the image?"],
|
290 |
+
[f"{cur_dir}/examples/3.jpg", f"{cur_dir}/examples/3_ins.png", "What objects can be seen in the image?"],
|
291 |
+
], inputs=[imagebox, segbox, textbox])
|
292 |
+
|
293 |
+
with gr.Accordion("Parameters", open=False) as parameter_row:
|
294 |
+
temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.2, step=0.1, interactive=True, label="Temperature",)
|
295 |
+
top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, step=0.1, interactive=True, label="Top P",)
|
296 |
+
max_output_tokens = gr.Slider(minimum=0, maximum=1024, value=512, step=64, interactive=True, label="Max output tokens",)
|
297 |
+
|
298 |
+
with gr.Column(scale=8):
|
299 |
+
chatbot = gr.Chatbot(elem_id="chatbot", label="VCoder Chatbot", height=550)
|
300 |
+
with gr.Row():
|
301 |
+
with gr.Column(scale=8):
|
302 |
+
textbox.render()
|
303 |
+
with gr.Column(scale=1, min_width=50):
|
304 |
+
submit_btn = gr.Button(value="Send", variant="primary")
|
305 |
+
with gr.Row(elem_id="buttons") as button_row:
|
306 |
+
upvote_btn = gr.Button(value="👍 Upvote", interactive=False)
|
307 |
+
downvote_btn = gr.Button(value="👎 Downvote", interactive=False)
|
308 |
+
flag_btn = gr.Button(value="⚠️ Flag", interactive=False)
|
309 |
+
#stop_btn = gr.Button(value="⏹️ Stop Generation", interactive=False)
|
310 |
+
regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False)
|
311 |
+
clear_btn = gr.Button(value="🗑️ Clear", interactive=False)
|
312 |
+
|
313 |
+
if not embed_mode:
|
314 |
+
gr.Markdown(tos_markdown)
|
315 |
+
gr.Markdown(learn_more_markdown)
|
316 |
+
|
317 |
+
# Register listeners
|
318 |
+
btn_list = [upvote_btn, downvote_btn, flag_btn, regenerate_btn, clear_btn]
|
319 |
+
upvote_btn.click(upvote_last_response,
|
320 |
+
[state, model_selector], [textbox, upvote_btn, downvote_btn, flag_btn])
|
321 |
+
downvote_btn.click(downvote_last_response,
|
322 |
+
[state, model_selector], [textbox, upvote_btn, downvote_btn, flag_btn])
|
323 |
+
flag_btn.click(flag_last_response,
|
324 |
+
[state, model_selector], [textbox, upvote_btn, downvote_btn, flag_btn])
|
325 |
+
regenerate_btn.click(regenerate, [state, image_process_mode, seg_process_mode],
|
326 |
+
[state, chatbot, textbox, imagebox, segbox] + btn_list).then(
|
327 |
+
http_bot, [state, model_selector, temperature, top_p, max_output_tokens],
|
328 |
+
[state, chatbot] + btn_list)
|
329 |
+
clear_btn.click(clear_history, None, [state, chatbot, textbox, imagebox, segbox] + btn_list)
|
330 |
+
|
331 |
+
textbox.submit(add_text, [state, textbox, imagebox, image_process_mode, segbox, seg_process_mode], [state, chatbot, textbox, imagebox, segbox] + btn_list
|
332 |
+
).then(http_bot, [state, model_selector, temperature, top_p, max_output_tokens],
|
333 |
+
[state, chatbot] + btn_list)
|
334 |
+
submit_btn.click(add_text, [state, textbox, imagebox, image_process_mode, segbox, seg_process_mode], [state, chatbot, textbox, imagebox, segbox] + btn_list
|
335 |
+
).then(http_bot, [state, model_selector, temperature, top_p, max_output_tokens],
|
336 |
+
[state, chatbot] + btn_list)
|
337 |
+
|
338 |
+
demo.load(load_demo_refresh_model_list, None, [state, model_selector])
|
339 |
+
|
340 |
+
return demo
|
341 |
+
|
342 |
+
|
343 |
+
if __name__ == "__main__":
|
344 |
+
parser = argparse.ArgumentParser()
|
345 |
+
parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
|
346 |
+
parser.add_argument("--model-base", type=str, default=None)
|
347 |
+
parser.add_argument("--model-name", type=str)
|
348 |
+
parser.add_argument("--load-8bit", action="store_true")
|
349 |
+
parser.add_argument("--load-4bit", action="store_true")
|
350 |
+
parser.add_argument("--device", type=str, default="cuda")
|
351 |
+
parser.add_argument("--share", action="store_true")
|
352 |
+
parser.add_argument("--moderate", action="store_true")
|
353 |
+
parser.add_argument("--embed", action="store_true")
|
354 |
+
parser.add_argument("--concurrency-count", type=int, default=10)
|
355 |
+
parser.add_argument("--host", type=str, default="0.0.0.0")
|
356 |
+
parser.add_argument("--port", type=int)
|
357 |
+
args = parser.parse_args()
|
358 |
+
logger.info(f"args: {args}")
|
359 |
+
|
360 |
+
if args.model_name is None:
|
361 |
+
model_paths = args.model_path.split("/")
|
362 |
+
if model_paths[-1].startswith('checkpoint-'):
|
363 |
+
model_name = model_paths[-2] + "_" + model_paths[-1]
|
364 |
+
else:
|
365 |
+
model_name = model_paths[-1]
|
366 |
+
else:
|
367 |
+
model_name = args.model_name
|
368 |
+
|
369 |
+
models = [model_name]
|
370 |
+
chat = Chat(
|
371 |
+
args.model_path,
|
372 |
+
args.model_base,
|
373 |
+
args.model_name,
|
374 |
+
args.load_8bit,
|
375 |
+
args.load_4bit,
|
376 |
+
args.device,
|
377 |
+
logger
|
378 |
+
)
|
379 |
+
|
380 |
+
logger.info(args)
|
381 |
+
demo = build_demo(args.embed)
|
382 |
+
demo.queue(
|
383 |
+
concurrency_count=args.concurrency_count,
|
384 |
+
api_open=False
|
385 |
+
).launch(
|
386 |
+
server_name=args.host,
|
387 |
+
server_port=args.port,
|
388 |
+
share=args.share
|
389 |
+
)
|
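For reference, the contract between app.py's http_bot and chat.py's Chat.generate_stream_gate is a generator of byte chunks, each a UTF-8 encoded JSON object carrying "text" and "error_code"; http_bot strips the prompt prefix from "text" before rendering. Below is a minimal, self-contained sketch of that convention — the fake_stream generator is a stand-in for illustration, not part of this commit.

import json

def fake_stream(prompt):
    # Stand-in for Chat.generate_stream_gate: the real generator wraps
    # model.generate() with a TextIteratorStreamer and yields growing text.
    partial = prompt
    for word in ["The", " image", " shows", " two", " people."]:
        partial += word
        yield json.dumps({"text": partial, "error_code": 0}).encode()

prompt = "USER: <image>\nWhat objects can be seen in the image? ASSISTANT: "
for chunk in fake_stream(prompt):
    data = json.loads(chunk.decode())
    if data["error_code"] == 0:
        # Same slicing http_bot applies before appending the cursor character.
        print(data["text"][len(prompt):].strip())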
chat.py
ADDED
@@ -0,0 +1,205 @@
1 |
+
"""
|
2 |
+
A model worker executes the model.
|
3 |
+
"""
|
4 |
+
import argparse
|
5 |
+
import json
|
6 |
+
import torch
|
7 |
+
|
8 |
+
from vcoder_llava.utils import server_error_msg
|
9 |
+
from vcoder_llava.model.builder import load_pretrained_model
|
10 |
+
from vcoder_llava.mm_utils import process_images, load_image_from_base64, tokenizer_seg_token, tokenizer_depth_seg_token, tokenizer_image_token, KeywordsStoppingCriteria
|
11 |
+
from vcoder_llava.constants import (
|
12 |
+
IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN,
|
13 |
+
SEG_TOKEN_INDEX, DEFAULT_SEG_TOKEN,
|
14 |
+
DEPTH_TOKEN_INDEX, DEFAULT_DEPTH_TOKEN
|
15 |
+
)
|
16 |
+
from transformers import TextIteratorStreamer
|
17 |
+
|
18 |
+
class Chat:
|
19 |
+
def __init__(self, model_path, model_base, model_name,
|
20 |
+
load_8bit, load_4bit, device, logger):
|
21 |
+
if model_path.endswith("/"):
|
22 |
+
model_path = model_path[:-1]
|
23 |
+
if model_name is None:
|
24 |
+
model_paths = model_path.split("/")
|
25 |
+
if model_paths[-1].startswith('checkpoint-'):
|
26 |
+
self.model_name = model_paths[-2] + "_" + model_paths[-1]
|
27 |
+
else:
|
28 |
+
self.model_name = model_paths[-1]
|
29 |
+
else:
|
30 |
+
self.model_name = model_name
|
31 |
+
|
32 |
+
self.device = device
|
33 |
+
logger.info(f"Loading the model {self.model_name} ...")
|
34 |
+
self.tokenizer, self.model, self.image_processor, self.seg_image_processor, self.depth_image_processor, self.context_len = load_pretrained_model(
|
35 |
+
model_path, model_base, self.model_name, load_8bit, load_4bit, device=self.device)
|
36 |
+
self.is_multimodal = 'llava' in self.model_name.lower()
|
37 |
+
self.is_seg = "seg_llava" in self.model_name.lower()
|
38 |
+
self.is_depth = False
|
39 |
+
|
40 |
+
@torch.inference_mode()
|
41 |
+
def generate_stream(self, params):
|
42 |
+
tokenizer, model, image_processor, seg_image_processor, depth_image_processor = self.tokenizer, self.model, self.image_processor, self.seg_image_processor, self.depth_image_processor
|
43 |
+
|
44 |
+
prompt = params["prompt"]
|
45 |
+
ori_prompt = prompt
|
46 |
+
images = params.get("images", None)
|
47 |
+
segs = params.get("segs", None)
|
48 |
+
depths = params.get("depths", None)
|
49 |
+
num_image_tokens = 0
|
50 |
+
num_seg_tokens = 0
|
51 |
+
num_depth_tokens = 0
|
52 |
+
if images is not None and len(images) > 0 and self.is_multimodal:
|
53 |
+
if len(images) > 0:
|
54 |
+
if len(images) != prompt.count(DEFAULT_IMAGE_TOKEN):
|
55 |
+
raise ValueError("Number of images does not match number of <image> tokens in prompt")
|
56 |
+
|
57 |
+
images = [load_image_from_base64(image) for image in images]
|
58 |
+
images = process_images(images, image_processor, model.config)
|
59 |
+
|
60 |
+
if type(images) is list:
|
61 |
+
images = [image.to(self.model.device, dtype=torch.float16) for image in images]
|
62 |
+
else:
|
63 |
+
images = images.to(self.model.device, dtype=torch.float16)
|
64 |
+
|
65 |
+
replace_token = DEFAULT_IMAGE_TOKEN
|
66 |
+
prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token)
|
67 |
+
num_image_tokens = prompt.count(replace_token) * model.get_vision_tower().num_patches
|
68 |
+
|
69 |
+
if segs is not None and len(segs) > 0 and self.is_seg:
|
70 |
+
if len(segs) != prompt.count(DEFAULT_SEG_TOKEN):
|
71 |
+
raise ValueError("Number of segs does not match number of <seg> tokens in prompt")
|
72 |
+
|
73 |
+
segs = [load_image_from_base64(seg) for seg in segs]
|
74 |
+
segs = process_images(segs, seg_image_processor, model.config)
|
75 |
+
|
76 |
+
if type(segs) is list:
|
77 |
+
segs = [seg.to(self.model.device, dtype=torch.float16) for seg in segs]
|
78 |
+
else:
|
79 |
+
segs = segs.to(self.model.device, dtype=torch.float16)
|
80 |
+
|
81 |
+
replace_seg_token = DEFAULT_SEG_TOKEN
|
82 |
+
prompt = prompt.replace(DEFAULT_SEG_TOKEN, replace_seg_token)
|
83 |
+
num_seg_tokens = prompt.count(replace_seg_token) * model.get_vision_tower().num_patches
|
84 |
+
|
85 |
+
if depths is not None and len(depths) > 0 and self.is_depth:
|
86 |
+
if len(depths) != prompt.count(DEFAULT_DEPTH_TOKEN):
|
87 |
+
raise ValueError("Number of depths does not match number of <depth> tokens in prompt")
|
88 |
+
|
89 |
+
depths = [load_image_from_base64(depth) for depth in depths]
|
90 |
+
depths = process_images(depths, depth_image_processor, model.config)
|
91 |
+
|
92 |
+
if type(depths) is list:
|
93 |
+
depths = [depth.to(self.model.device, dtype=torch.float16) for depth in depths]
|
94 |
+
else:
|
95 |
+
depths = depths.to(self.model.device, dtype=torch.float16)
|
96 |
+
|
97 |
+
replace_depth_token = DEFAULT_DEPTH_TOKEN
|
98 |
+
prompt = prompt.replace(DEFAULT_DEPTH_TOKEN, replace_depth_token)
|
99 |
+
num_depth_tokens = prompt.count(replace_depth_token) * model.get_vision_tower().num_patches
|
100 |
+
else:
|
101 |
+
depths = None
|
102 |
+
else:
|
103 |
+
segs = None
|
104 |
+
depths = None
|
105 |
+
else:
|
106 |
+
images = None
|
107 |
+
segs = None
|
108 |
+
depths = None
|
109 |
+
image_args = {"images": images, "segs": segs, "depths": depths}
|
110 |
+
else:
|
111 |
+
images = None
|
112 |
+
segs = None
|
113 |
+
depths = None
|
114 |
+
image_args = {}
|
115 |
+
|
116 |
+
temperature = float(params.get("temperature", 1.0))
|
117 |
+
top_p = float(params.get("top_p", 1.0))
|
118 |
+
max_context_length = getattr(model.config, 'max_position_embeddings', 2048)
|
119 |
+
max_new_tokens = min(int(params.get("max_new_tokens", 256)), 1024)
|
120 |
+
stop_str = params.get("stop", None)
|
121 |
+
do_sample = True if temperature > 0.001 else False
|
122 |
+
|
123 |
+
if self.is_seg:
|
124 |
+
if self.is_depth:
|
125 |
+
input_ids = tokenizer_depth_seg_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, SEG_TOKEN_INDEX, DEPTH_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(self.device)
|
126 |
+
else:
|
127 |
+
input_ids = tokenizer_seg_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, SEG_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(self.device)
|
128 |
+
else:
|
129 |
+
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(self.device)
|
130 |
+
keywords = [stop_str]
|
131 |
+
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
|
132 |
+
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=15)
|
133 |
+
|
134 |
+
max_new_tokens = min(max_new_tokens, max_context_length - input_ids.shape[-1] - num_image_tokens - num_seg_tokens - num_depth_tokens)
|
135 |
+
|
136 |
+
if max_new_tokens < 1:
|
137 |
+
yield json.dumps({"text": ori_prompt + "Exceeds max token length. Please start a new conversation, thanks.", "error_code": 0}).encode() + b"\0"
|
138 |
+
return
|
139 |
+
|
140 |
+
generated_text = model.generate(
|
141 |
+
inputs=input_ids,
|
142 |
+
do_sample=do_sample,
|
143 |
+
temperature=temperature,
|
144 |
+
top_p=top_p,
|
145 |
+
max_new_tokens=max_new_tokens,
|
146 |
+
streamer=streamer,
|
147 |
+
stopping_criteria=[stopping_criteria],
|
148 |
+
use_cache=True,
|
149 |
+
**image_args
|
150 |
+
)
|
151 |
+
# thread.start()
|
152 |
+
|
153 |
+
generated_text = ori_prompt
|
154 |
+
for new_text in streamer:
|
155 |
+
generated_text += new_text
|
156 |
+
if generated_text.endswith(stop_str):
|
157 |
+
generated_text = generated_text[:-len(stop_str)]
|
158 |
+
yield json.dumps({"text": generated_text, "error_code": 0}).encode()
|
159 |
+
|
160 |
+
def generate_stream_gate(self, params):
|
161 |
+
try:
|
162 |
+
for x in self.generate_stream(params):
|
163 |
+
yield x
|
164 |
+
except ValueError as e:
|
165 |
+
print("Caught ValueError:", e)
|
166 |
+
ret = {
|
167 |
+
"text": server_error_msg,
|
168 |
+
"error_code": 1,
|
169 |
+
}
|
170 |
+
yield json.dumps(ret).encode()
|
171 |
+
except torch.cuda.CudaError as e:
|
172 |
+
print("Caught torch.cuda.CudaError:", e)
|
173 |
+
ret = {
|
174 |
+
"text": server_error_msg,
|
175 |
+
"error_code": 1,
|
176 |
+
}
|
177 |
+
yield json.dumps(ret).encode()
|
178 |
+
except Exception as e:
|
179 |
+
print("Caught Unknown Error", e)
|
180 |
+
ret = {
|
181 |
+
"text": server_error_msg,
|
182 |
+
"error_code": 1,
|
183 |
+
}
|
184 |
+
yield json.dumps(ret).encode()
|
185 |
+
|
186 |
+
|
187 |
+
if __name__ == "__main__":
|
188 |
+
parser = argparse.ArgumentParser()
|
189 |
+
parser.add_argument("--host", type=str, default="localhost")
|
190 |
+
parser.add_argument("--port", type=int, default=21002)
|
191 |
+
parser.add_argument("--worker-address", type=str,
|
192 |
+
default="http://localhost:21002")
|
193 |
+
parser.add_argument("--controller-address", type=str,
|
194 |
+
default="http://localhost:21001")
|
195 |
+
parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
|
196 |
+
parser.add_argument("--model-base", type=str, default=None)
|
197 |
+
parser.add_argument("--model-name", type=str)
|
198 |
+
parser.add_argument("--device", type=str, default="cuda")
|
199 |
+
parser.add_argument("--multi-modal", action="store_true", help="Multimodal mode is automatically detected with model name, please make sure `llava` is included in the model path.")
|
200 |
+
parser.add_argument("--limit-model-concurrency", type=int, default=5)
|
201 |
+
parser.add_argument("--stream-interval", type=int, default=1)
|
202 |
+
parser.add_argument("--no-register", action="store_true")
|
203 |
+
parser.add_argument("--load-8bit", action="store_true")
|
204 |
+
parser.add_argument("--load-4bit", action="store_true")
|
205 |
+
args = parser.parse_args()
|
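A hedged sketch of the params dict Chat.generate_stream expects, mirroring the keys read in chat.py (prompt, images, segs, temperature, top_p, max_new_tokens, stop). Images and seg maps travel as base64 strings because the worker decodes them with load_image_from_base64. The prompt wording, image size, and stop string below are illustrative assumptions, not values taken from the commit.

import base64, json
from io import BytesIO
from PIL import Image

def to_b64(img):
    # Encode a PIL image the way load_image_from_base64 expects to decode it.
    buf = BytesIO()
    img.save(buf, format="PNG")
    return base64.b64encode(buf.getvalue()).decode()

params = {
    "prompt": "USER: <seg>\n<image>\nWhat objects can be seen in the image? ASSISTANT: ",
    "images": [to_b64(Image.new("RGB", (336, 336), "white"))],   # synthetic image
    "segs": [to_b64(Image.new("RGB", (336, 336), "black"))],     # synthetic seg map
    "temperature": 0.2,
    "top_p": 0.7,
    "max_new_tokens": 512,
    "stop": "</s>",   # assumed separator for the llava_v1 template
}

# chat = Chat(model_path, model_base, model_name, load_8bit, load_4bit, "cuda", logger)
# for chunk in chat.generate_stream_gate(params):
#     print(json.loads(chunk.decode())["text"])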
examples/3.jpg
ADDED
(stored with Git LFS)
examples/3_ins.png
ADDED
(stored with Git LFS)
examples/3_pan.png
ADDED
(stored with Git LFS)
requirements.txt
ADDED
@@ -0,0 +1,30 @@
--extra-index-url https://download.pytorch.org/whl/cu117
torch==2.0.0+cu117
packaging
Pillow
huggingface_hub
matplotlib
flash-attn
gradio
fastapi
numpy
requests
sentencepiece
tokenizers>=0.12.1
uvicorn
chardet
shortuuid
httpx==0.24.0
spacy
inflect
peft==0.4.0
num2words
transformers==4.31.0
accelerate==0.21.0
bitsandbytes==0.41.0
scikit-learn==1.2.2
sentencepiece==0.1.99
einops==0.6.1
einops-exts==0.0.4
timm==0.6.13
gradio_client==0.2.9
vcoder_llava/.DS_Store
ADDED
Binary file (6.15 kB)
vcoder_llava/__init__.py
ADDED
@@ -0,0 +1 @@
from .model import LlavaLlamaForCausalLM, VCoderLlavaLlamaForCausalLM, VCoderDSLlavaLlamaForCausalLM
vcoder_llava/constants.py
ADDED
@@ -0,0 +1,12 @@
LOGDIR = "."

# Model Constants
IGNORE_INDEX = -100
IMAGE_TOKEN_INDEX = -200
DEFAULT_IMAGE_TOKEN = "<image>"

SEG_TOKEN_INDEX = -300
DEFAULT_SEG_TOKEN = "<seg>"

DEPTH_TOKEN_INDEX = -400
DEFAULT_DEPTH_TOKEN = "<depth>"
vcoder_llava/data_utils.py
ADDED
@@ -0,0 +1,157 @@
1 |
+
import nltk
|
2 |
+
import spacy
|
3 |
+
from word2number import w2n
|
4 |
+
import inflect
|
5 |
+
from num2words import num2words
|
6 |
+
p = inflect.engine()
|
7 |
+
import numpy as np
|
8 |
+
import random
|
9 |
+
|
10 |
+
nltk.download('punkt')
|
11 |
+
nltk.download('averaged_perceptron_tagger')
|
12 |
+
nlp = spacy.load('en_core_web_sm')
|
13 |
+
|
14 |
+
# object names with two words
|
15 |
+
SPECIAL_WORDS = ['baseball bat',
|
16 |
+
'baseball glove',
|
17 |
+
'cell phone',
|
18 |
+
'dining table',
|
19 |
+
'fire hydrant',
|
20 |
+
'french fries',
|
21 |
+
'hair drier',
|
22 |
+
'hot dog',
|
23 |
+
'parking meter',
|
24 |
+
'potted plant',
|
25 |
+
'soccer ball',
|
26 |
+
'soccer player',
|
27 |
+
'sports ball',
|
28 |
+
'stop sign',
|
29 |
+
'teddy bear',
|
30 |
+
'tennis racket',
|
31 |
+
'toy figure',
|
32 |
+
'traffic light',
|
33 |
+
'wine glass']
|
34 |
+
|
35 |
+
def _get_nouns(lines):
|
36 |
+
# function to test if something is a noun
|
37 |
+
present_words = []
|
38 |
+
for s in SPECIAL_WORDS:
|
39 |
+
if s in lines:
|
40 |
+
present_words.append(s)
|
41 |
+
|
42 |
+
for w in present_words:
|
43 |
+
lines = lines.replace(w, "")
|
44 |
+
|
45 |
+
is_noun = lambda pos: pos[:2] == 'NN' or pos[:2] == 'NNP'
|
46 |
+
# do the nlp stuff
|
47 |
+
tokenized = nltk.word_tokenize(lines)
|
48 |
+
nouns = [word for (word, pos) in nltk.pos_tag(tokenized) if is_noun(pos)]
|
49 |
+
noun_dict = {}
|
50 |
+
if "objects" in nouns:
|
51 |
+
nouns.remove("objects")
|
52 |
+
if "image" in nouns:
|
53 |
+
nouns.remove("image")
|
54 |
+
|
55 |
+
for n in nouns:
|
56 |
+
if n not in noun_dict.keys():
|
57 |
+
noun_dict[n] = 1
|
58 |
+
else:
|
59 |
+
noun_dict[n] += 1
|
60 |
+
nouns = {}
|
61 |
+
for k, v in noun_dict.items():
|
62 |
+
if not (k == "bus" or k == "skis"):
|
63 |
+
if v == 1:
|
64 |
+
if p.singular_noun(k):
|
65 |
+
k = p.singular_noun(k)
|
66 |
+
else:
|
67 |
+
if not p.singular_noun(k):
|
68 |
+
k = p.plural(k)
|
69 |
+
try:
|
70 |
+
w2n.word_to_num(k)
|
71 |
+
except:
|
72 |
+
if len(k) >= 3:
|
73 |
+
if k == "ski":
|
74 |
+
k = "skis"
|
75 |
+
elif k == "gras":
|
76 |
+
k = "grass"
|
77 |
+
nouns[k] = v
|
78 |
+
for w in present_words:
|
79 |
+
nouns[w] = 1
|
80 |
+
return nouns
|
81 |
+
|
82 |
+
def _get_num_nouns(lines):
|
83 |
+
lines = lines.replace(":", "").replace(".", "")
|
84 |
+
doc = nlp(lines)
|
85 |
+
num_nouns = [chunk.text for chunk in doc.noun_chunks if any(token.pos_ == 'NUM' for token in chunk)]
|
86 |
+
|
87 |
+
num_noun_dict = {}
|
88 |
+
for n in num_nouns:
|
89 |
+
nums = n.split(", ")
|
90 |
+
for n in nums:
|
91 |
+
try:
|
92 |
+
w = " ".join(n.split(' ')[1:])
|
93 |
+
if w == "ski":
|
94 |
+
w = "skis"
|
95 |
+
num_noun_dict[w] = w2n.word_to_num(n.split(' ')[0])
|
96 |
+
except:
|
97 |
+
pass
|
98 |
+
|
99 |
+
return num_noun_dict
|
100 |
+
|
101 |
+
|
102 |
+
def _obtain_nouns(gt):
|
103 |
+
gt = gt.replace("hair dryer", "hair drier").lower()
|
104 |
+
nouns_gt = _get_nouns(gt)
|
105 |
+
|
106 |
+
num_nouns_gt = _get_num_nouns(gt)
|
107 |
+
|
108 |
+
com_keys = []
|
109 |
+
for k in nouns_gt.keys():
|
110 |
+
if p.plural(k) in num_nouns_gt.keys():
|
111 |
+
com_keys.append(k)
|
112 |
+
for k in com_keys:
|
113 |
+
del nouns_gt[k]
|
114 |
+
|
115 |
+
num_nouns_gt = {**num_nouns_gt, **nouns_gt}
|
116 |
+
|
117 |
+
return num_nouns_gt
|
118 |
+
|
119 |
+
def generate_qa_pairs(text):
|
120 |
+
num_nouns = _obtain_nouns(text)
|
121 |
+
qa_pairs = []
|
122 |
+
|
123 |
+
for obj, count in num_nouns.items():
|
124 |
+
# Count question
|
125 |
+
if count == 1:
|
126 |
+
plural_obj = p.plural(obj)
|
127 |
+
else:
|
128 |
+
plural_obj = obj
|
129 |
+
count_question = f"How many {plural_obj} are there in the image?"
|
130 |
+
count_answer = f"There {'is' if count == 1 else 'are'} {num2words(count)} {obj} in the image."
|
131 |
+
qa_pairs.append((count_question, count_answer))
|
132 |
+
|
133 |
+
prob_positive = np.random.uniform(0,1.)
|
134 |
+
|
135 |
+
if prob_positive > 0.7 or count == 1:
|
136 |
+
numeric_presence_question = f"{'Is' if count == 1 else 'Are'} there {num2words(count)} {obj} in the image?"
|
137 |
+
numeric_presence_answer = "Yes."
|
138 |
+
elif count > 1:
|
139 |
+
numbers = [i for i in range(2, count + 6) if i != count]
|
140 |
+
# Select a random number from the range
|
141 |
+
cnt = random.choice(numbers)
|
142 |
+
numeric_presence_question = f"{'Is' if cnt == 1 else 'Are'} there {num2words(cnt)} {obj} in the image?"
|
143 |
+
numeric_presence_answer = "No."
|
144 |
+
|
145 |
+
qa_pairs.append((numeric_presence_question, numeric_presence_answer))
|
146 |
+
random.shuffle(qa_pairs)
|
147 |
+
|
148 |
+
return random.sample(qa_pairs, min(len(qa_pairs), random.choice([1, 2, 3, 4, 5, 6])))
|
149 |
+
|
150 |
+
if __name__ == "__main__":
|
151 |
+
|
152 |
+
text = "The objects present in the image are: wall, ceiling, shelf, cabinet, counter, dining table, two people, eighteen bottles, two wine glasses, refrigerator, tv, bowl"
|
153 |
+
|
154 |
+
qa = generate_qa_pairs(text)
|
155 |
+
from icecream import ic
|
156 |
+
ic(qa)
|
157 |
+
|
vcoder_llava/mm_utils.py
ADDED
@@ -0,0 +1,151 @@
1 |
+
from PIL import Image
|
2 |
+
from io import BytesIO
|
3 |
+
import base64
|
4 |
+
|
5 |
+
import torch
|
6 |
+
from transformers import StoppingCriteria
|
7 |
+
from vcoder_llava.constants import IMAGE_TOKEN_INDEX, SEG_TOKEN_INDEX, DEPTH_TOKEN_INDEX
|
8 |
+
|
9 |
+
|
10 |
+
def load_image_from_base64(image):
|
11 |
+
return Image.open(BytesIO(base64.b64decode(image)))
|
12 |
+
|
13 |
+
|
14 |
+
def expand2square(pil_img, background_color):
    width, height = pil_img.size
    if width == height:
        return pil_img
    elif width > height:
        result = Image.new(pil_img.mode, (width, width), background_color)
        result.paste(pil_img, (0, (width - height) // 2))
        return result
    else:
        result = Image.new(pil_img.mode, (height, height), background_color)
        result.paste(pil_img, ((height - width) // 2, 0))
        return result


def process_images(images, image_processor, model_cfg):
    image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None)
    new_images = []
    if image_aspect_ratio == 'pad':
        for image in images:
            image = expand2square(image, tuple(int(x*255) for x in image_processor.image_mean))
            image = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
            new_images.append(image)
    else:
        return image_processor(images, return_tensors='pt')['pixel_values']
    if all(x.shape == new_images[0].shape for x in new_images):
        new_images = torch.stack(new_images, dim=0)
    return new_images


def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
    prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]

    def insert_separator(X, sep):
        return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]

    input_ids = []
    offset = 0
    if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
        offset = 1
        input_ids.append(prompt_chunks[0][0])

    for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
        input_ids.extend(x[offset:])

    if return_tensors is not None:
        if return_tensors == 'pt':
            return torch.tensor(input_ids, dtype=torch.long)
        raise ValueError(f'Unsupported tensor type: {return_tensors}')
    return input_ids


def tokenizer_seg_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, seg_token_index=SEG_TOKEN_INDEX, return_tensors=None):
    prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<seg>\n<image>')]

    def insert_separator(X, sep):
        return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]

    input_ids = []
    offset = 0
    if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
        offset = 1
        input_ids.append(prompt_chunks[0][0])

    for x in insert_separator(prompt_chunks, [seg_token_index, image_token_index] * (offset + 1)):
        if seg_token_index in x:
            input_ids.extend(x[offset:-1])
        else:
            input_ids.extend(x[offset:])

    if return_tensors is not None:
        if return_tensors == 'pt':
            return torch.tensor(input_ids, dtype=torch.long)
        raise ValueError(f'Unsupported tensor type: {return_tensors}')
    return input_ids


def _tokenizer_depth_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, seg_token_index=SEG_TOKEN_INDEX, depth_token_index=DEPTH_TOKEN_INDEX, return_tensors=None):
    prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<depth>\n<seg>\n<image>')]

    def insert_separator(X, sep):
        return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]

    input_ids = []
    offset = 0
    if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
        offset = 1
        input_ids.append(prompt_chunks[0][0])

    for x in insert_separator(prompt_chunks, [image_token_index, depth_token_index, seg_token_index] * (offset + 1)):
        if depth_token_index in x and seg_token_index in x:
            input_ids.extend(x[:3])
        else:
            input_ids.extend(x[offset:])

    if return_tensors is not None:
        if return_tensors == 'pt':
            return torch.tensor(input_ids, dtype=torch.long)
        raise ValueError(f'Unsupported tensor type: {return_tensors}')
    return input_ids


def tokenizer_depth_seg_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, seg_token_index=SEG_TOKEN_INDEX, depth_token_index=DEPTH_TOKEN_INDEX, return_tensors=None):
    if "<depth>" in prompt:
        return _tokenizer_depth_token(prompt, tokenizer, image_token_index, seg_token_index, depth_token_index, return_tensors)
    else:
        return tokenizer_seg_token(prompt, tokenizer, image_token_index, seg_token_index, return_tensors)


def get_model_name_from_path(model_path):
    model_path = model_path.strip("/")
    model_paths = model_path.split("/")
    if model_paths[-1].startswith('checkpoint-'):
        return model_paths[-2] + "_" + model_paths[-1]
    else:
        return model_paths[-1]

class KeywordsStoppingCriteria(StoppingCriteria):
    def __init__(self, keywords, tokenizer, input_ids):
        self.keywords = keywords
        self.keyword_ids = []
        for keyword in keywords:
            cur_keyword_ids = tokenizer(keyword).input_ids
            if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id:
                cur_keyword_ids = cur_keyword_ids[1:]
            self.keyword_ids.append(torch.tensor(cur_keyword_ids))
        self.tokenizer = tokenizer
        self.start_len = input_ids.shape[1]

    def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        assert output_ids.shape[0] == 1, "Only support batch size 1 (yet)"  # TODO
        offset = min(output_ids.shape[1] - self.start_len, 3)
        self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
        for keyword_id in self.keyword_ids:
            # compare the tail of the generated ids against each keyword;
            # .all() collapses the element-wise comparison to a single boolean
            # so multi-token keywords do not raise an ambiguity error
            if (output_ids[0, -keyword_id.shape[0]:] == keyword_id).all():
                return True
        outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
        for keyword in self.keywords:
            if keyword in outputs:
                return True
        return False
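A minimal usage sketch for these helpers, assuming a Vicuna-style tokenizer and the sentinel constants exported by vcoder_llava/constants.py; the tokenizer checkpoint id is an illustrative assumption and not part of this commit.

# Illustrative sketch: tokenize a prompt that carries the literal placeholders.
from transformers import AutoTokenizer
from vcoder_llava.constants import IMAGE_TOKEN_INDEX, SEG_TOKEN_INDEX
from vcoder_llava.mm_utils import tokenizer_seg_token

tokenizer = AutoTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5", use_fast=False)  # assumed base tokenizer
prompt = "<seg>\n<image>\nWhat objects are present in the image?"

# The '<seg>\n<image>' placeholder is replaced by sentinel ids that the model's
# prepare_inputs_labels_for_multimodal later swaps for visual embeddings.
input_ids = tokenizer_seg_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, SEG_TOKEN_INDEX, return_tensors="pt")
print(input_ids.shape)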
vcoder_llava/model/.DS_Store
ADDED
Binary file (6.15 kB). View file
vcoder_llava/model/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig
from .language_model.vcoder_llava_llama import VCoderLlavaLlamaForCausalLM, VCoderLlavaConfig
from .language_model.vcoder_ds_llava_llama import VCoderDSLlavaLlamaForCausalLM, VCoderDSLlavaConfig
vcoder_llava/model/apply_delta.py
ADDED
@@ -0,0 +1,48 @@
"""
Usage:
python3 -m vcoder_llava.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta
"""
import argparse

import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from vcoder_llava import LlavaLlamaForCausalLM


def apply_delta(base_model_path, target_model_path, delta_path):
    print("Loading base model")
    base = AutoModelForCausalLM.from_pretrained(
        base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)

    print("Loading delta")
    delta = LlavaLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
    delta_tokenizer = AutoTokenizer.from_pretrained(delta_path)

    print("Applying delta")
    for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"):
        if name not in base.state_dict():
            assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model'
            continue
        if param.data.shape == base.state_dict()[name].shape:
            param.data += base.state_dict()[name]
        else:
            assert name in ['model.embed_tokens.weight', 'lm_head.weight'], \
                f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}'
            bparam = base.state_dict()[name]
            param.data[:bparam.shape[0], :bparam.shape[1]] += bparam

    print("Saving target model")
    delta.save_pretrained(target_model_path)
    delta_tokenizer.save_pretrained(target_model_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-model-path", type=str, required=True)
    parser.add_argument("--target-model-path", type=str, required=True)
    parser.add_argument("--delta-path", type=str, required=True)

    args = parser.parse_args()

    apply_delta(args.base_model_path, args.target_model_path, args.delta_path)
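The weight arithmetic in apply_delta reduces to adding the base weights back into a stored (target minus base) delta, with embedding and lm_head rows added only over the shape the two tensors share. A minimal sketch of that slice rule, with made-up tensor sizes:

# Illustrative only: reconstructing a target weight from a delta checkpoint.
import torch

base_w = torch.randn(32000, 4096)    # assumed base embedding matrix (vocab x hidden)
delta_w = torch.randn(32003, 4096)   # delta with a few extra special-token rows
delta_w[:base_w.shape[0], :base_w.shape[1]] += base_w   # same slice rule as apply_delta
print(delta_w.shape)                 # delta_w now holds the reconstructed target weights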
vcoder_llava/model/builder.py
ADDED
@@ -0,0 +1,152 @@
# Copyright 2023 Haotian Liu
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import os
import warnings
import shutil

from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
import torch
from vcoder_llava.model import *


def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", device="cuda"):
    kwargs = {"device_map": device_map}

    if load_8bit:
        kwargs['load_in_8bit'] = True
    elif load_4bit:
        kwargs['load_in_4bit'] = True
        kwargs['quantization_config'] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type='nf4'
        )
    else:
        kwargs['torch_dtype'] = torch.float16
    if 'llava' in model_name.lower():
        # Load LLaVA model
        if 'lora' in model_name.lower() and model_base is None:
            warnings.warn('There is `lora` in model name but no `model_base` is provided. If you are loading a LoRA model, please provide the `model_base` argument. Detailed instruction: https://github.com/haotian-liu/LLaVA#launch-a-model-worker-lora-weights-unmerged.')
        if 'lora' in model_name.lower() and model_base is not None:
            lora_cfg_pretrained = AutoConfig.from_pretrained(model_path)
            tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
            print('Loading LLaVA from base model...')
            model = LlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs)
            token_num, token_dim = model.lm_head.out_features, model.lm_head.in_features
            if model.lm_head.weight.shape[0] != token_num:
                model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, token_dim, device=model.device, dtype=model.dtype))
                model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, token_dim, device=model.device, dtype=model.dtype))

            print('Loading additional LLaVA weights...')
            if os.path.exists(os.path.join(model_path, 'non_lora_trainables.bin')):
                non_lora_trainables = torch.load(os.path.join(model_path, 'non_lora_trainables.bin'), map_location='cpu')
            else:
                # this is probably from HF Hub
                from huggingface_hub import hf_hub_download
                def load_from_hf(repo_id, filename, subfolder=None):
                    cache_file = hf_hub_download(
                        repo_id=repo_id,
                        filename=filename,
                        subfolder=subfolder)
                    return torch.load(cache_file, map_location='cpu')
                non_lora_trainables = load_from_hf(model_path, 'non_lora_trainables.bin')
            non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()}
            if any(k.startswith('model.model.') for k in non_lora_trainables):
                non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()}
            model.load_state_dict(non_lora_trainables, strict=False)

            from peft import PeftModel
            print('Loading LoRA weights...')
            model = PeftModel.from_pretrained(model, model_path)
            print('Merging LoRA weights...')
            model = model.merge_and_unload()
            print('Model is loaded...')
        elif model_base is not None:
            # this may be mm projector only
            print('Loading LLaVA from base model...')
            if 'vcoder_ds_llava' in model_name.lower():
                tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
                cfg_pretrained = AutoConfig.from_pretrained(model_path)
                model = VCoderDSLlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
            elif 'vcoder_llava' in model_name.lower():
                tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
                cfg_pretrained = AutoConfig.from_pretrained(model_path)
                model = VCoderLlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
            else:
                tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
                cfg_pretrained = AutoConfig.from_pretrained(model_path)
                model = LlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)

            mm_projector_weights = torch.load(os.path.join(model_path, 'mm_projector.bin'), map_location='cpu')
            mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()}
            model.load_state_dict(mm_projector_weights, strict=False)
        else:
            if 'vcoder_ds_llava' in model_name.lower():
                tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
                model = VCoderDSLlavaLlamaForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
            elif 'vcoder_llava' in model_name.lower():
                tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
                model = VCoderLlavaLlamaForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
            else:
                tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
                model = LlavaLlamaForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
    else:
        # Load language model
        if model_base is not None:
            # PEFT model
            from peft import PeftModel
            tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
            model = AutoModelForCausalLM.from_pretrained(model_base, torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map="auto")
            print(f"Loading LoRA weights from {model_path}")
            model = PeftModel.from_pretrained(model, model_path)
            print("Merging weights")
            model = model.merge_and_unload()
            print('Convert to FP16...')
            model.to(torch.float16)
        else:
            use_fast = False
            if 'mpt' in model_name.lower():
                tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
                model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, trust_remote_code=True, **kwargs)
            else:
                tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
                model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)

    image_processor = None

    if hasattr(model.config, "max_sequence_length"):
        context_len = model.config.max_sequence_length
    else:
        context_len = 2048

    if 'llava' in model_name.lower():
        vision_tower = model.get_vision_tower()
        if not vision_tower.is_loaded:
            vision_tower.load_model()
        vision_tower.to(device=device, dtype=torch.float16)
        image_processor = vision_tower.image_processor

    seg_image_processor = None
    if 'vcoder' in model_name.lower():
        seg_image_processor = image_processor

    depth_image_processor = None
    if "ds" in model_name.lower():
        depth_image_processor = image_processor

    model.requires_grad_(False)
    return tokenizer, model, image_processor, seg_image_processor, depth_image_processor, context_len
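A minimal sketch of calling load_pretrained_model and unpacking its six return values; the Hugging Face repo id below is an assumed placeholder. Note that seg_image_processor is only populated when the model name contains 'vcoder', and depth_image_processor only when it also contains 'ds'.

# Hypothetical loader call; the checkpoint id is an assumption for illustration.
from vcoder_llava.mm_utils import get_model_name_from_path
from vcoder_llava.model.builder import load_pretrained_model

model_path = "shi-labs/vcoder_llava-v1.5-7b"  # assumed repo id
tokenizer, model, image_processor, seg_image_processor, depth_image_processor, context_len = load_pretrained_model(
    model_path,
    model_base=None,
    model_name=get_model_name_from_path(model_path),
    load_4bit=True,  # takes the 4-bit NF4 quantization path shown above
)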
vcoder_llava/model/consolidate.py
ADDED
@@ -0,0 +1,29 @@
"""
Usage:
python3 -m vcoder_llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate
"""
import argparse

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from vcoder_llava.model import *
from vcoder_llava.model.utils import auto_upgrade


def consolidate_ckpt(src_path, dst_path):
    print("Loading model")
    auto_upgrade(src_path)
    src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
    src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False)
    src_model.save_pretrained(dst_path)
    src_tokenizer.save_pretrained(dst_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--src", type=str, required=True)
    parser.add_argument("--dst", type=str, required=True)

    args = parser.parse_args()

    consolidate_ckpt(args.src, args.dst)
vcoder_llava/model/language_model/llava_llama.py
ADDED
@@ -0,0 +1,165 @@
# Copyright 2023 Haotian Liu
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from typing import List, Optional, Tuple, Union

import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss

from transformers import AutoConfig, AutoModelForCausalLM, \
    LlamaConfig, LlamaModel, LlamaForCausalLM

from transformers.modeling_outputs import CausalLMOutputWithPast

from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM


class LlavaConfig(LlamaConfig):
    model_type = "llava"


class LlavaLlamaModel(LlavaMetaModel, LlamaModel):
    config_class = LlavaConfig

    def __init__(self, config: LlamaConfig):
        super(LlavaLlamaModel, self).__init__(config)


class LlavaLlamaForCausalLM(LlamaForCausalLM, LlavaMetaForCausalLM):
    config_class = LlavaConfig

    def __init__(self, config):
        super(LlamaForCausalLM, self).__init__(config)
        self.model = LlavaLlamaModel(config)

        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_model(self):
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        images_cd: Optional[torch.FloatTensor] = None,
        cd_beta: Optional[torch.FloatTensor] = None,
        cd_alpha: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        input_ids, attention_mask, past_key_values, inputs_embeds, labels = self.prepare_inputs_labels_for_multimodal(input_ids, attention_mask, past_key_values, labels, images)

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict
        )

        hidden_states = outputs[0]
        logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model/pipeline parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
    ):
        if past_key_values:
            input_ids = input_ids[:, -1:]

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask,
                "images": kwargs.get("images", None),
            }
        )
        return model_inputs

    def prepare_inputs_for_generation_cd(
        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
    ):
        if past_key_values:
            input_ids = input_ids[:, -1:]

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask,
                "images": kwargs.get("images_cd", None),
            }
        )
        return model_inputs

AutoConfig.register("llava", LlavaConfig)
AutoModelForCausalLM.register(LlavaConfig, LlavaLlamaForCausalLM)
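The label shifting inside forward() is the standard next-token objective: the logits at position t are scored against the token at position t+1, so logits drop the last step and labels drop the first. A small self-contained illustration of the same three lines, with made-up sizes:

# Illustrative only, not part of the commit.
import torch
from torch.nn import CrossEntropyLoss

vocab_size = 10
logits = torch.randn(1, 5, vocab_size)            # (batch, seq, vocab)
labels = torch.randint(0, vocab_size, (1, 5))     # (batch, seq)

shift_logits = logits[..., :-1, :].contiguous().view(-1, vocab_size)  # predictions at positions 0..3
shift_labels = labels[..., 1:].contiguous().view(-1)                  # targets are the next tokens 1..4
print(CrossEntropyLoss()(shift_logits, shift_labels).item())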
vcoder_llava/model/language_model/vcoder_ds_llava_llama.py
ADDED
@@ -0,0 +1,145 @@
# Copyright 2023 Haotian Liu
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from typing import List, Optional, Tuple, Union

import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss

from transformers import AutoConfig, AutoModelForCausalLM, \
    LlamaConfig, LlamaModel, LlamaForCausalLM

from transformers.modeling_outputs import CausalLMOutputWithPast

from ..vcoder_ds_llava_arch import VCoderDSLlavaMetaModel, VCoderDSLlavaMetaForCausalLM


class VCoderDSLlavaConfig(LlamaConfig):
    model_type = "vcoder_ds_llava"


class VCoderDSLlavaLlamaModel(VCoderDSLlavaMetaModel, LlamaModel):
    config_class = VCoderDSLlavaConfig

    def __init__(self, config: LlamaConfig):
        super(VCoderDSLlavaLlamaModel, self).__init__(config)


class VCoderDSLlavaLlamaForCausalLM(LlamaForCausalLM, VCoderDSLlavaMetaForCausalLM):
    config_class = VCoderDSLlavaConfig

    def __init__(self, config):
        super(LlamaForCausalLM, self).__init__(config)
        self.model = VCoderDSLlavaLlamaModel(config)

        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_model(self):
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        segs: Optional[torch.FloatTensor] = None,
        depths: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        input_ids, attention_mask, past_key_values, inputs_embeds, labels = self.prepare_inputs_labels_for_multimodal(input_ids, attention_mask, past_key_values, labels, images, segs, depths)

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict
        )

        hidden_states = outputs[0]
        logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model/pipeline parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
    ):
        if past_key_values:
            input_ids = input_ids[:, -1:]

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask,
                "images": kwargs.get("images", None),
                "segs": kwargs.get("segs", None),
                "depths": kwargs.get("depths", None),
            }
        )
        return model_inputs

AutoConfig.register("vcoder_ds_llava", VCoderDSLlavaConfig)
AutoModelForCausalLM.register(VCoderDSLlavaConfig, VCoderDSLlavaLlamaForCausalLM)
vcoder_llava/model/language_model/vcoder_llava_llama.py
ADDED
@@ -0,0 +1,142 @@
# Copyright 2023 Haotian Liu
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from typing import List, Optional, Tuple, Union

import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss

from transformers import AutoConfig, AutoModelForCausalLM, \
    LlamaConfig, LlamaModel, LlamaForCausalLM

from transformers.modeling_outputs import CausalLMOutputWithPast

from ..vcoder_llava_arch import VCoderLlavaMetaModel, VCoderLlavaMetaForCausalLM


class VCoderLlavaConfig(LlamaConfig):
    model_type = "vcoder_llava"


class VCoderLlavaLlamaModel(VCoderLlavaMetaModel, LlamaModel):
    config_class = VCoderLlavaConfig

    def __init__(self, config: LlamaConfig):
        super(VCoderLlavaLlamaModel, self).__init__(config)


class VCoderLlavaLlamaForCausalLM(LlamaForCausalLM, VCoderLlavaMetaForCausalLM):
    config_class = VCoderLlavaConfig

    def __init__(self, config):
        super(LlamaForCausalLM, self).__init__(config)
        self.model = VCoderLlavaLlamaModel(config)

        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_model(self):
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        segs: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        input_ids, attention_mask, past_key_values, inputs_embeds, labels = self.prepare_inputs_labels_for_multimodal(input_ids, attention_mask, past_key_values, labels, images, segs)

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict
        )

        hidden_states = outputs[0]
        logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model/pipeline parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
    ):
        if past_key_values:
            input_ids = input_ids[:, -1:]

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask,
                "images": kwargs.get("images", None),
                "segs": kwargs.get("segs", None),
            }
        )
        return model_inputs

AutoConfig.register("vcoder_llava", VCoderLlavaConfig)
AutoModelForCausalLM.register(VCoderLlavaConfig, VCoderLlavaLlamaForCausalLM)
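The register() calls at the bottom of these model files are what let the generic Auto factories resolve the custom model_type strings found in a checkpoint's config.json. A tiny sketch of that lookup, building only a small random config rather than loading real weights:

# Illustrative only: resolving the registered "vcoder_llava" model_type.
from transformers import AutoConfig
from vcoder_llava.model.language_model.vcoder_llava_llama import VCoderLlavaConfig  # importing runs the register() calls

cfg = AutoConfig.for_model(
    "vcoder_llava",
    hidden_size=256, intermediate_size=512,
    num_hidden_layers=2, num_attention_heads=4, vocab_size=1000,
)
assert isinstance(cfg, VCoderLlavaConfig)
print(cfg.model_type)  # "vcoder_llava"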
vcoder_llava/model/llava_arch.py
ADDED
@@ -0,0 +1,200 @@
# Copyright 2023 Haotian Liu
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from abc import ABC, abstractmethod

import torch
import torch.nn as nn

from .multimodal_encoder.builder import build_vision_tower
from .multimodal_projector.builder import build_vision_projector

from vcoder_llava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX


class LlavaMetaModel:

    def __init__(self, config):
        super(LlavaMetaModel, self).__init__(config)

        if hasattr(config, "mm_vision_tower"):
            self.vision_tower = build_vision_tower(config, delay_load=True)
            self.mm_projector = build_vision_projector(config)

    def get_vision_tower(self):
        vision_tower = getattr(self, 'vision_tower', None)
        if type(vision_tower) is list:
            vision_tower = vision_tower[0]
        return vision_tower

    def initialize_vision_modules(self, model_args, fsdp=None):
        vision_tower = model_args.vision_tower
        mm_vision_select_layer = model_args.mm_vision_select_layer
        mm_vision_select_feature = model_args.mm_vision_select_feature
        pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter

        self.config.mm_vision_tower = vision_tower

        if self.get_vision_tower() is None:
            vision_tower = build_vision_tower(model_args)

            if fsdp is not None and len(fsdp) > 0:
                self.vision_tower = [vision_tower]
            else:
                self.vision_tower = vision_tower
        else:
            if fsdp is not None and len(fsdp) > 0:
                vision_tower = self.vision_tower[0]
            else:
                vision_tower = self.vision_tower
            vision_tower.load_model()

        self.config.use_mm_proj = True
        self.config.mm_projector_type = getattr(model_args, 'mm_projector_type', 'linear')
        self.config.mm_hidden_size = vision_tower.hidden_size
        self.config.mm_vision_select_layer = mm_vision_select_layer
        self.config.mm_vision_select_feature = mm_vision_select_feature

        if getattr(self, 'mm_projector', None) is None:
            self.mm_projector = build_vision_projector(self.config)
        else:
            # In case it is frozen by LoRA
            for p in self.mm_projector.parameters():
                p.requires_grad = True

        if pretrain_mm_mlp_adapter is not None:
            mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
            def get_w(weights, keyword):
                return {k.split(keyword + '.')[1]: v for k, v in weights.items() if keyword in k}

            self.mm_projector.load_state_dict(get_w(mm_projector_weights, 'mm_projector'))


class LlavaMetaForCausalLM(ABC):

    @abstractmethod
    def get_model(self):
        pass

    def get_vision_tower(self):
        return self.get_model().get_vision_tower()

    def encode_images(self, images):
        image_features = self.get_model().get_vision_tower()(images)
        image_features = self.get_model().mm_projector(image_features)
        return image_features

    def prepare_inputs_labels_for_multimodal(
        self, input_ids, attention_mask, past_key_values, labels, images
    ):
        vision_tower = self.get_vision_tower()
        if vision_tower is None or images is None or input_ids.shape[1] == 1:
            if past_key_values is not None and vision_tower is not None and images is not None and input_ids.shape[1] == 1:
                attention_mask = torch.ones((attention_mask.shape[0], past_key_values[-1][-1].shape[-2] + 1), dtype=attention_mask.dtype, device=attention_mask.device)
            return input_ids, attention_mask, past_key_values, None, labels

        if type(images) is list or images.ndim == 5:
            concat_images = torch.cat([image for image in images], dim=0)
            image_features = self.encode_images(concat_images)
            split_sizes = [image.shape[0] for image in images]
            image_features = torch.split(image_features, split_sizes, dim=0)
            image_features = [x.flatten(0, 1) for x in image_features]
        else:
            image_features = self.encode_images(images)

        new_input_embeds = []
        new_labels = [] if labels is not None else None
        cur_image_idx = 0
        for batch_idx, cur_input_ids in enumerate(input_ids):
            if (cur_input_ids == IMAGE_TOKEN_INDEX).sum() == 0:
                # multimodal LLM, but the current sample is not multimodal
                # FIXME: this is a hacky fix, for deepspeed zero3 to work
                half_len = cur_input_ids.shape[0] // 2
                cur_image_features = image_features[cur_image_idx]
                cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids[:half_len])
                cur_input_embeds_2 = self.get_model().embed_tokens(cur_input_ids[half_len:])
                cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0], cur_input_embeds_2], dim=0)
                new_input_embeds.append(cur_input_embeds)
                if labels is not None:
                    new_labels.append(labels[batch_idx])
                cur_image_idx += 1
                continue
            image_token_indices = torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0]
            cur_new_input_embeds = []
            if labels is not None:
                cur_labels = labels[batch_idx]
                cur_new_labels = []
                assert cur_labels.shape == cur_input_ids.shape
            while image_token_indices.numel() > 0:
                cur_image_features = image_features[cur_image_idx]
                image_token_start = image_token_indices[0]

                cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids[:image_token_start]))
                cur_new_input_embeds.append(cur_image_features)
                if labels is not None:
                    cur_new_labels.append(cur_labels[:image_token_start])
                    cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=labels.device, dtype=labels.dtype))
                    cur_labels = cur_labels[image_token_start+1:]
                cur_image_idx += 1
                cur_input_ids = cur_input_ids[image_token_start+1:]
                image_token_indices = torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0]
            if cur_input_ids.numel() > 0:
                cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids))
                if labels is not None:
                    cur_new_labels.append(cur_labels)
            cur_new_input_embeds = [x.to(device=self.device) for x in cur_new_input_embeds]
            cur_new_input_embeds = torch.cat(cur_new_input_embeds, dim=0)
            new_input_embeds.append(cur_new_input_embeds)
            if labels is not None:
                cur_new_labels = torch.cat(cur_new_labels, dim=0)
                new_labels.append(cur_new_labels)

        if any(x.shape != new_input_embeds[0].shape for x in new_input_embeds):
            max_len = max(x.shape[0] for x in new_input_embeds)

            new_input_embeds_align = []
            for cur_new_embed in new_input_embeds:
                cur_new_embed = torch.cat((cur_new_embed, torch.zeros((max_len - cur_new_embed.shape[0], cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)), dim=0)
                new_input_embeds_align.append(cur_new_embed)
            new_input_embeds = torch.stack(new_input_embeds_align, dim=0)

            if labels is not None:
                new_labels_align = []
                _new_labels = new_labels
                for cur_new_label in new_labels:
                    cur_new_label = torch.cat((cur_new_label, torch.full((max_len - cur_new_label.shape[0],), IGNORE_INDEX, dtype=cur_new_label.dtype, device=cur_new_label.device)), dim=0)
                    new_labels_align.append(cur_new_label)
                new_labels = torch.stack(new_labels_align, dim=0)

            if attention_mask is not None:
                new_attention_mask = []
                for cur_attention_mask, cur_new_labels, cur_new_labels_align in zip(attention_mask, _new_labels, new_labels):
                    new_attn_mask_pad_left = torch.full((cur_new_labels.shape[0] - labels.shape[1],), True, dtype=attention_mask.dtype, device=attention_mask.device)
                    new_attn_mask_pad_right = torch.full((cur_new_labels_align.shape[0] - cur_new_labels.shape[0],), False, dtype=attention_mask.dtype, device=attention_mask.device)
                    cur_new_attention_mask = torch.cat((new_attn_mask_pad_left, cur_attention_mask, new_attn_mask_pad_right), dim=0)
                    new_attention_mask.append(cur_new_attention_mask)
                attention_mask = torch.stack(new_attention_mask, dim=0)
                assert attention_mask.shape == new_labels.shape
        else:
            new_input_embeds = torch.stack(new_input_embeds, dim=0)
            if labels is not None:
                new_labels = torch.stack(new_labels, dim=0)

            if attention_mask is not None:
                new_attn_mask_pad_left = torch.full((attention_mask.shape[0], new_input_embeds.shape[1] - input_ids.shape[1]), True, dtype=attention_mask.dtype, device=attention_mask.device)
                attention_mask = torch.cat((new_attn_mask_pad_left, attention_mask), dim=1)
                assert attention_mask.shape == new_input_embeds.shape[:2]

        return None, attention_mask, past_key_values, new_input_embeds, new_labels
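One way to read prepare_inputs_labels_for_multimodal: every IMAGE_TOKEN_INDEX sentinel in the prompt is cut out and replaced, in embedding space, by one vector per visual token. Back-of-the-envelope arithmetic under the usual 336px CLIP ViT-L/14 tower (an assumption here; see clip_encoder.py below):

# Illustrative sequence-length bookkeeping, not part of the commit.
text_tokens = 48                     # assumed prompt length, including one image sentinel
visual_tokens = (336 // 14) ** 2     # 576, matching CLIPVisionTower.num_patches
final_len = (text_tokens - 1) + visual_tokens
print(final_len)                     # 623 embeddings are fed to the language model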
vcoder_llava/model/make_delta.py
ADDED
@@ -0,0 +1,52 @@
"""
Usage:
python3 -m vcoder_llava.model.make_delta --base ~/model_weights/llama-7b --target ~/model_weights/llava-7b --delta ~/model_weights/llava-7b-delta --hub-repo-id liuhaotian/llava-7b-delta
"""
import argparse

import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from vcoder_llava.model.utils import auto_upgrade


def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id):
    print("Loading base model")
    base = AutoModelForCausalLM.from_pretrained(
        base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)

    print("Loading target model")
    auto_upgrade(target_model_path)
    target = AutoModelForCausalLM.from_pretrained(target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)

    print("Calculating delta")
    for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"):
        if name not in base.state_dict():
            assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model'
            continue
        if param.data.shape == base.state_dict()[name].shape:
            param.data -= base.state_dict()[name]
        else:
            assert name in ['model.embed_tokens.weight', 'lm_head.weight'], f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}'
            bparam = base.state_dict()[name]
            param.data[:bparam.shape[0], :bparam.shape[1]] -= bparam

    print("Saving delta")
    if hub_repo_id:
        kwargs = {"push_to_hub": True, "repo_id": hub_repo_id}
    else:
        kwargs = {}
    target.save_pretrained(delta_path, **kwargs)
    target_tokenizer = AutoTokenizer.from_pretrained(target_model_path)
    target_tokenizer.save_pretrained(delta_path, **kwargs)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-model-path", type=str, required=True)
    parser.add_argument("--target-model-path", type=str, required=True)
    parser.add_argument("--delta-path", type=str, required=True)
    parser.add_argument("--hub-repo-id", type=str, default=None)
    args = parser.parse_args()

    make_delta(args.base_model_path, args.target_model_path, args.delta_path, args.hub_repo_id)
vcoder_llava/model/multimodal_adapter/builder.py
ADDED
@@ -0,0 +1,49 @@
import torch.nn as nn
import re

class IdentityMap(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x, *args, **kwargs):
        return x

    @property
    def config(self):
        return {"seg_mm_projector_type": 'identity'}


class SimpleResBlock(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.pre_norm = nn.LayerNorm(channels)

        self.proj = nn.Sequential(
            nn.Linear(channels, channels),
            nn.GELU(),
            nn.Linear(channels, channels)
        )
    def forward(self, x):
        x = self.pre_norm(x)
        return x + self.proj(x)


def build_seg_projector(config, delay_load=False, **kwargs):
    projector_type = getattr(config, 'seg_mm_projector_type', 'linear')

    if projector_type == 'linear':
        return nn.Linear(config.seg_mm_hidden_size, config.hidden_size)

    mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
    if mlp_gelu_match:
        mlp_depth = int(mlp_gelu_match.group(1))
        modules = [nn.Linear(config.seg_mm_hidden_size, config.hidden_size)]
        for _ in range(1, mlp_depth):
            modules.append(nn.GELU())
            modules.append(nn.Linear(config.hidden_size, config.hidden_size))
        return nn.Sequential(*modules)

    if projector_type == 'identity':
        return IdentityMap()

    raise ValueError(f'Unknown seg projector type: {projector_type}')
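A minimal sketch of the factory above with the 'mlpNx_gelu' spec; the hidden sizes are made-up stand-ins for whatever the model config actually carries.

# Illustrative: build a 2-layer GELU MLP seg projector and push a dummy batch through it.
from types import SimpleNamespace
import torch
from vcoder_llava.model.multimodal_adapter.builder import build_seg_projector

cfg = SimpleNamespace(seg_mm_projector_type="mlp2x_gelu", seg_mm_hidden_size=1024, hidden_size=4096)
seg_projector = build_seg_projector(cfg)                 # Linear(1024->4096), GELU, Linear(4096->4096)
print(seg_projector(torch.randn(1, 576, 1024)).shape)    # torch.Size([1, 576, 4096])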
vcoder_llava/model/multimodal_depth_adapter/builder.py
ADDED
@@ -0,0 +1,50 @@
import torch.nn as nn
import re

class IdentityMap(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x, *args, **kwargs):
        return x

    @property
    def config(self):
        return {"depth_mm_projector_type": 'identity'}


class SimpleResBlock(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.pre_norm = nn.LayerNorm(channels)

        self.proj = nn.Sequential(
            nn.Linear(channels, channels),
            nn.GELU(),
            nn.Linear(channels, channels)
        )
    def forward(self, x):
        x = self.pre_norm(x)
        return x + self.proj(x)


def build_depth_projector(config, delay_load=False, **kwargs):
    projector_type = getattr(config, 'depth_mm_projector_type', 'linear')

    if projector_type == 'linear':
        return nn.Linear(config.depth_mm_hidden_size, config.hidden_size)

    mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
    if mlp_gelu_match:
        mlp_depth = int(mlp_gelu_match.group(1))
        modules = [nn.Linear(config.depth_mm_hidden_size, config.hidden_size)]
        for _ in range(1, mlp_depth):
            modules.append(nn.GELU())
            modules.append(nn.Linear(config.hidden_size, config.hidden_size))
        return nn.Sequential(*modules)

    if projector_type == 'identity':
        return IdentityMap()

    raise ValueError(f'Unknown depth projector type: {projector_type}')
vcoder_llava/model/multimodal_encoder/builder.py
ADDED
@@ -0,0 +1,11 @@
import os
from .clip_encoder import CLIPVisionTower


def build_vision_tower(vision_tower_cfg, **kwargs):
    vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None))
    is_absolute_path_exists = os.path.exists(vision_tower)
    if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion"):
        return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)

    raise ValueError(f'Unknown vision tower: {vision_tower}')
vcoder_llava/model/multimodal_encoder/clip_encoder.py
ADDED
@@ -0,0 +1,78 @@
import torch
import torch.nn as nn

from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig


class CLIPVisionTower(nn.Module):
    def __init__(self, vision_tower, args, delay_load=False):
        super().__init__()

        self.is_loaded = False

        self.vision_tower_name = vision_tower
        self.select_layer = args.mm_vision_select_layer
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')

        if not delay_load:
            self.load_model()
        else:
            self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)

    def load_model(self):
        self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name)
        self.vision_tower.requires_grad_(False)

        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        image_features = image_forward_outs.hidden_states[self.select_layer]
        if self.select_feature == 'patch':
            image_features = image_features[:, 1:]
        elif self.select_feature == 'cls_patch':
            image_features = image_features
        else:
            raise ValueError(f'Unexpected select feature: {self.select_feature}')
        return image_features

    @torch.no_grad()
    def forward(self, images):
        if type(images) is list:
            image_features = []
            for image in images:
                image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
                image_feature = self.feature_select(image_forward_out).to(image.dtype)
                image_features.append(image_feature)
        else:
            image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
            image_features = self.feature_select(image_forward_outs).to(images.dtype)

        return image_features

    @property
    def dummy_feature(self):
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        if self.is_loaded:
            return self.vision_tower.config
        else:
            return self.cfg_only

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches(self):
        return (self.config.image_size // self.config.patch_size) ** 2
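A minimal sketch of exercising the tower on a dummy batch is shown below; the checkpoint name, the `args` object, and the 336x336 input geometry are assumptions for illustration and are not part of the commit.

# Hypothetical sketch (not part of this commit): extract patch features for a
# dummy batch. Checkpoint name, config object, and input shape are assumptions.
import torch
from types import SimpleNamespace

args = SimpleNamespace(mm_vision_select_layer=-2, mm_vision_select_feature="patch")
tower = CLIPVisionTower("openai/clip-vit-large-patch14-336", args=args)  # downloads the weights

pixels = torch.randn(2, 3, 336, 336)   # stand-in for an already-preprocessed batch
feats = tower(pixels)                  # -> (2, num_patches, hidden_size)
print(feats.shape, tower.num_patches)  # num_patches = (336 // 14) ** 2 = 576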
vcoder_llava/model/multimodal_projector/builder.py
ADDED
@@ -0,0 +1,51 @@
import torch
import torch.nn as nn
import re


class IdentityMap(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x, *args, **kwargs):
        return x

    @property
    def config(self):
        return {"mm_projector_type": 'identity'}


class SimpleResBlock(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.pre_norm = nn.LayerNorm(channels)

        self.proj = nn.Sequential(
            nn.Linear(channels, channels),
            nn.GELU(),
            nn.Linear(channels, channels)
        )

    def forward(self, x):
        x = self.pre_norm(x)
        return x + self.proj(x)


def build_vision_projector(config, delay_load=False, **kwargs):
    projector_type = getattr(config, 'mm_projector_type', 'linear')

    if projector_type == 'linear':
        return nn.Linear(config.mm_hidden_size, config.hidden_size)

    mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
    if mlp_gelu_match:
        mlp_depth = int(mlp_gelu_match.group(1))
        modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
        for _ in range(1, mlp_depth):
            modules.append(nn.GELU())
            modules.append(nn.Linear(config.hidden_size, config.hidden_size))
        return nn.Sequential(*modules)

    if projector_type == 'identity':
        return IdentityMap()

    raise ValueError(f'Unknown projector type: {projector_type}')
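The seg and depth adapter builders earlier in this commit follow the same pattern with their own `seg_mm_*` / `depth_mm_*` config keys. As a quick illustration, a hypothetical call that builds the "mlp2x_gelu" projector could look like this; the hidden sizes below are assumptions, not values taken from the commit.

# Hypothetical sketch (not part of this commit): build the "mlp2x_gelu" projector.
# The hidden sizes are illustrative assumptions.
from types import SimpleNamespace

cfg = SimpleNamespace(
    mm_projector_type="mlp2x_gelu",  # Linear -> GELU -> Linear
    mm_hidden_size=1024,             # vision feature width (assumed)
    hidden_size=4096,                # language model width (assumed)
)
proj = build_vision_projector(cfg)
print(proj)  # Sequential(Linear(1024, 4096), GELU(), Linear(4096, 4096))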
vcoder_llava/model/utils.py
ADDED
@@ -0,0 +1,20 @@
from transformers import AutoConfig


def auto_upgrade(config):
    cfg = AutoConfig.from_pretrained(config)
    if 'llava' in config and 'llava' not in cfg.model_type:
        assert cfg.model_type == 'llama'
        print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.")
        print("You must upgrade the checkpoint to the new code base (this can be done automatically).")
        confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]")
        if confirm.lower() in ["y", "yes"]:
            print("Upgrading checkpoint...")
            assert len(cfg.architectures) == 1
            setattr(cfg.__class__, "model_type", "llava")
            cfg.architectures[0] = 'LlavaLlamaForCausalLM'
            cfg.save_pretrained(config)
            print("Checkpoint upgraded.")
        else:
            print("Checkpoint upgrade aborted.")
            exit(1)
vcoder_llava/model/vcd/vcd_add_noise.py
ADDED
@@ -0,0 +1,28 @@
import torch

def add_diffusion_noise(image_tensor, noise_step):
    num_steps = 1000  # Number of diffusion steps

    # decide beta in each step
    betas = torch.linspace(-6, 6, num_steps)
    betas = torch.sigmoid(betas) * (0.5e-2 - 1e-5) + 1e-5

    # decide alphas in each step
    alphas = 1 - betas
    alphas_prod = torch.cumprod(alphas, dim=0)
    alphas_prod_p = torch.cat([torch.tensor([1]).float(), alphas_prod[:-1]], 0)  # p for previous
    alphas_bar_sqrt = torch.sqrt(alphas_prod)
    one_minus_alphas_bar_log = torch.log(1 - alphas_prod)
    one_minus_alphas_bar_sqrt = torch.sqrt(1 - alphas_prod)

    def q_x(x_0, t):
        noise = torch.randn_like(x_0)
        alphas_t = alphas_bar_sqrt[t]
        alphas_1_m_t = one_minus_alphas_bar_sqrt[t]
        return (alphas_t * x_0 + alphas_1_m_t * noise)

    noise_delta = int(noise_step)  # from 0-999
    noisy_image = image_tensor.clone()
    image_tensor_cd = q_x(noisy_image, noise_step)

    return image_tensor_cd
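A minimal sketch of producing the distorted copy of an image tensor used by the contrastive-decoding branch is shown below; the input shape and noise step are assumptions for illustration.

# Hypothetical sketch (not part of this commit): make the distorted image copy
# consumed by the contrastive-decoding branch. The shape is an assumption.
import torch

image_tensor = torch.randn(3, 336, 336)                              # already-preprocessed image (assumed shape)
image_tensor_cd = add_diffusion_noise(image_tensor, noise_step=500)  # larger noise_step => heavier corruption
print(image_tensor_cd.shape)                                         # same shape as the input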
vcoder_llava/model/vcd/vcd_sample.py
ADDED
@@ -0,0 +1,250 @@
import copy
import inspect
import warnings
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union

import torch
import torch.distributed as dist
from torch import nn

from transformers.generation.logits_process import (
    LogitsProcessorList,
)
from transformers.generation.stopping_criteria import (
    StoppingCriteria,
    StoppingCriteriaList,
    validate_stopping_criteria,
)
import transformers
from transformers.generation.utils import (
    SampleOutput,
    SampleEncoderDecoderOutput,  # needed when return_dict_in_generate=True
    SampleDecoderOnlyOutput,
)


def sample(
    self,
    input_ids: torch.LongTensor,
    logits_processor: Optional[LogitsProcessorList] = None,
    stopping_criteria: Optional[StoppingCriteriaList] = None,
    logits_warper: Optional[LogitsProcessorList] = None,
    max_length: Optional[int] = None,
    pad_token_id: Optional[int] = None,
    eos_token_id: Optional[Union[int, List[int]]] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    output_scores: Optional[bool] = None,
    return_dict_in_generate: Optional[bool] = None,
    synced_gpus: bool = False,
    streamer: Optional["BaseStreamer"] = None,
    **model_kwargs,
) -> Union[SampleOutput, torch.LongTensor]:
    # init values
    logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
    stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
    if max_length is not None:
        warnings.warn(
            "`max_length` is deprecated in this function, use"
            " `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))` instead.",
            UserWarning,
        )
        stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
    logits_warper = logits_warper if logits_warper is not None else LogitsProcessorList()
    pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
    eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id

    if isinstance(eos_token_id, int):
        eos_token_id = [eos_token_id]
    eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None
    output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
    output_attentions = (
        output_attentions if output_attentions is not None else self.generation_config.output_attentions
    )
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states
    )

    return_dict_in_generate = (
        return_dict_in_generate
        if return_dict_in_generate is not None
        else self.generation_config.return_dict_in_generate
    )

    # init attention / hidden states / scores tuples
    scores = () if (return_dict_in_generate and output_scores) else None
    decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
    cross_attentions = () if (return_dict_in_generate and output_attentions) else None
    decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None

    # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
    if return_dict_in_generate and self.config.is_encoder_decoder:
        encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
        encoder_hidden_states = (
            model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
        )

    # keep track of which sequences are already finished
    unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device)

    this_peer_finished = False  # used by synced_gpus only

    # auto-regressive generation
    while True:
        if synced_gpus:
            # Under synced_gpus the `forward` call must continue until all gpus complete their sequence.
            # The following logic allows an early break if all peers finished generating their sequence
            this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device)
            # send 0.0 if we finished, 1.0 otherwise
            dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM)
            # did all peers finish? the reduced sum will be 0.0 then
            if this_peer_finished_flag.item() == 0.0:
                break

        # prepare model inputs
        model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)

        # forward pass to get next token
        outputs = self(
            **model_inputs,
            return_dict=True,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        if synced_gpus and this_peer_finished:
            continue  # don't waste resources running the code we don't need

        next_token_logits = outputs.logits[:, -1, :]

        ## For contrastive decoding initial
        use_cd = model_kwargs.get("images_cd") is not None
        output_attentions_wo_img = (
            output_attentions if output_attentions is not None else self.generation_config.output_attentions
        )
        output_hidden_states_wo_img = (
            output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states
        )
        model_kwargs_cd = model_kwargs.copy()

        if use_cd:
            ## cd_comments: forward pass of the model with distorted image input
            model_inputs_cd = self.prepare_inputs_for_generation_cd(input_ids, **model_kwargs_cd)
            outputs_cd = self(
                **model_inputs_cd,
                return_dict=True,
                output_attentions=output_attentions_wo_img,
                output_hidden_states=output_hidden_states_wo_img,
            )
            next_token_logits_cd = outputs_cd.logits[:, -1, :]

            ## cd_comments: pre-process logits from contrastive inputs
            cd_alpha = model_kwargs.get("cd_alpha") if model_kwargs.get("cd_alpha") is not None else 0.5
            cd_beta = model_kwargs.get("cd_beta") if model_kwargs.get("cd_beta") is not None else 0.1

            # version 1 set cutoff for Adaptive Plausibility Constraints
            # probs = nn.functional.softmax(next_token_logits, dim=-1)
            # cutoff = cd_beta * probs.max(dim=-1, keepdim=True).values

            # version 2 set cutoff for Adaptive Plausibility Constraints
            cutoff = torch.log(torch.tensor(cd_beta)) + next_token_logits.max(dim=-1, keepdim=True).values

            diffs = (1 + cd_alpha) * next_token_logits - cd_alpha * next_token_logits_cd
            cd_logits = diffs.masked_fill(next_token_logits < cutoff, -float("inf"))

            ## cd_comments: apply temperature warping and top-k filtering in contrastive decoding
            cd_logits = logits_processor(input_ids, cd_logits)
            cd_logits = logits_warper(input_ids, cd_logits)

            next_token_scores = cd_logits
            cd_probs = nn.functional.softmax(cd_logits, dim=-1)
            next_tokens = torch.multinomial(cd_probs, num_samples=1).squeeze(1)
        else:
            next_token_scores = logits_processor(input_ids, next_token_logits)
            next_token_scores = logits_warper(input_ids, next_token_scores)
            probs = nn.functional.softmax(next_token_scores, dim=-1)
            next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)

        # Store scores, attentions and hidden_states when required
        if return_dict_in_generate:
            if output_scores:
                scores += (next_token_scores,)
            if output_attentions:
                decoder_attentions += (
                    (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,)
                )
                if self.config.is_encoder_decoder:
                    cross_attentions += (outputs.cross_attentions,)

            if output_hidden_states:
                decoder_hidden_states += (
                    (outputs.decoder_hidden_states,)
                    if self.config.is_encoder_decoder
                    else (outputs.hidden_states,)
                )

        # finished sentences should have their next token be a padding token
        if eos_token_id is not None:
            if pad_token_id is None:
                raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
            next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)

        # update generated ids, model inputs, and length for next step
        input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
        if streamer is not None:
            streamer.put(next_tokens.cpu())
        model_kwargs = self._update_model_kwargs_for_generation(
            outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
        )
        ## cd_comments: update model_kwargs_cd for contrastive decoding
        if use_cd:
            model_kwargs_cd = self._update_model_kwargs_for_generation(
                outputs_cd, model_kwargs_cd, is_encoder_decoder=self.config.is_encoder_decoder
            )

        # if eos_token was found in one sentence, set sentence to finished
        if eos_token_id_tensor is not None:
            unfinished_sequences = unfinished_sequences.mul(
                next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
            )

        # stop when each sentence is finished
        if unfinished_sequences.max() == 0:
            this_peer_finished = True

        # stop if we exceed the maximum length
        if stopping_criteria(input_ids, scores):
            this_peer_finished = True

        if this_peer_finished and not synced_gpus:
            break

    if streamer is not None:
        streamer.end()

    if return_dict_in_generate:
        if self.config.is_encoder_decoder:
            return SampleEncoderDecoderOutput(
                sequences=input_ids,
                scores=scores,
                encoder_attentions=encoder_attentions,
                encoder_hidden_states=encoder_hidden_states,
                decoder_attentions=decoder_attentions,
                cross_attentions=cross_attentions,
                decoder_hidden_states=decoder_hidden_states,
            )
        else:
            return SampleDecoderOnlyOutput(
                sequences=input_ids,
                scores=scores,
                attentions=decoder_attentions,
                hidden_states=decoder_hidden_states,
            )
    else:
        return input_ids

def evolve_vcd_sampling():
    transformers.generation.utils.GenerationMixin.sample = sample
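A hedged sketch of how this patch is meant to be used: once `evolve_vcd_sampling()` replaces `GenerationMixin.sample`, passing `images_cd` (and optionally `cd_alpha` / `cd_beta`) through `generate` activates the contrastive branch. Everything else below (the `model`, `input_ids`, `image_tensor` objects and the remaining `generate` arguments) is an assumption for illustration, and the wrapped model must provide `prepare_inputs_for_generation_cd`.

# Hypothetical sketch (not part of this commit). `model`, `input_ids`, and
# `image_tensor` are assumed to exist; images_cd / cd_alpha / cd_beta are the only
# keywords actually read by the patched sample() above.
evolve_vcd_sampling()  # monkey-patch transformers' GenerationMixin.sample

images_cd = add_diffusion_noise(image_tensor, noise_step=500)

output_ids = model.generate(
    input_ids,
    images=image_tensor.unsqueeze(0).half().cuda(),
    images_cd=images_cd.unsqueeze(0).half().cuda(),
    cd_alpha=1.0,   # strength of the contrastive correction
    cd_beta=0.1,    # adaptive-plausibility cutoff
    do_sample=True,
    temperature=0.2,
    max_new_tokens=256,
)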
vcoder_llava/model/vcoder_ds_llava_arch.py
ADDED
@@ -0,0 +1,323 @@
# Copyright 2023 Haotian Liu
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from abc import ABC, abstractmethod

import torch
import torch.nn as nn

from .multimodal_encoder.builder import build_vision_tower
from .multimodal_projector.builder import build_vision_projector
from .multimodal_adapter.builder import build_seg_projector
from .multimodal_depth_adapter.builder import build_depth_projector

from vcoder_llava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, SEG_TOKEN_INDEX, DEPTH_TOKEN_INDEX

class VCoderDSLlavaMetaModel:

    def __init__(self, config):
        super(VCoderDSLlavaMetaModel, self).__init__(config)
        self.config = config

        if hasattr(config, "mm_vision_tower"):
            self.vision_tower = build_vision_tower(config, delay_load=True)
            self.mm_projector = build_vision_projector(config)

        if hasattr(config, "seg_mm_projector_type"):
            self.seg_mm_projector = build_seg_projector(config)

        if hasattr(config, "use_mm2_proj"):
            if config.use_mm2_proj:
                self.mm2_projector = build_vision_projector(config)

        if hasattr(config, "depth_mm_projector_type"):
            self.depth_mm_projector = build_depth_projector(config)

        if hasattr(config, "mm_vcoder_lm_emb"):
            self.vcoder_lm_emb = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)

    def get_vision_tower(self):
        vision_tower = getattr(self, 'vision_tower', None)
        if type(vision_tower) is list:
            vision_tower = vision_tower[0]
        return vision_tower

    def initialize_seg_modules(self, model_args, fsdp=None):
        mm_seg_select_layer = model_args.mm_seg_select_layer
        mm_seg_select_feature = model_args.mm_seg_select_feature

        self.config.seg_mm_hidden_size = self.vision_tower.hidden_size

        self.config.seg_use_mm_proj = True
        self.config.seg_mm_projector_type = getattr(model_args, 'seg_mm_projector_type', 'linear')
        self.config.mm_seg_select_layer = mm_seg_select_layer
        self.config.mm_seg_select_feature = mm_seg_select_feature

        self.seg_mm_projector = build_seg_projector(self.config)
        self.vcoder_lm_emb = nn.Embedding(self.config.vocab_size, self.config.hidden_size, self.config.pad_token_id)

        # use MLP from pretraining stage
        pretrain_mm2_mlp_adapter = model_args.pretrain_mm2_mlp_adapter
        if getattr(model_args, "use_mm2_proj"):
            self.config.use_mm2_proj = model_args.use_mm2_proj
            self.mm2_projector = build_vision_projector(self.config)

            if pretrain_mm2_mlp_adapter is not None:
                mm2_projector_weights = torch.load(pretrain_mm2_mlp_adapter, map_location='cpu')
                def get_w(weights, keyword):
                    return {k.split(keyword + '.')[1]: v for k, v in weights.items() if keyword in k}

                self.mm2_projector.load_state_dict(get_w(mm2_projector_weights, 'mm_projector'))

    def initialize_depth_modules(self, model_args, fsdp=None):
        mm_depth_select_layer = model_args.mm_depth_select_layer
        mm_depth_select_feature = model_args.mm_depth_select_feature

        self.config.depth_mm_hidden_size = self.vision_tower.hidden_size

        self.config.depth_use_mm_proj = True
        self.config.depth_mm_projector_type = getattr(model_args, 'depth_mm_projector_type', 'linear')
        self.config.mm_depth_select_layer = mm_depth_select_layer
        self.config.mm_depth_select_feature = mm_depth_select_feature

        self.depth_mm_projector = build_depth_projector(self.config)

class VCoderDSLlavaMetaForCausalLM(ABC):

    @abstractmethod
    def get_model(self):
        pass

    def get_vision_tower(self):
        return self.get_model().get_vision_tower()

    def encode_seg_images(self, seg_images):
        seg_features = self.get_model().get_vision_tower()(seg_images)
        seg_features = self.get_model().seg_mm_projector(seg_features)
        return seg_features

    def encode_depth_images(self, depth_images):
        depth_features = self.get_model().get_vision_tower()(depth_images)
        depth_features = self.get_model().seg_mm_projector(depth_features)
        return depth_features

    def encode_images(self, images):
        image_features = self.get_model().get_vision_tower()(images)
        image_features = self.get_model().mm_projector(image_features)
        return image_features

    def encode_images_w_seg(self, images):
        image_features = self.get_model().get_vision_tower()(images)
        image_features = self.get_model().mm2_projector(image_features)
        return image_features

    def prepare_inputs_labels_for_multimodal(
        self, input_ids, attention_mask, past_key_values, labels, images, seg_images, depth_images
    ):
        vision_tower = self.get_vision_tower()
        if vision_tower is None or images is None or input_ids.shape[1] == 1:
            if past_key_values is not None and vision_tower is not None and images is not None and input_ids.shape[1] == 1:
                attention_mask = torch.ones((attention_mask.shape[0], past_key_values[-1][-1].shape[-2] + 1), dtype=attention_mask.dtype, device=attention_mask.device)
            return input_ids, attention_mask, past_key_values, None, labels

        if type(images) is list or images.ndim == 5:
            concat_images = torch.cat([image for image in images], dim=0)
            if seg_images is not None and hasattr(self, 'mm2_projector'):
                image_features = self.encode_images_w_seg(concat_images)
            else:
                image_features = self.encode_images(concat_images)
            split_sizes = [image.shape[0] for image in images]
            image_features = torch.split(image_features, split_sizes, dim=0)
            image_features = [x.flatten(0, 1) for x in image_features]
        else:
            if seg_images is not None and hasattr(self, 'mm2_projector'):
                image_features = self.encode_images_w_seg(images)
            else:
                image_features = self.encode_images(images)

        if seg_images is not None:
            if type(seg_images) is list or seg_images.ndim == 5:
                concat_seg_images = torch.cat([image for image in seg_images], dim=0)
                seg_features = self.encode_seg_images(concat_seg_images)
                split_sizes = [image.shape[0] for image in seg_images]
                seg_features = torch.split(seg_features, split_sizes, dim=0)
                seg_features = [x.flatten(0, 1) for x in seg_features]
            else:
                seg_features = self.encode_seg_images(seg_images)

        if depth_images is not None:
            try:
                for p in self.get_model().depth_mm_projector.parameters():
                    p.requires_grad = True
                if type(depth_images) is list or depth_images.ndim == 5:
                    concat_depth_images = torch.cat([image for image in depth_images], dim=0)
                    depth_features = self.encode_depth_images(concat_depth_images)
                    split_sizes = [image.shape[0] for image in depth_images]
                    depth_features = torch.split(depth_features, split_sizes, dim=0)
                    depth_features = [x.flatten(0, 1) for x in depth_features]
                else:
                    depth_features = self.encode_depth_images(depth_images)
            except:
                depth_images = None
                mask = input_ids != DEPTH_TOKEN_INDEX  # drop depth indices
                input_ids = input_ids[mask]
                for p in self.get_model().depth_mm_projector.parameters():
                    p.requires_grad = False
        else:
            for p in self.get_model().depth_mm_projector.parameters():
                p.requires_grad = False

        self.get_model().vcoder_lm_emb.weight.data = self.get_model().get_input_embeddings().weight.data.clone()

        new_input_embeds = []
        new_labels = [] if labels is not None else None
        cur_image_idx = 0
        cur_seg_idx = 0
        cur_depth_idx = 0
        for batch_idx, cur_input_ids in enumerate(input_ids):
            if (cur_input_ids == IMAGE_TOKEN_INDEX).sum() == 0 and (cur_input_ids == SEG_TOKEN_INDEX).sum() == 0:
                # FIXME: this is a hacky fix, for deepspeed zero3 to work
                cur_image_features = image_features[cur_image_idx]
                half_len = cur_input_ids.shape[0] // 2
                if seg_images is not None:
                    cur_seg_features = seg_features[cur_seg_idx]
                    if depth_images is not None:
                        cur_depth_features = depth_features[cur_depth_idx]
                    cur_input_embeds_1 = self.get_model().vcoder_lm_emb(cur_input_ids[:half_len])
                    cur_input_embeds_2 = self.get_model().vcoder_lm_emb(cur_input_ids[half_len:])
                else:
                    cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids[:half_len])
                    cur_input_embeds_2 = self.get_model().embed_tokens(cur_input_ids[half_len:])
                if seg_images is not None:
                    if depth_images is not None:
                        cur_input_embeds = torch.cat([cur_input_embeds_1, cur_depth_features[0:0], cur_seg_features[0:0], cur_image_features[0:0], cur_input_embeds_2], dim=0)
                    else:
                        cur_input_embeds = torch.cat([cur_input_embeds_1, cur_seg_features[0:0], cur_image_features[0:0], cur_input_embeds_2], dim=0)
                else:
                    cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0], cur_input_embeds_2], dim=0)
                new_input_embeds.append(cur_input_embeds)
                if labels is not None:
                    new_labels.append(labels[batch_idx])
                cur_image_idx += 1
                cur_seg_idx += 1
                cur_depth_idx += 1
                continue

            image_token_indices = torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0]

            cur_new_input_embeds = []
            if labels is not None:
                cur_labels = labels[batch_idx]
                cur_new_labels = []
                assert cur_labels.shape == cur_input_ids.shape
            while image_token_indices.numel() > 0:
                cur_image_features = image_features[cur_image_idx]
                image_token_start = image_token_indices[0]
                if seg_images is None:
                    cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids[:image_token_start]))
                else:
                    cur_new_input_embeds.append(self.get_model().vcoder_lm_emb(cur_input_ids[:image_token_start]))
                cur_new_input_embeds.append(cur_image_features)
                if labels is not None:
                    cur_new_labels.append(cur_labels[:image_token_start])
                    cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=labels.device, dtype=labels.dtype))
                    cur_labels = cur_labels[image_token_start+1:]
                cur_image_idx += 1
                cur_input_ids = cur_input_ids[image_token_start+1:]
                image_token_indices = torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0]

            if seg_images is not None:
                seg_token_indices = torch.where(cur_input_ids == SEG_TOKEN_INDEX)[0]
                while seg_token_indices.numel() > 0:
                    cur_seg_features = seg_features[cur_seg_idx]
                    seg_token_start = seg_token_indices[0]
                    if depth_images is None:
                        cur_new_input_embeds.append(self.get_model().vcoder_lm_emb(cur_input_ids[:seg_token_start]))
                    cur_new_input_embeds.append(cur_seg_features)
                    if labels is not None:
                        if depth_images is None:
                            cur_new_labels.append(cur_labels[:seg_token_start])
                        cur_new_labels.append(torch.full((cur_seg_features.shape[0],), IGNORE_INDEX, device=labels.device, dtype=labels.dtype))
                        cur_labels = cur_labels[seg_token_start+1:]
                    cur_seg_idx += 1
                    cur_input_ids = cur_input_ids[seg_token_start+1:]
                    seg_token_indices = torch.where(cur_input_ids == SEG_TOKEN_INDEX)[0]

            if depth_images is not None:
                depth_token_indices = torch.where(cur_input_ids == DEPTH_TOKEN_INDEX)[0]
                while depth_token_indices.numel() > 0:
                    cur_depth_features = depth_features[cur_depth_idx]
                    depth_token_start = depth_token_indices[0]
                    cur_new_input_embeds.append(self.get_model().vcoder_lm_emb(cur_input_ids[:depth_token_start]))
                    cur_new_input_embeds.append(cur_depth_features)
                    if labels is not None:
                        cur_new_labels.append(cur_labels[:depth_token_start])
                        cur_new_labels.append(torch.full((cur_depth_features.shape[0],), IGNORE_INDEX, device=labels.device, dtype=labels.dtype))
                        cur_labels = cur_labels[depth_token_start+1:]
                    cur_depth_idx += 1
                    cur_input_ids = cur_input_ids[depth_token_start+1:]
                    depth_token_indices = torch.where(cur_input_ids == DEPTH_TOKEN_INDEX)[0]

            if cur_input_ids.numel() > 0:
                if seg_images is None:
                    cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids))
                else:
                    cur_new_input_embeds.append(self.get_model().vcoder_lm_emb(cur_input_ids))
                if labels is not None:
                    cur_new_labels.append(cur_labels)
            cur_new_input_embeds = [x.to(device=self.device) for x in cur_new_input_embeds]
            cur_new_input_embeds = torch.cat(cur_new_input_embeds, dim=0)
            new_input_embeds.append(cur_new_input_embeds)
            if labels is not None:
                cur_new_labels = torch.cat(cur_new_labels, dim=0)
                new_labels.append(cur_new_labels)

        if any(x.shape != new_input_embeds[0].shape for x in new_input_embeds):
            max_len = max(x.shape[0] for x in new_input_embeds)

            new_input_embeds_align = []
            for cur_new_embed in new_input_embeds:
                cur_new_embed = torch.cat((cur_new_embed, torch.zeros((max_len - cur_new_embed.shape[0], cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)), dim=0)
                new_input_embeds_align.append(cur_new_embed)
            new_input_embeds = torch.stack(new_input_embeds_align, dim=0)

            if labels is not None:
                new_labels_align = []
                _new_labels = new_labels
                for cur_new_label in new_labels:
                    cur_new_label = torch.cat((cur_new_label, torch.full((max_len - cur_new_label.shape[0],), IGNORE_INDEX, dtype=cur_new_label.dtype, device=cur_new_label.device)), dim=0)
                    new_labels_align.append(cur_new_label)
                new_labels = torch.stack(new_labels_align, dim=0)

            if attention_mask is not None:
                new_attention_mask = []
                for cur_attention_mask, cur_new_labels, cur_new_labels_align in zip(attention_mask, _new_labels, new_labels):
                    new_attn_mask_pad_left = torch.full((cur_new_labels.shape[0] - labels.shape[1],), True, dtype=attention_mask.dtype, device=attention_mask.device)
                    new_attn_mask_pad_right = torch.full((cur_new_labels_align.shape[0] - cur_new_labels.shape[0],), False, dtype=attention_mask.dtype, device=attention_mask.device)
                    cur_new_attention_mask = torch.cat((new_attn_mask_pad_left, cur_attention_mask, new_attn_mask_pad_right), dim=0)
                    new_attention_mask.append(cur_new_attention_mask)
                attention_mask = torch.stack(new_attention_mask, dim=0)
                assert attention_mask.shape == new_labels.shape
        else:
            new_input_embeds = torch.stack(new_input_embeds, dim=0)
            if labels is not None:
                new_labels = torch.stack(new_labels, dim=0)

            if attention_mask is not None:
                new_attn_mask_pad_left = torch.full((attention_mask.shape[0], new_input_embeds.shape[1] - input_ids.shape[1]), True, dtype=attention_mask.dtype, device=attention_mask.device)
                attention_mask = torch.cat((new_attn_mask_pad_left, attention_mask), dim=1)
                assert attention_mask.shape == new_input_embeds.shape[:2]

        return None, attention_mask, past_key_values, new_input_embeds, new_labels
vcoder_llava/model/vcoder_llava_arch.py
ADDED
@@ -0,0 +1,254 @@
# Copyright 2023 Haotian Liu
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from abc import ABC, abstractmethod

import torch
import torch.nn as nn

from .multimodal_encoder.builder import build_vision_tower
from .multimodal_projector.builder import build_vision_projector
from .multimodal_adapter.builder import build_seg_projector

from vcoder_llava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, SEG_TOKEN_INDEX

class VCoderLlavaMetaModel:

    def __init__(self, config):
        super(VCoderLlavaMetaModel, self).__init__(config)
        self.config = config

        if hasattr(config, "mm_vision_tower"):
            self.vision_tower = build_vision_tower(config, delay_load=True)
            self.mm_projector = build_vision_projector(config)

        if hasattr(config, "seg_mm_projector_type"):
            self.seg_mm_projector = build_seg_projector(config)

        if hasattr(config, "use_mm2_proj"):
            if config.use_mm2_proj:
                self.mm2_projector = build_vision_projector(config)

        if hasattr(config, "mm_vcoder_lm_emb"):
            self.vcoder_lm_emb = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)

    def get_vision_tower(self):
        vision_tower = getattr(self, 'vision_tower', None)
        if type(vision_tower) is list:
            vision_tower = vision_tower[0]
        return vision_tower

    def initialize_seg_modules(self, model_args, fsdp=None):
        mm_seg_select_layer = model_args.mm_seg_select_layer
        mm_seg_select_feature = model_args.mm_seg_select_feature

        self.config.seg_mm_hidden_size = self.vision_tower.hidden_size

        pretrain_mm2_mlp_adapter = model_args.pretrain_mm2_mlp_adapter

        self.config.seg_use_mm_proj = True
        self.config.seg_mm_projector_type = getattr(model_args, 'seg_mm_projector_type', 'linear')
        self.config.mm_seg_select_layer = mm_seg_select_layer
        self.config.mm_seg_select_feature = mm_seg_select_feature

        self.seg_mm_projector = build_seg_projector(self.config)
        self.vcoder_lm_emb = nn.Embedding(self.config.vocab_size, self.config.hidden_size, self.config.pad_token_id)

        if getattr(model_args, "use_mm2_proj"):
            self.config.use_mm2_proj = model_args.use_mm2_proj
            self.mm2_projector = build_vision_projector(self.config)

            if pretrain_mm2_mlp_adapter is not None:
                mm2_projector_weights = torch.load(pretrain_mm2_mlp_adapter, map_location='cpu')
                def get_w(weights, keyword):
                    return {k.split(keyword + '.')[1]: v for k, v in weights.items() if keyword in k}

                self.mm2_projector.load_state_dict(get_w(mm2_projector_weights, 'mm_projector'))

class VCoderLlavaMetaForCausalLM(ABC):

    @abstractmethod
    def get_model(self):
        pass

    def get_vision_tower(self):
        return self.get_model().get_vision_tower()

    def encode_seg_images(self, seg_images):
        seg_features = self.get_model().get_vision_tower()(seg_images)
        seg_features = self.get_model().seg_mm_projector(seg_features)
        return seg_features

    def encode_images(self, images):
        image_features = self.get_model().get_vision_tower()(images)
        image_features = self.get_model().mm_projector(image_features)
        return image_features

    def encode_images_w_seg(self, images):
        image_features = self.get_model().get_vision_tower()(images)
        image_features = self.get_model().mm2_projector(image_features)
        return image_features

    def prepare_inputs_labels_for_multimodal(
        self, input_ids, attention_mask, past_key_values, labels, images, seg_images,
    ):
        vision_tower = self.get_vision_tower()
        if vision_tower is None or images is None or input_ids.shape[1] == 1:
            if past_key_values is not None and vision_tower is not None and images is not None and input_ids.shape[1] == 1:
                attention_mask = torch.ones((attention_mask.shape[0], past_key_values[-1][-1].shape[-2] + 1), dtype=attention_mask.dtype, device=attention_mask.device)
            return input_ids, attention_mask, past_key_values, None, labels

        if type(images) is list or images.ndim == 5:
            concat_images = torch.cat([image for image in images], dim=0)
            if seg_images is not None and hasattr(self, 'mm2_projector'):
                image_features = self.encode_images_w_seg(concat_images)
            else:
                image_features = self.encode_images(concat_images)
            split_sizes = [image.shape[0] for image in images]
            image_features = torch.split(image_features, split_sizes, dim=0)
            image_features = [x.flatten(0, 1) for x in image_features]
        else:
            if seg_images is not None and hasattr(self, 'mm2_projector'):
                image_features = self.encode_images_w_seg(images)
            else:
                image_features = self.encode_images(images)

        if seg_images is not None:
            if type(seg_images) is list or seg_images.ndim == 5:
                concat_seg_images = torch.cat([image for image in seg_images], dim=0)
                seg_features = self.encode_seg_images(concat_seg_images)
                split_sizes = [image.shape[0] for image in seg_images]
                seg_features = torch.split(seg_features, split_sizes, dim=0)
                seg_features = [x.flatten(0, 1) for x in seg_features]
            else:
                seg_features = self.encode_seg_images(seg_images)

        self.get_model().vcoder_lm_emb.weight.data = self.get_model().get_input_embeddings().weight.data.clone()

        new_input_embeds = []
        new_labels = [] if labels is not None else None
        cur_image_idx = 0
        cur_seg_idx = 0
        for batch_idx, cur_input_ids in enumerate(input_ids):
            if (cur_input_ids == IMAGE_TOKEN_INDEX).sum() == 0 or (cur_input_ids == SEG_TOKEN_INDEX).sum() == 0:
                # FIXME: this is a hacky fix, for deepspeed zero3 to work
                cur_image_features = image_features[cur_image_idx]
                if seg_images is not None:
                    cur_seg_features = seg_features[cur_seg_idx]
                half_len = cur_input_ids.shape[0] // 2
                if seg_images is not None:
                    cur_input_embeds_1 = self.get_model().vcoder_lm_emb(cur_input_ids[:half_len])
                    cur_input_embeds_2 = self.get_model().vcoder_lm_emb(cur_input_ids[half_len:])
                    cur_input_embeds = torch.cat([cur_input_embeds_1, cur_seg_features[0:0], cur_image_features[0:0], cur_input_embeds_2], dim=0)
                else:
                    cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids[:half_len])
                    cur_input_embeds_2 = self.get_model().embed_tokens(cur_input_ids[half_len:])
                    cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0], cur_input_embeds_2], dim=0)
                new_input_embeds.append(cur_input_embeds)
                if labels is not None:
                    new_labels.append(labels[batch_idx])
                cur_image_idx += 1
                cur_seg_idx += 1
                continue

            image_token_indices = torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0]

            cur_new_input_embeds = []
            if labels is not None:
                cur_labels = labels[batch_idx]
                cur_new_labels = []
                assert cur_labels.shape == cur_input_ids.shape
            while image_token_indices.numel() > 0:
                cur_image_features = image_features[cur_image_idx]
                image_token_start = image_token_indices[0]
                if seg_images is None:
                    cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids[:image_token_start]))
                else:
                    cur_new_input_embeds.append(self.get_model().vcoder_lm_emb(cur_input_ids[:image_token_start]))
                cur_new_input_embeds.append(cur_image_features)
                if labels is not None:
                    cur_new_labels.append(cur_labels[:image_token_start])
                    cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=labels.device, dtype=labels.dtype))
                    cur_labels = cur_labels[image_token_start+1:]
                cur_image_idx += 1
                cur_input_ids = cur_input_ids[image_token_start+1:]
                image_token_indices = torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0]

            if seg_images is not None:
                seg_token_indices = torch.where(cur_input_ids == SEG_TOKEN_INDEX)[0]
                while seg_token_indices.numel() > 0:
                    cur_seg_features = seg_features[cur_seg_idx]
                    seg_token_start = seg_token_indices[0]
                    cur_new_input_embeds.append(self.get_model().vcoder_lm_emb(cur_input_ids[:seg_token_start]))
                    cur_new_input_embeds.append(cur_seg_features)
                    if labels is not None:
                        cur_new_labels.append(cur_labels[:seg_token_start])
                        cur_new_labels.append(torch.full((cur_seg_features.shape[0],), IGNORE_INDEX, device=labels.device, dtype=labels.dtype))
                        cur_labels = cur_labels[seg_token_start+1:]
                    cur_seg_idx += 1
                    cur_input_ids = cur_input_ids[seg_token_start+1:]
                    seg_token_indices = torch.where(cur_input_ids == SEG_TOKEN_INDEX)[0]

            if cur_input_ids.numel() > 0:
                if seg_images is None:
                    cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids))
                else:
                    cur_new_input_embeds.append(self.get_model().vcoder_lm_emb(cur_input_ids))
                if labels is not None:
                    cur_new_labels.append(cur_labels)
            cur_new_input_embeds = [x.to(device=self.device) for x in cur_new_input_embeds]
            cur_new_input_embeds = torch.cat(cur_new_input_embeds, dim=0)
            new_input_embeds.append(cur_new_input_embeds)
            if labels is not None:
                cur_new_labels = torch.cat(cur_new_labels, dim=0)
                new_labels.append(cur_new_labels)

        if any(x.shape != new_input_embeds[0].shape for x in new_input_embeds):
            max_len = max(x.shape[0] for x in new_input_embeds)

            new_input_embeds_align = []
            for cur_new_embed in new_input_embeds:
                cur_new_embed = torch.cat((cur_new_embed, torch.zeros((max_len - cur_new_embed.shape[0], cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)), dim=0)
                new_input_embeds_align.append(cur_new_embed)
            new_input_embeds = torch.stack(new_input_embeds_align, dim=0)

            if labels is not None:
                new_labels_align = []
                _new_labels = new_labels
                for cur_new_label in new_labels:
                    cur_new_label = torch.cat((cur_new_label, torch.full((max_len - cur_new_label.shape[0],), IGNORE_INDEX, dtype=cur_new_label.dtype, device=cur_new_label.device)), dim=0)
                    new_labels_align.append(cur_new_label)
                new_labels = torch.stack(new_labels_align, dim=0)

            if attention_mask is not None:
                new_attention_mask = []
                for cur_attention_mask, cur_new_labels, cur_new_labels_align in zip(attention_mask, _new_labels, new_labels):
                    new_attn_mask_pad_left = torch.full((cur_new_labels.shape[0] - labels.shape[1],), True, dtype=attention_mask.dtype, device=attention_mask.device)
                    new_attn_mask_pad_right = torch.full((cur_new_labels_align.shape[0] - cur_new_labels.shape[0],), False, dtype=attention_mask.dtype, device=attention_mask.device)
                    cur_new_attention_mask = torch.cat((new_attn_mask_pad_left, cur_attention_mask, new_attn_mask_pad_right), dim=0)
                    new_attention_mask.append(cur_new_attention_mask)
                attention_mask = torch.stack(new_attention_mask, dim=0)
                assert attention_mask.shape == new_labels.shape
        else:
            new_input_embeds = torch.stack(new_input_embeds, dim=0)
            if labels is not None:
                new_labels = torch.stack(new_labels, dim=0)

            if attention_mask is not None:
                new_attn_mask_pad_left = torch.full((attention_mask.shape[0], new_input_embeds.shape[1] - input_ids.shape[1]), True, dtype=attention_mask.dtype, device=attention_mask.device)
                attention_mask = torch.cat((new_attn_mask_pad_left, attention_mask), dim=1)
                assert attention_mask.shape == new_input_embeds.shape[:2]

        return None, attention_mask, past_key_values, new_input_embeds, new_labels
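Both `prepare_inputs_labels_for_multimodal` implementations end by right-padding the per-sample sequences, which grow unevenly once image and segmentation features are spliced in, to a common length. Below is a toy, self-contained sketch of that alignment step with made-up shapes; it is not part of the commit.

# Toy sketch of the right-padding step above; shapes are made up for illustration.
import torch

IGNORE_INDEX = -100
new_input_embeds = [torch.randn(10, 8), torch.randn(14, 8)]                          # unequal sequence lengths
new_labels = [torch.zeros(10, dtype=torch.long), torch.zeros(14, dtype=torch.long)]

max_len = max(x.shape[0] for x in new_input_embeds)
embeds = torch.stack([torch.cat((x, torch.zeros(max_len - x.shape[0], x.shape[1]))) for x in new_input_embeds])            # zero-pad embeddings
labels = torch.stack([torch.cat((y, torch.full((max_len - y.shape[0],), IGNORE_INDEX, dtype=y.dtype))) for y in new_labels])  # IGNORE_INDEX so the loss skips padding
print(embeds.shape, labels.shape)  # torch.Size([2, 14, 8]) torch.Size([2, 14])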
vcoder_llava/questions.py
ADDED
@@ -0,0 +1,110 @@
SEMANTIC_QUESTIONS = [
    "What objects can be seen in the image? Perceive as done for semantic segmentation.",
    "What items are depicted in the picture? Consider in terms of semantic segmentation.",
    "Which elements are present in the visual? Analyze as you would for semantic segmentation.",
    "Can you identify the objects in the image? Think from a semantic segmentation perspective.",
    "What are the components visible in the graphic? Examine as if segmenting semantically.",
    "Which entities can be spotted in the photo? View through the lens of semantic segmentation.",
    "What are the discernible objects in the snapshot? Envision in relation to semantic segmentation.",
    "What elements stand out in the illustration? Reflect upon it as for semantic segmentation.",
    "Can you spot any items within the visual representation? Contemplate in a semantic segmentation context.",
    "What features are evident in this visual content? Analyze with semantic segmentation in mind.",
    "Which objects are noticeable in the image? Think of it in terms of semantic layers.",
    "How would you categorize the objects in this picture? As if you're doing semantic segmentation.",
    "What constituents can you recognize in the image? Ponder considering semantic segmentation.",
    "Which components can be distinguished in the photo? Evaluate as per semantic segmentation guidelines.",
    "What items in the image can you point out? Interpret with a semantic segmentation approach.",
    "Can you enumerate the objects present in this visual? Think semantically.",
    "What do you observe in the graphic? Consider its semantic segments.",
    "How many distinct objects can you identify in the visual? Keeping semantic segmentation in perspective.",
    "Which items are apparent in this depiction? Assess as one would for semantic segmentation.",
    "What are the visible entities within this image? Delve into it semantically.",
    "Can you discern specific objects in the portrayal? Approach it from a semantic segmentation standpoint.",
]

INSTANCE_QUESTIONS = [
    "What objects can be seen in the image? Perceive as done for instance segmentation",
    "What items are visible in the picture? Analyze as you would for instance segmentation.",
    "Which elements are present in the visual? Consider from an instance segmentation perspective.",
    "What are the distinguishable objects in the image? Think in terms of instance segmentation.",
    "Can you identify the entities in the graphic? Approach it with instance segmentation in mind.",
    "What components are apparent in the photo? Examine as if performing instance segmentation.",
    "Which items can be detected in the snapshot? View it through the lens of instance segmentation.",
    "What features stand out in the illustration? Reflect upon it as for instance segmentation.",
    "How would you describe the objects in this image? Keeping instance segmentation as a reference.",
    "What constituents are evident in the visual content? Think from an instance segmentation standpoint.",
    "Which objects can you spot in the depiction? Evaluate as per instance segmentation guidelines.",
    "What do you observe in the graphic? Contemplate with instance segmentation considerations.",
    "Can you discern specific entities in the visual? Approach it in the context of instance segmentation.",
    "Which components in the image catch your eye? Think of it in relation to instance layers.",
    "How many distinct items can you pinpoint in the photo? With an instance segmentation approach.",
    "What elements are noticeable in this portrayal? Analyze while considering instance segmentation.",
    "Can you list the objects present in the visual representation? Reflecting on instance segmentation.",
    "What items in the snapshot can you recognize? Interpret with an instance segmentation perspective.",
    "Which entities are discernible in this depiction? Delve into it from an instance segmentation angle.",
    "What are the components you can spot within the image? Think instance-wise.",
    "Can you detail the objects in the visual? Assess as one would for instance segmentation.",
]

PANOPTIC_QUESTIONS = [
    "What objects can be seen in the image? Perceive as done for panoptic segmentation",
    "What items are evident in the picture? Analyze with a panoptic segmentation perspective.",
    "Which elements emerge in the visual? Think in terms of panoptic segmentation.",
    "What are the discernible objects in the graphic? Approach it from a panoptic segmentation viewpoint.",
    "Can you identify the entities within the image? Consider it as you would for panoptic segmentation.",
    "What components stand out in the photo? Examine with panoptic segmentation in mind.",
    "Which items are detectable in the snapshot? Reflect upon it with panoptic segmentation considerations.",
    "What features can be observed in the illustration? View through the lens of panoptic segmentation.",
    "How would you describe the objects in this depiction? Keeping panoptic segmentation as a reference.",
    "What constituents are visible in the visual content? Think from a panoptic segmentation standpoint.",
    "Which objects can you pinpoint in the image? Evaluate as per panoptic segmentation guidelines.",
    "What do you perceive in the graphic? Delve into it with panoptic segmentation insights.",
    "Can you spot specific components in the visual? Contextualize with panoptic segmentation.",
    "What items in the portrayal catch your attention? Think in relation to panoptic layers.",
    "How many distinct entities can you recognize in the photo? With a panoptic segmentation approach.",
    "What elements are present in this visual? Analyze while keeping panoptic segmentation in mind.",
    "Can you list the objects depicted in the visual representation? Reflecting on panoptic segmentation.",
    "Which features in the image can you discern? Interpret considering panoptic segmentation.",
    "What are the components evident in this depiction? Approach it using a panoptic segmentation angle.",
    "What items can you detect in the visual content? Think panoptically.",
    "Can you detail the entities present in the image? Assess as one would when considering panoptic segmentation.",
]

DEPTH_QUESTIONS = [
    "what is depth order of objects in the image?",
    "Can you describe the depth order of the objects in this image, from closest to farthest?",
    "Which objects in the image appear nearest to the viewer and which seem furthest away?",
    "Could you list the objects in the image in order of their perceived distance from the foreground to the background?",
    "In what order do the objects in this image appear based on their depth, starting from the closest?",
    "How would you rank the objects in this picture from the most proximal to the most distal?",
    "Can you arrange the objects seen here from those appearing closest to those appearing farthest?",
    "What is the sequence of objects in this image based on their distance from the front to the back?",
    "Please identify the order of objects in terms of depth perspective in this image.",
    "Which objects in the picture seem to be in the front, and which ones appear to be in the back?",
    "How are the objects in this image layered in depth, from the one nearest to the camera to the one farthest?",
    "Could you sort the objects in this photo from foreground to background?",
    "In this image, what is the spatial arrangement of objects from closest to furthest?",
    "Can you pinpoint the depth hierarchy of these objects, starting from the closest?",
    "What's the depth sequence of the objects displayed in this picture?",
    "From nearest to furthest, how would you order the objects in this image?",
    "How would you describe the spatial positioning of these objects in terms of their depth?",
    "Can you determine the depth placement of each object in this photo, starting with the nearest?",
    "What is the arrangement of objects in this scene by depth?",
    "Could you outline the depth profile of the objects in this image?",
    "In what depth order do the objects in this image align, from the frontmost to the rearmost?",
    "How are the objects in this image ordered in terms of their relative distance from the observer?",
]

QUESTIONS = {
    'semantic': SEMANTIC_QUESTIONS,
+
'instance': INSTANCE_QUESTIONS,
|
102 |
+
'panoptic': PANOPTIC_QUESTIONS,
|
103 |
+
'depth': DEPTH_QUESTIONS,
|
104 |
+
}
|
105 |
+
|
106 |
+
### Depth Prompts
|
107 |
+
# Can you describe the depth order of the objects in this image, from closest to farthest? Return answer in the paragraph format: `The depth order for the objects present in the image is: ...' and then list the objects with their order number (if greater than 1) separated by a hyphen like `person-2'. For example, an acceptable response is "The depth order for objects present in the image is: bicycle, bicycle-2, bicycle-3, pavement, road, bus, tree, sky, building."
|
108 |
+
|
109 |
+
### Seg Prompts
|
110 |
+
# What objects can be seen in the image? Return the answer in the paragraph format: The objects present in the image are: ...' and then list the objects with their count in word format (if greater than 1) in front of them, like two people'.
|
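Usage note (not part of the committed files): the question pools above are typically sampled at random when building perception prompts. Below is a minimal sketch assuming only the QUESTIONS dict defined in this file; the sample_question helper name is hypothetical.

import random

from vcoder_llava.questions import QUESTIONS

def sample_question(task: str) -> str:
    # Pick one phrasing at random for the given task:
    # 'semantic', 'instance', 'panoptic', or 'depth'.
    return random.choice(QUESTIONS[task])

print(sample_question("panoptic"))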
vcoder_llava/utils.py
ADDED
@@ -0,0 +1,126 @@
import datetime
import logging
import logging.handlers
import os
import sys

import requests

from vcoder_llava.constants import LOGDIR

server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN."

handler = None


def build_logger(logger_name, logger_filename):
    global handler

    formatter = logging.Formatter(
        fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    # Set the format of root handlers
    if not logging.getLogger().handlers:
        logging.basicConfig(level=logging.INFO)
    logging.getLogger().handlers[0].setFormatter(formatter)

    # Redirect stdout and stderr to loggers
    stdout_logger = logging.getLogger("stdout")
    stdout_logger.setLevel(logging.INFO)
    sl = StreamToLogger(stdout_logger, logging.INFO)
    sys.stdout = sl

    stderr_logger = logging.getLogger("stderr")
    stderr_logger.setLevel(logging.ERROR)
    sl = StreamToLogger(stderr_logger, logging.ERROR)
    sys.stderr = sl

    # Get logger
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.INFO)

    # Add a file handler for all loggers
    if handler is None:
        os.makedirs(LOGDIR, exist_ok=True)
        filename = os.path.join(LOGDIR, logger_filename)
        handler = logging.handlers.TimedRotatingFileHandler(
            filename, when='D', utc=True)
        handler.setFormatter(formatter)

        for name, item in logging.root.manager.loggerDict.items():
            if isinstance(item, logging.Logger):
                item.addHandler(handler)

    return logger


class StreamToLogger(object):
    """
    Fake file-like stream object that redirects writes to a logger instance.
    """
    def __init__(self, logger, log_level=logging.INFO):
        self.terminal = sys.stdout
        self.logger = logger
        self.log_level = log_level
        self.linebuf = ''

    def __getattr__(self, attr):
        return getattr(self.terminal, attr)

    def write(self, buf):
        temp_linebuf = self.linebuf + buf
        self.linebuf = ''
        for line in temp_linebuf.splitlines(True):
            # From the io.TextIOWrapper docs:
            #   On output, if newline is None, any '\n' characters written
            #   are translated to the system default line separator.
            # By default sys.stdout.write() expects '\n' newlines and then
            # translates them so this is still cross platform.
            if line[-1] == '\n':
                self.logger.log(self.log_level, line.rstrip())
            else:
                self.linebuf += line

    def flush(self):
        if self.linebuf != '':
            self.logger.log(self.log_level, self.linebuf.rstrip())
        self.linebuf = ''


def disable_torch_init():
    """
    Disable the redundant torch default initialization to accelerate model creation.
    """
    import torch
    setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
    setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)


def violates_moderation(text):
    """
    Check whether the text violates OpenAI moderation API.
    """
    url = "https://api.openai.com/v1/moderations"
    headers = {"Content-Type": "application/json",
               "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]}
    text = text.replace("\n", "")
    data = "{" + '"input": ' + f'"{text}"' + "}"
    data = data.encode("utf-8")
    try:
        ret = requests.post(url, headers=headers, data=data, timeout=5)
        flagged = ret.json()["results"][0]["flagged"]
    except requests.exceptions.RequestException as e:
        flagged = False
    except KeyError as e:
        flagged = False

    return flagged


def pretty_print_semaphore(semaphore):
    if semaphore is None:
        return "None"
    return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})"
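Usage note (not part of the committed files): a minimal sketch of how these helpers are usually wired together by serving code; the logger name and file name below are illustrative assumptions, not values taken from this repository.

from vcoder_llava.utils import build_logger, disable_torch_init

# Skip torch's redundant default weight initialization before loading pretrained checkpoints.
disable_torch_init()

# Route stdout/stderr through a daily-rotating log file under LOGDIR.
logger = build_logger("demo", "demo.log")
logger.info("Space started")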
vcoder_llava/vcoder_conversation.py
ADDED
@@ -0,0 +1,374 @@
import dataclasses
from enum import auto, Enum
from typing import List, Tuple


class SeparatorStyle(Enum):
    """Different separator style."""
    SINGLE = auto()
    TWO = auto()
    MPT = auto()
    PLAIN = auto()
    LLAMA_2 = auto()


@dataclasses.dataclass
class VCoderConversation:
    """A class that keeps all conversation history."""
    system: str
    roles: List[str]
    messages: List[List[str]]
    offset: int
    sep_style: SeparatorStyle = SeparatorStyle.SINGLE
    sep: str = "###"
    sep2: str = None
    version: str = "Unknown"

    skip_next: bool = False

    def get_prompt(self):
        messages = self.messages
        if self.sep_style == SeparatorStyle.SINGLE:
            ret = self.system + self.sep
            for role, message in messages:
                if message:
                    if type(message) is tuple:
                        message, _, _, _, _, _, _ = message
                    ret += role + ": " + message + self.sep
                else:
                    ret += role + ":"
        elif self.sep_style == SeparatorStyle.TWO:
            seps = [self.sep, self.sep2]
            ret = self.system + seps[0]
            for i, (role, message) in enumerate(messages):
                if message:
                    if type(message) is tuple:
                        message, _, _, _, _, _, _ = message
                    ret += role + ": " + message + seps[i % 2]
                else:
                    ret += role + ":"
        elif self.sep_style == SeparatorStyle.MPT:
            ret = self.system + self.sep
            for role, message in messages:
                if message:
                    if type(message) is tuple:
                        message, _, _, _, _, _, _ = message
                    ret += role + message + self.sep
                else:
                    ret += role
        elif self.sep_style == SeparatorStyle.LLAMA_2:
            wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n"
            wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
            ret = ""

            for i, (role, message) in enumerate(messages):
                if i == 0:
                    assert message, "first message should not be none"
                    assert role == self.roles[0], "first message should come from user"
                if message:
                    if type(message) is tuple:
                        message, _, _, _, _, _, _ = message
                    if i == 0: message = wrap_sys(self.system) + message
                    if i % 2 == 0:
                        message = wrap_inst(message)
                        ret += self.sep + message
                    else:
                        ret += " " + message + " " + self.sep2
                else:
                    ret += ""
            ret = ret.lstrip(self.sep)
        elif self.sep_style == SeparatorStyle.PLAIN:
            seps = [self.sep, self.sep2]
            ret = self.system
            for i, (role, message) in enumerate(messages):
                if message:
                    if type(message) is tuple:
                        message, _, _, _, _, _, _ = message
                    ret += message + seps[i % 2]
                else:
                    ret += ""
        else:
            raise ValueError(f"Invalid style: {self.sep_style}")

        return ret

    def append_message(self, role, message):
        self.messages.append([role, message])

    def get_images(self, return_pil=False):
        images = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                if type(msg) is tuple:
                    import base64
                    from io import BytesIO
                    from PIL import Image
                    msg, image, image_process_mode, _, _, _, _ = msg
                    if image is not None:
                        if image_process_mode == "Pad":
                            def expand2square(pil_img, background_color=(122, 116, 104)):
                                width, height = pil_img.size
                                if width == height:
                                    return pil_img
                                elif width > height:
                                    result = Image.new(pil_img.mode, (width, width), background_color)
                                    result.paste(pil_img, (0, (width - height) // 2))
                                    return result
                                else:
                                    result = Image.new(pil_img.mode, (height, height), background_color)
                                    result.paste(pil_img, ((height - width) // 2, 0))
                                    return result
                            image = expand2square(image)
                        elif image_process_mode in ["Default", "Crop"]:
                            pass
                        elif image_process_mode == "Resize":
                            image = image.resize((336, 336))
                        else:
                            raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
                        max_hw, min_hw = max(image.size), min(image.size)
                        aspect_ratio = max_hw / min_hw
                        max_len, min_len = 800, 400
                        shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
                        longest_edge = int(shortest_edge * aspect_ratio)
                        W, H = image.size
                        if longest_edge != max(image.size):
                            if H > W:
                                H, W = longest_edge, shortest_edge
                            else:
                                H, W = shortest_edge, longest_edge
                            image = image.resize((W, H))
                        if return_pil:
                            images.append(image)
                        else:
                            buffered = BytesIO()
                            image.save(buffered, format="PNG")
                            img_b64_str = base64.b64encode(buffered.getvalue()).decode()
                            images.append(img_b64_str)
        return images

    def get_segs(self, return_pil=False):
        segs = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                if type(msg) is tuple:
                    import base64
                    from io import BytesIO
                    from PIL import Image
                    msg, _, _, seg, seg_process_mode, _, _ = msg
                    if seg is not None:
                        if seg_process_mode == "Pad":
                            def expand2square(pil_img, background_color=(122, 116, 104)):
                                width, height = pil_img.size
                                if width == height:
                                    return pil_img
                                elif width > height:
                                    result = Image.new(pil_img.mode, (width, width), background_color)
                                    result.paste(pil_img, (0, (width - height) // 2))
                                    return result
                                else:
                                    result = Image.new(pil_img.mode, (height, height), background_color)
                                    result.paste(pil_img, ((height - width) // 2, 0))
                                    return result
                            seg = expand2square(seg)
                        elif seg_process_mode in ["Default", "Crop"]:
                            pass
                        elif seg_process_mode == "Resize":
                            seg = seg.resize((336, 336))
                        else:
                            raise ValueError(f"Invalid image_process_mode: {seg_process_mode}")
                        max_hw, min_hw = max(seg.size), min(seg.size)
                        aspect_ratio = max_hw / min_hw
                        max_len, min_len = 800, 400
                        shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
                        longest_edge = int(shortest_edge * aspect_ratio)
                        W, H = seg.size
                        if longest_edge != max(seg.size):
                            if H > W:
                                H, W = longest_edge, shortest_edge
                            else:
                                H, W = shortest_edge, longest_edge
                            seg = seg.resize((W, H))
                        if return_pil:
                            segs.append(seg)
                        else:
                            buffered = BytesIO()
                            seg.save(buffered, format="PNG")
                            img_b64_str = base64.b64encode(buffered.getvalue()).decode()
                            segs.append(img_b64_str)
        return segs

    def get_depths(self, return_pil=False):
        depths = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                if type(msg) is tuple:
                    import base64
                    from io import BytesIO
                    from PIL import Image
                    msg, _, _, _, _, depth, depth_process_mode = msg
                    if depth is not None:
                        if depth_process_mode == "Pad":
                            def expand2square(pil_img, background_color=(122, 116, 104)):
                                width, height = pil_img.size
                                if width == height:
                                    return pil_img
                                elif width > height:
                                    result = Image.new(pil_img.mode, (width, width), background_color)
                                    result.paste(pil_img, (0, (width - height) // 2))
                                    return result
                                else:
                                    result = Image.new(pil_img.mode, (height, height), background_color)
                                    result.paste(pil_img, ((height - width) // 2, 0))
                                    return result
                            depth = expand2square(depth)
                        elif depth_process_mode in ["Default", "Crop"]:
                            pass
                        elif depth_process_mode == "Resize":
                            depth = depth.resize((336, 336))
                        else:
                            raise ValueError(f"Invalid image_process_mode: {depth_process_mode}")
                        max_hw, min_hw = max(depth.size), min(depth.size)
                        aspect_ratio = max_hw / min_hw
                        max_len, min_len = 800, 400
                        shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
                        longest_edge = int(shortest_edge * aspect_ratio)
                        W, H = depth.size
                        if longest_edge != max(depth.size):
                            if H > W:
                                H, W = longest_edge, shortest_edge
                            else:
                                H, W = shortest_edge, longest_edge
                            depth = depth.resize((W, H))
                        if return_pil:
                            depths.append(depth)
                        else:
                            buffered = BytesIO()
                            depth.save(buffered, format="PNG")
                            img_b64_str = base64.b64encode(buffered.getvalue()).decode()
                            depths.append(img_b64_str)
        return depths

    def to_gradio_chatbot(self):
        ret = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                if type(msg) is tuple:
                    import base64
                    from io import BytesIO
                    msg, image, image_process_mode, seg, seg_process_mode, depth, depth_process_mode = msg
                    if image is not None:
                        max_hw, min_hw = max(image.size), min(image.size)
                        aspect_ratio = max_hw / min_hw
                        max_len, min_len = 800, 400
                        shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
                        longest_edge = int(shortest_edge * aspect_ratio)
                        W, H = image.size
                        if H > W:
                            H, W = longest_edge, shortest_edge
                        else:
                            H, W = shortest_edge, longest_edge
                        image = image.resize((W, H))
                        buffered = BytesIO()
                        image.save(buffered, format="JPEG")
                        img_b64_str = base64.b64encode(buffered.getvalue()).decode()
                        img_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="user upload image" />'
                        msg = img_str + msg.replace('<image>', '').strip()

                    if seg is not None:
                        W, H = seg.size
                        if H > W:
                            H, W = longest_edge, shortest_edge
                        else:
                            H, W = shortest_edge, longest_edge
                        seg = seg.resize((W, H))
                        seg_buffered = BytesIO()
                        seg.save(seg_buffered, format="JPEG")
                        seg_b64_str = base64.b64encode(seg_buffered.getvalue()).decode()
                        seg_str = f'<img src="data:image/png;base64,{seg_b64_str}" alt="user upload seg" />'
                        msg = seg_str + msg.replace('<seg>', '').strip()

                    if depth is not None:
                        W, H = depth.size
                        if H > W:
                            H, W = longest_edge, shortest_edge
                        else:
                            H, W = shortest_edge, longest_edge
                        depth = depth.resize((W, H))
                        depth_buffered = BytesIO()
                        depth.save(depth_buffered, format="JPEG")
                        depth_b64_str = base64.b64encode(depth_buffered.getvalue()).decode()
                        depth_str = f'<img src="data:image/png;base64,{depth_b64_str}" alt="user upload depth" />'
                        msg = depth_str + msg.replace('<depth>', '').strip()
                    ret.append([msg, None])
                else:
                    ret.append([msg, None])
            else:
                ret[-1][-1] = msg
        return ret

    def copy(self):
        return VCoderConversation(
            system=self.system,
            roles=self.roles,
            messages=[[x, y] for x, y in self.messages],
            offset=self.offset,
            sep_style=self.sep_style,
            sep=self.sep,
            sep2=self.sep2,
            version=self.version)

    def dict(self):
        if len(self.get_images()) > 0:
            return {
                "system": self.system,
                "roles": self.roles,
                "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages],
                "offset": self.offset,
                "sep": self.sep,
                "sep2": self.sep2,
            }
        return {
            "system": self.system,
            "roles": self.roles,
            "messages": self.messages,
            "offset": self.offset,
            "sep": self.sep,
            "sep2": self.sep2,
        }


conv_vicuna_v1 = VCoderConversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the user's questions.",
    roles=("USER", "ASSISTANT"),
    version="v1",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
)

conv_llava_v1 = VCoderConversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("USER", "ASSISTANT"),
    version="v1",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
)


default_conversation = conv_vicuna_v1
conv_templates = {
    "v1": conv_vicuna_v1,
    "vicuna_v1": conv_vicuna_v1,
    "llava_v1": conv_llava_v1,
}


if __name__ == "__main__":
    print(default_conversation.get_prompt())
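Usage note (not part of the committed files): a minimal sketch of driving a VCoderConversation template to build a prompt; the question text is only an example.

from vcoder_llava.vcoder_conversation import conv_templates

conv = conv_templates["llava_v1"].copy()  # copy so the shared template stays untouched
conv.append_message(conv.roles[0], "<image>\nWhat objects can be seen in the image?")
conv.append_message(conv.roles[1], None)  # placeholder for the assistant's reply
prompt = conv.get_prompt()  # serialized with the TWO separator style
print(prompt)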